pixman-arma64-neon-asm.S (139470B)
1 /* 2 * Copyright © 2009 Nokia Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 * 23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 */ 25 26 /* 27 * This file contains implementations of NEON optimized pixel processing 28 * functions. There is no full and detailed tutorial, but some functions 29 * (those which are exposing some new or interesting features) are 30 * extensively commented and can be used as examples. 31 * 32 * You may want to have a look at the comments for following functions: 33 * - pixman_composite_over_8888_0565_asm_neon 34 * - pixman_composite_over_n_8_0565_asm_neon 35 */ 36 37 /* Prevent the stack from becoming executable for no reason... 
*/
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .arch armv8-a

    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arma64-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fallback
 * to simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
 * performs OVER compositing operation. Function fast_composite_over_8888_0565
 * from pixman-fast-path.c does the same in C and can be used as a reference.
90 * 91 * First we need to have some NEON assembly code which can do the actual 92 * operation on the pixels and provide it to the template macro. 93 * 94 * Template macro quite conveniently takes care of emitting all the necessary 95 * code for memory reading and writing (including quite tricky cases of 96 * handling unaligned leading/trailing pixels), so we only need to deal with 97 * the data in NEON registers. 98 * 99 * NEON registers allocation in general is recommended to be the following: 100 * v0, v1, v2, v3 - contain loaded source pixel data 101 * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed) 102 * v24, v25, v26, v27 - contain loaded mask pixel data (if mask is used) 103 * v28, v29, v30, v31 - place for storing the result (destination pixels) 104 * 105 * As can be seen above, four 64-bit NEON registers are used for keeping 106 * intermediate pixel data and up to 8 pixels can be processed in one step 107 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). 108 * 109 * This particular function uses the following registers allocation: 110 * v0, v1, v2, v3 - contain loaded source pixel data 111 * v4, v5 - contain loaded destination pixels (they are needed) 112 * v28, v29 - place for storing the result (destination pixels) 113 */ 114 115 /* 116 * Step one. We need to have some code to do some arithmetics on pixel data. 117 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used 118 * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5}, 119 * perform all the needed calculations and write the result to {v28, v29}. 120 * The rationale for having two macros and not just one will be explained 121 * later. In practice, any single monolithic function which does the work can 122 * be split into two parts in any arbitrary way without affecting correctness. 123 * 124 * There is one special trick here too.
Common template macro can optionally 125 * make our life a bit easier by doing R, G, B, A color components 126 * deinterleaving for 32bpp pixel formats (and this feature is used in 127 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that 128 * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we 129 * actually use v0 register for blue channel (a vector of eight 8-bit 130 * values), v1 register for green, v2 for red and v3 for alpha. This 131 * simple conversion can be also done with a few NEON instructions: 132 * 133 * Packed to planar conversion: // vuzp8 is a wrapper macro 134 * vuzp8 v0, v1 135 * vuzp8 v2, v3 136 * vuzp8 v1, v3 137 * vuzp8 v0, v2 138 * 139 * Planar to packed conversion: // vzip8 is a wrapper macro 140 * vzip8 v0, v2 141 * vzip8 v1, v3 142 * vzip8 v2, v3 143 * vzip8 v0, v1 144 * 145 * But pixel can be loaded directly in planar format using LD4 / b NEON 146 * instruction. It is 1 cycle slower than LD1 / s, so this is not always 147 * desirable, that's why deinterleaving is optional. 
 *
 * But anyway, here is the code:
 */

/*
 * Head half of the OVER a8r8g8b8 -> r5g6b5 pixel pipeline: unpack 8
 * r5g6b5 destination pixels and start blending them with the planar
 * (deinterleaved) source held in {v0 (B), v1 (G), v2 (R), v3 (A)}.
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
       and put data into v6 - red, v7 - green, v30 - blue */
    mov         v4.d[1], v5.d[0]            /* join {v4, v5} into one 8x16 vector */
    shrn        v6.8b, v4.8h, #8
    shrn        v7.8b, v4.8h, #3
    sli         v4.8h, v4.8h, #5
    sri         v6.8b, v6.8b, #5            /* replicate top bits into low bits */
    mvn         v3.8b, v3.8b                /* invert source alpha */
    sri         v7.8b, v7.8b, #6
    shrn        v30.8b, v4.8h, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into v20 - red, v23 - green, v22 - blue */
    umull       v10.8h, v3.8b, v6.8b        /* (1 - srca) * dst, per channel */
    umull       v11.8h, v3.8b, v7.8b
    umull       v12.8h, v3.8b, v30.8b
    urshr       v17.8h, v10.8h, #8          /* x/255 approximated as            */
    urshr       v18.8h, v11.8h, #8          /* (x + ((x + 128) >> 8) + 128) >> 8 */
    urshr       v19.8h, v12.8h, #8
    raddhn      v20.8b, v10.8h, v17.8h
    raddhn      v23.8b, v11.8h, v18.8h
    raddhn      v22.8b, v12.8h, v19.8h
.endm

/*
 * Tail half: add the source channels and repack the blended planar
 * result into r5g6b5, leaving it in v14 (and mirrored into v28/v29).
 */
.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    uqadd       v17.8b, v2.8b, v20.8b       /* src + dst*(1-srca), saturating */
    uqadd       v18.8b, v0.8b, v22.8b
    uqadd       v19.8b, v1.8b, v23.8b
    /* convert the result to r5g6b5 and store it into {v14} */
    ushll       v14.8h, v17.8b, #7
    sli         v14.8h, v14.8h, #1
    ushll       v8.8h, v19.8b, #7
    sli         v8.8h, v8.8h, #1
    ushll       v9.8h, v18.8b, #7
    sli         v9.8h, v9.8h, #1
    sri         v14.8h, v8.8h, #5
    sri         v14.8h, v9.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
.endm

/*
 * OK, now we got almost everything that we need. Using the above two
 * macros, the work can be done right. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code, which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 * head
 * while (...)
{ 204 * tail 205 * head 206 * } 207 * tail 208 * 209 * It may look a bit weird, but this setup allows to hide instruction 210 * latencies better and also utilize dual-issue capability more 211 * efficiently (make pairs of load-store and ALU instructions). 212 * 213 * So what we need now is a '*_tail_head' macro, which will be used 214 * in the core main loop. A trivial straightforward implementation 215 * of this macro would look like this: 216 * 217 * pixman_composite_over_8888_0565_process_pixblock_tail 218 * st1 {v28.4h, v29.4h}, [DST_W], #32 219 * ld1 {v4.4h, v5.4h}, [DST_R], #16 220 * ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32 221 * pixman_composite_over_8888_0565_process_pixblock_head 222 * cache_preload 8, 8 223 * 224 * Now it also got some VLD/VST instructions. We simply can't move from 225 * processing one block of pixels to the other one with just arithmetics. 226 * The previously processed data needs to be written to memory and new 227 * data needs to be fetched. Fortunately, this main loop does not deal 228 * with partial leading/trailing pixels and can load/store a full block 229 * of pixels in a bulk. Additionally, destination buffer is already 230 * 16 bytes aligned here (which is good for performance). 231 * 232 * New things here are DST_R, DST_W, SRC and MASK identifiers. These 233 * are the aliases for ARM registers which are used as pointers for 234 * accessing data. We maintain separate pointers for reading and writing 235 * destination buffer (DST_R and DST_W). 236 * 237 * Another new thing is 'cache_preload' macro. It is used for prefetching 238 * data into CPU L2 cache and improve performance when dealing with large 239 * images which are far larger than cache size. It uses one argument 240 * (actually two, but they need to be the same here) - number of pixels 241 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some 242 * details about this macro. 
Moreover, if good performance is needed
 * the code from this macro needs to be copied into '*_tail_head' macro
 * and mixed with the rest of code for optimal instructions scheduling.
 * We are actually doing it below.
 *
 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from '*_head', '*_tail'
 * and 'cache_preload' macro) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few LD/ST instructions would
 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
 * macro!
 */

#if 1

/*
 * Software-pipelined main-loop body: finishes the previous block
 * ('*_tail' stream), loads/stores pixel data, and starts the next
 * block ('*_head' stream), with 'cache_preload' (PF pseudo-ops)
 * interleaved throughout.
 */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        uqadd       v17.8b, v2.8b, v20.8b
    ld1         {v4.4h, v5.4h}, [DST_R], #16
    mov         v4.d[1], v5.d[0]
        uqadd       v18.8b, v0.8b, v22.8b
        uqadd       v19.8b, v1.8b, v23.8b
        shrn        v6.8b, v4.8h, #8
    fetch_src_pixblock
        shrn        v7.8b, v4.8h, #3
        sli         v4.8h, v4.8h, #5
        ushll       v14.8h, v17.8b, #7
        sli         v14.8h, v14.8h, #1
                                    PF add, PF_X, PF_X, #8
        ushll       v8.8h, v19.8b, #7
        sli         v8.8h, v8.8h, #1
                                    PF tst, PF_CTL, #0xF
        sri         v6.8b, v6.8b, #5
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
10:
        mvn         v3.8b, v3.8b
                                    PF beq, 10f
                                    PF sub, PF_CTL, PF_CTL, #1
10:
        sri         v7.8b, v7.8b, #6
        shrn        v30.8b, v4.8h, #2
        umull       v10.8h, v3.8b, v6.8b
                                    PF lsl, DUMMY, PF_X, #src_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
        umull       v11.8h, v3.8b, v7.8b
        umull       v12.8h, v3.8b, v30.8b
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
        sri         v14.8h, v8.8h, #5
                                    PF cmp, PF_X, ORIG_W
        ushll       v9.8h, v18.8b, #7
        sli         v9.8h, v9.8h, #1
        urshr       v17.8h, v10.8h, #8
                                    PF ble, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
10:
        urshr       v19.8h, v11.8h, #8
        urshr       v18.8h, v12.8h, #8
                                    PF ble, 10f
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        sri         v14.8h, v9.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
                                    PF ble, 10f
                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
                                    PF ldrsb, DUMMY, [PF_SRC]
10:
        raddhn      v20.8b, v10.8h, v17.8h
        raddhn      v23.8b, v11.8h, v19.8h
                                    PF ble, 10f
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
        raddhn      v22.8b, v12.8h, v18.8h
    st1         {v14.8h}, [DST_W], #16
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    st1         {v14.8h}, [DST_W], #16
    /*
     * FIX: was 'ld1 {v4.4h, v4.5h}' — 'v4.5h' is not a valid register
     * arrangement, and the second half of the destination must land in
     * v5 because '*_head' merges it with 'mov v4.d[1], v5.d[0]'.
     */
    ld1         {v4.4h, v5.4h}, [DST_R], #16
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using 'generate_composite_function' macro
 * to put all the stuff together. We are specifying the name of the function
 * which we want to get, number of bits per pixel for the source, mask and
 * destination (0 if unused, like mask in this case). Next come some bit
 * flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                             and written, for write-only buffer we would use
 *                             FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for 32bpp format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum what can fit into four 64-bit NEON registers).
 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
 *    prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros, these are 'default_init',
 * 'default_cleanup' here which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else) followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we got implemented above.
 *
 * The last part is the NEON registers allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * OVER solid-color -> r5g6b5. Same pipeline as over_8888_0565, except
 * the source is constant: the init macro below splats the color into
 * v0-v3 and pre-inverts alpha, so there is no 'mvn' in the head.
 */
.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
       and put data into v6 - red, v7 - green, v30 - blue */
    mov         v4.d[1], v5.d[0]
    shrn        v6.8b, v4.8h, #8
    shrn        v7.8b, v4.8h, #3
    sli         v4.8h, v4.8h, #5
    sri         v6.8b, v6.8b, #5
    sri         v7.8b, v7.8b, #6
    shrn        v30.8b, v4.8h, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into v20 - red, v23 - green, v22 - blue */
    umull       v10.8h, v3.8b, v6.8b
    umull       v11.8h, v3.8b, v7.8b
    umull       v12.8h, v3.8b, v30.8b
    urshr       v13.8h, v10.8h, #8
    urshr       v14.8h, v11.8h, #8
    urshr       v15.8h, v12.8h, #8
    raddhn      v20.8b, v10.8h, v13.8h
    raddhn      v23.8b, v11.8h, v14.8h
    raddhn      v22.8b, v12.8h, v15.8h
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    uqadd       v17.8b, v2.8b, v20.8b
    uqadd       v18.8b, v0.8b, v22.8b
    uqadd       v19.8b, v1.8b, v23.8b
    /* convert the result to r5g6b5 and store it into {v14} */
    ushll       v14.8h, v17.8b, #7
    sli         v14.8h, v14.8h, #1
    ushll       v8.8h, v19.8b, #7
    sli         v8.8h, v8.8h, #1
    ushll       v9.8h, v18.8b, #7
    sli         v9.8h, v9.8h, #1
    sri         v14.8h, v8.8h, #5
    sri         v14.8h, v9.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    ld1         {v4.4h, v5.4h}, [DST_R], #16
    st1         {v14.8h}, [DST_W], #16
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

/* Splat the solid source color (in w4) into planar v0-v3, pre-invert alpha. */
.macro pixman_composite_over_n_0565_init
    mov         v3.s[0], w4
    dup         v0.8b, v3.b[0]
    dup         v1.8b, v3.b[1]
    dup         v2.8b, v3.b[2]
    dup         v3.8b, v3.b[3]
    mvn         v3.8b, v3.8b    /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/* SRC a8r8g8b8 -> r5g6b5: pack planar 32bpp source down to 565 (no blending). */
.macro pixman_composite_src_8888_0565_process_pixblock_head
    ushll       v8.8h, v1.8b, #7
    sli         v8.8h, v8.8h, #1
    ushll       v14.8h, v2.8b, #7
    sli         v14.8h, v14.8h, #1
    ushll
v9.8h, v0.8b, #7
    sli         v9.8h, v9.8h, #1
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    sri         v14.8h, v8.8h, #5
    sri         v14.8h, v9.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
.endm

/* Pipelined tail+head for SRC 8888 -> 0565, with prefetch interleaved. */
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        sri         v14.8h, v8.8h, #5
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
    fetch_src_pixblock
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
                                    PF sub, PF_CTL, PF_CTL, #1
10:
        sri         v14.8h, v9.8h, #11
        mov         v28.d[0], v14.d[0]
        mov         v29.d[0], v14.d[1]
                                    PF cmp, PF_X, ORIG_W
                                    PF lsl, DUMMY, PF_X, #src_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
        ushll       v8.8h, v1.8b, #7
        sli         v8.8h, v8.8h, #1
    st1         {v14.8h}, [DST_W], #16
                                    PF ble, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        ushll       v14.8h, v2.8b, #7
        sli         v14.8h, v14.8h, #1
                                    PF ble, 10f
                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
                                    PF ldrsb, DUMMY, [PF_SRC]
10:
        ushll       v9.8h, v0.8b, #7
        sli         v9.8h, v9.8h, #1
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

/* SRC r5g6b5 -> a8r8g8b8: expand 565 to planar 8888, alpha forced to 255. */
.macro pixman_composite_src_0565_8888_process_pixblock_head
    mov         v0.d[1], v1.d[0]
    shrn        v30.8b, v0.8h, #8
    shrn        v29.8b, v0.8h, #3
    sli         v0.8h, v0.8h, #5
    movi        v31.8b, #255        /* opaque alpha */
    sri         v30.8b, v30.8b, #5
    sri         v29.8b, v29.8b, #6
    shrn        v28.8b, v0.8h, #2
.endm

/* Everything is done in the head; nothing to finish. */
.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

/* ADD a8 + a8: plain saturating add of 32 8-bit pixels per block. */
.macro pixman_composite_add_8_8_process_pixblock_head
    uqadd       v28.8b, v0.8b, v4.8b
    uqadd       v29.8b, v1.8b, v5.8b
    uqadd       v30.8b, v2.8b, v6.8b
    uqadd       v31.8b, v3.8b, v7.8b
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add, PF_X, PF_X, #32
                                    PF tst, PF_CTL, #0xF
    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #32
                                    PF sub, PF_CTL, PF_CTL, #1
10:
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
                                    PF cmp, PF_X, ORIG_W
                                    PF lsl, DUMMY, PF_X, #src_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
                                    PF ble, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        uqadd       v28.8b, v0.8b, v4.8b
                                    PF ble, 10f
                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
                                    PF ldrsb, DUMMY, [PF_SRC]
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl
#dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
        uqadd       v29.8b, v1.8b, v5.8b
        uqadd       v30.8b, v2.8b, v6.8b
        uqadd       v31.8b, v3.8b, v7.8b
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

/*
 * ADD a8r8g8b8 + a8r8g8b8: identical arithmetic to add_8_8 (it reuses
 * the same head/tail), only the prefetch stepping differs (8 pixels of
 * 32bpp per block instead of 32 pixels of 8bpp).
 */
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
                                    PF sub, PF_CTL, PF_CTL, #1
10:
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
                                    PF cmp, PF_X, ORIG_W
                                    PF lsl, DUMMY, PF_X, #src_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
                                    PF ble, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        uqadd       v28.8b, v0.8b, v4.8b
                                    PF ble, 10f
                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
                                    PF ldrsb, DUMMY, [PF_SRC]
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
        uqadd       v29.8b, v1.8b, v5.8b
        uqadd       v30.8b, v2.8b, v6.8b
        uqadd       v31.8b, v3.8b, v7.8b
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* OUT_REVERSE 8888 x 8888: dst = dst * (1 - src.alpha), planar channels. */
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    mvn         v24.8b, v3.8b    /* get inverted alpha */
    /* do alpha blending */
    umull       v8.8h, v24.8b, v4.8b
    umull       v9.8h, v24.8b, v5.8b
    umull       v10.8h, v24.8b, v6.8b
    umull       v11.8h, v24.8b, v7.8b
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* divide the 16-bit products by 255 with correct rounding */
    urshr       v14.8h, v8.8h, #8
    urshr       v15.8h, v9.8h, #8
    urshr       v16.8h, v10.8h, #8
    urshr       v17.8h, v11.8h, #8
    raddhn      v28.8b, v14.8h, v8.8h
    raddhn      v29.8b, v15.8h, v9.8h
    raddhn      v30.8b, v16.8h, v10.8h
    raddhn      v31.8b, v17.8h, v11.8h
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
        urshr       v14.8h, v8.8h, #8
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
        urshr       v15.8h, v9.8h, #8
        urshr       v16.8h, v10.8h, #8
        urshr       v17.8h, v11.8h, #8
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
                                    PF sub, PF_CTL, PF_CTL, #1
10:
        raddhn      v28.8b, v14.8h, v8.8h
        raddhn      v29.8b, v15.8h, v9.8h
                                    PF cmp, PF_X, ORIG_W
        raddhn      v30.8b, v16.8h, v10.8h
        raddhn      v31.8b, v17.8h, v11.8h
    fetch_src_pixblock
                                    PF lsl, DUMMY, PF_X, #src_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
        mvn         v22.8b, v3.8b
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
                                    PF ble,
10f
                                    PF sub, PF_X, PF_X, ORIG_W
10:
        umull       v8.8h, v22.8b, v4.8b
                                    PF ble, 10f
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        umull       v9.8h, v22.8b, v5.8b
                                    PF ble, 10f
                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
                                    PF ldrsb, DUMMY, [PF_SRC]
10:
        umull       v10.8h, v22.8b, v6.8b
                                    PF ble, 10f
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
        umull       v11.8h, v22.8b, v7.8b
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/*
 * OVER 8888 x 8888 builds directly on out_reverse: dst * (1 - src.alpha)
 * followed by a saturating add of the source channels.
 */
.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
        urshr       v14.8h, v8.8h, #8
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
        urshr       v15.8h, v9.8h, #8
        urshr       v16.8h, v10.8h, #8
        urshr       v17.8h, v11.8h, #8
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
                                    PF sub, PF_CTL, PF_CTL, #1
10:
        raddhn      v28.8b, v14.8h, v8.8h
        raddhn      v29.8b, v15.8h, v9.8h
                                    PF cmp, PF_X, ORIG_W
        raddhn      v30.8b, v16.8h, v10.8h
        raddhn      v31.8b, v17.8h, v11.8h
        uqadd       v28.8b, v0.8b, v28.8b
        uqadd       v29.8b, v1.8b, v29.8b
        uqadd       v30.8b, v2.8b, v30.8b
        uqadd       v31.8b, v3.8b, v31.8b
    fetch_src_pixblock
                                    PF lsl, DUMMY, PF_X, #src_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
        mvn         v22.8b, v3.8b
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
                                    PF ble, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
10:
        umull       v8.8h, v22.8b, v4.8b
                                    PF ble, 10f
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        umull       v9.8h, v22.8b, v5.8b
                                    PF ble, 10f
                                    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
                                    PF ldrsb, DUMMY, [PF_SRC]
10:
        umull       v10.8h, v22.8b, v6.8b
                                    PF ble, 10f
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
        umull       v11.8h, v22.8b, v7.8b
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER solid-color -> a8r8g8b8: constant source, alpha pre-inverted in v24. */
.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {v0, v1, v2, v3} */
    /*
inverted alpha in {v24} */
    /* destination pixels in {v4, v5, v6, v7} */
    umull       v8.8h, v24.8b, v4.8b
    umull       v9.8h, v24.8b, v5.8b
    umull       v10.8h, v24.8b, v6.8b
    umull       v11.8h, v24.8b, v7.8b
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    urshr       v14.8h, v8.8h, #8
    urshr       v15.8h, v9.8h, #8
    urshr       v16.8h, v10.8h, #8
    urshr       v17.8h, v11.8h, #8
    raddhn      v28.8b, v14.8h, v8.8h
    raddhn      v29.8b, v15.8h, v9.8h
    raddhn      v30.8b, v16.8h, v10.8h
    raddhn      v31.8b, v17.8h, v11.8h
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
        urshr       v14.8h, v8.8h, #8
        urshr       v15.8h, v9.8h, #8
        urshr       v16.8h, v10.8h, #8
        urshr       v17.8h, v11.8h, #8
        raddhn      v28.8b, v14.8h, v8.8h
        raddhn      v29.8b, v15.8h, v9.8h
        raddhn      v30.8b, v16.8h, v10.8h
        raddhn      v31.8b, v17.8h, v11.8h
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
        uqadd       v28.8b, v0.8b, v28.8b
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0x0F
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
                                    PF sub, PF_CTL, PF_CTL, #1
10:
        uqadd       v29.8b, v1.8b, v29.8b
        uqadd       v30.8b, v2.8b, v30.8b
        uqadd       v31.8b, v3.8b, v31.8b
                                    PF cmp, PF_X, ORIG_W
        umull       v8.8h, v24.8b, v4.8b
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
        umull       v9.8h, v24.8b, v5.8b
                                    PF ble, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
10:
        umull       v10.8h, v24.8b, v6.8b
                                    PF subs, PF_CTL, PF_CTL, #0x10
        umull       v11.8h, v24.8b, v7.8b
                                    PF ble, 10f
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

/* Splat the solid source color (in w4) into planar v0-v3, pre-invert alpha
   into v24 for the head macro. */
.macro pixman_composite_over_n_8888_init
    mov         v3.s[0], w4
    dup         v0.8b, v3.b[0]
    dup         v1.8b, v3.b[1]
    dup         v2.8b, v3.b[2]
    dup         v3.8b, v3.b[3]
    mvn         v24.8b, v3.8b    /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER_REVERSE solid-color: constant source in v4-v7 (set by init below),
   destination loaded into v0-v3 each iteration. */
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        urshr       v14.8h, v8.8h, #8
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
        urshr       v15.8h, v9.8h, #8
        urshr       v12.8h, v10.8h, #8
        urshr       v13.8h, v11.8h, #8
                                    PF beq, 10f
                                    PF add, PF_X, PF_X, #8
                                    PF sub, PF_CTL, PF_CTL, #1
10:
        raddhn      v28.8b, v14.8h, v8.8h
        raddhn      v29.8b, v15.8h, v9.8h
                                    PF cmp, PF_X, ORIG_W
        raddhn      v30.8b, v12.8h, v10.8h
        raddhn      v31.8b, v13.8h, v11.8h
        uqadd       v28.8b, v0.8b, v28.8b
        uqadd       v29.8b, v1.8b, v29.8b
        uqadd       v30.8b, v2.8b, v30.8b
        uqadd       v31.8b, v3.8b, v31.8b
    ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
        mvn         v22.8b, v3.8b
                                    PF lsl, DUMMY, PF_X, #dst_bpp_shift
                                    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
                                    PF blt, 10f
                                    PF sub, PF_X, PF_X, ORIG_W
10:
        umull       v8.8h, v22.8b, v4.8b
                                    PF blt, 10f
                                    PF subs, PF_CTL, PF_CTL, #0x10
10:
        umull       v9.8h, v22.8b, v5.8b
        umull       v10.8h, v22.8b, v6.8b
                                    PF blt, 10f
                                    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
                                    PF ldrsb, DUMMY, [PF_DST]
10:
        umull       v11.8h, v22.8b, v7.8b
.endm

/* Splat the solid source color (in w4) into planar v4-v7. */
.macro pixman_composite_over_reverse_n_8888_init
    mov         v7.s[0], w4
    dup         v4.8b, v7.b[0]
    dup         v5.8b, v7.b[1]
    dup         v6.8b, v7.b[2]
    dup         v7.8b, v7.b[3]
.endm

generate_composite_function \
956 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ 957 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 958 8, /* number of pixels, processed in a single block */ \ 959 5, /* prefetch distance */ \ 960 pixman_composite_over_reverse_n_8888_init, \ 961 default_cleanup, \ 962 pixman_composite_over_8888_8888_process_pixblock_head, \ 963 pixman_composite_over_8888_8888_process_pixblock_tail, \ 964 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ 965 28, /* dst_w_basereg */ \ 966 0, /* dst_r_basereg */ \ 967 4, /* src_basereg */ \ 968 24 /* mask_basereg */ 969 970 /******************************************************************************/ 971 972 .macro pixman_composite_over_8888_8_0565_process_pixblock_head 973 umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */ 974 umull v1.8h, v24.8b, v9.8b 975 umull v2.8h, v24.8b, v10.8b 976 umull v3.8h, v24.8b, v11.8b 977 mov v4.d[1], v5.d[0] 978 shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */ 979 shrn v26.8b, v4.8h, #3 980 sli v4.8h, v4.8h, #5 981 urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */ 982 urshr v18.8h, v1.8h, #8 983 urshr v19.8h, v2.8h, #8 984 urshr v20.8h, v3.8h, #8 985 raddhn v0.8b, v0.8h, v17.8h 986 raddhn v1.8b, v1.8h, v18.8h 987 raddhn v2.8b, v2.8h, v19.8h 988 raddhn v3.8b, v3.8h, v20.8h 989 sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */ 990 sri v26.8b, v26.8b, #6 991 mvn v3.8b, v3.8b 992 shrn v30.8b, v4.8h, #2 993 umull v18.8h, v3.8b, v25.8b /* now do alpha blending */ 994 umull v19.8h, v3.8b, v26.8b 995 umull v20.8h, v3.8b, v30.8b 996 .endm 997 998 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail 999 /* 3 cycle bubble (after vmull.u8) */ 1000 urshr v5.8h, v18.8h, #8 1001 urshr v6.8h, v19.8h, #8 1002 urshr v7.8h, v20.8h, #8 1003 raddhn v17.8b, v18.8h, v5.8h 1004 raddhn v19.8b, v19.8h, v6.8h 1005 raddhn v18.8b, v20.8h, v7.8h 1006 uqadd v5.8b, v2.8b, v17.8b 1007 /* 1 cycle bubble */ 1008 uqadd v6.8b, v0.8b, v18.8b 1009 
uqadd v7.8b, v1.8b, v19.8b 1010 ushll v14.8h, v5.8b, #7 /* convert to 16bpp */ 1011 sli v14.8h, v14.8h, #1 1012 ushll v18.8h, v7.8b, #7 1013 sli v18.8h, v18.8h, #1 1014 ushll v19.8h, v6.8b, #7 1015 sli v19.8h, v19.8h, #1 1016 sri v14.8h, v18.8h, #5 1017 /* 1 cycle bubble */ 1018 sri v14.8h, v19.8h, #11 1019 mov v28.d[0], v14.d[0] 1020 mov v29.d[0], v14.d[1] 1021 .endm 1022 1023 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head 1024 #if 0 1025 ld1 {v4.8h}, [DST_R], #16 1026 shrn v25.8b, v4.8h, #8 1027 fetch_mask_pixblock 1028 shrn v26.8b, v4.8h, #3 1029 fetch_src_pixblock 1030 umull v22.8h, v24.8b, v10.8b 1031 urshr v13.8h, v18.8h, #8 1032 urshr v11.8h, v19.8h, #8 1033 urshr v15.8h, v20.8h, #8 1034 raddhn v17.8b, v18.8h, v13.8h 1035 raddhn v19.8b, v19.8h, v11.8h 1036 raddhn v18.8b, v20.8h, v15.8h 1037 uqadd v17.8b, v2.8b, v17.8b 1038 umull v21.8h, v24.8b, v9.8b 1039 uqadd v18.8b, v0.8b, v18.8b 1040 uqadd v19.8b, v1.8b, v19.8b 1041 ushll v14.8h, v17.8b, #7 1042 sli v14.8h, v14.8h, #1 1043 umull v20.8h, v24.8b, v8.8b 1044 ushll v18.8h, v18.8b, #7 1045 sli v18.8h, v18.8h, #1 1046 ushll v19.8h, v19.8b, #7 1047 sli v19.8h, v19.8h, #1 1048 sri v14.8h, v18.8h, #5 1049 umull v23.8h, v24.8b, v11.8b 1050 sri v14.8h, v19.8h, #11 1051 mov v28.d[0], v14.d[0] 1052 mov v29.d[0], v14.d[1] 1053 1054 cache_preload 8, 8 1055 1056 sli v4.8h, v4.8h, #5 1057 urshr v16.8h, v20.8h, #8 1058 urshr v17.8h, v21.8h, #8 1059 urshr v18.8h, v22.8h, #8 1060 urshr v19.8h, v23.8h, #8 1061 raddhn v0.8b, v20.8h, v16.8h 1062 raddhn v1.8b, v21.8h, v17.8h 1063 raddhn v2.8b, v22.8h, v18.8h 1064 raddhn v3.8b, v23.8h, v19.8h 1065 sri v25.8b, v25.8b, #5 1066 sri v26.8b, v26.8b, #6 1067 mvn v3.8b, v3.8b 1068 shrn v30.8b, v4.8h, #2 1069 st1 {v14.8h}, [DST_W], #16 1070 umull v18.8h, v3.8b, v25.8b 1071 umull v19.8h, v3.8b, v26.8b 1072 umull v20.8h, v3.8b, v30.8b 1073 #else 1074 pixman_composite_over_8888_8_0565_process_pixblock_tail 1075 st1 {v28.4h, v29.4h}, [DST_W], #16 1076 ld1 {v4.4h, 
v5.4h}, [DST_R], #16 1077 fetch_mask_pixblock 1078 fetch_src_pixblock 1079 pixman_composite_over_8888_8_0565_process_pixblock_head 1080 #endif 1081 .endm 1082 1083 generate_composite_function \ 1084 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ 1085 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1086 8, /* number of pixels, processed in a single block */ \ 1087 5, /* prefetch distance */ \ 1088 default_init_need_all_regs, \ 1089 default_cleanup_need_all_regs, \ 1090 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 1091 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 1092 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 1093 28, /* dst_w_basereg */ \ 1094 4, /* dst_r_basereg */ \ 1095 8, /* src_basereg */ \ 1096 24 /* mask_basereg */ 1097 1098 /******************************************************************************/ 1099 1100 /* 1101 * This function needs a special initialization of solid mask. 1102 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET 1103 * offset, split into color components and replicated in d8-d11 1104 * registers. Additionally, this function needs all the NEON registers, 1105 * so it has to save d8-d15 registers which are callee saved according 1106 * to ABI. These registers are restored from 'cleanup' macro. All the 1107 * other NEON registers are caller saved, so can be clobbered freely 1108 * without introducing any problems. 
1109 */ 1110 .macro pixman_composite_over_n_8_0565_init 1111 mov v11.s[0], w4 1112 dup v8.8b, v11.b[0] 1113 dup v9.8b, v11.b[1] 1114 dup v10.8b, v11.b[2] 1115 dup v11.8b, v11.b[3] 1116 .endm 1117 1118 .macro pixman_composite_over_n_8_0565_cleanup 1119 .endm 1120 1121 generate_composite_function \ 1122 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ 1123 FLAG_DST_READWRITE, \ 1124 8, /* number of pixels, processed in a single block */ \ 1125 5, /* prefetch distance */ \ 1126 pixman_composite_over_n_8_0565_init, \ 1127 pixman_composite_over_n_8_0565_cleanup, \ 1128 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 1129 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 1130 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 1131 28, /* dst_w_basereg */ \ 1132 4, /* dst_r_basereg */ \ 1133 8, /* src_basereg */ \ 1134 24 /* mask_basereg */ 1135 1136 /******************************************************************************/ 1137 1138 .macro pixman_composite_over_8888_n_0565_init 1139 mov v24.s[0], w6 1140 dup v24.8b, v24.b[3] 1141 .endm 1142 1143 .macro pixman_composite_over_8888_n_0565_cleanup 1144 .endm 1145 1146 generate_composite_function \ 1147 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ 1148 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1149 8, /* number of pixels, processed in a single block */ \ 1150 5, /* prefetch distance */ \ 1151 pixman_composite_over_8888_n_0565_init, \ 1152 pixman_composite_over_8888_n_0565_cleanup, \ 1153 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 1154 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 1155 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 1156 28, /* dst_w_basereg */ \ 1157 4, /* dst_r_basereg */ \ 1158 8, /* src_basereg */ \ 1159 24 /* mask_basereg */ 1160 1161 /******************************************************************************/ 1162 1163 .macro pixman_composite_src_0565_0565_process_pixblock_head 1164 .endm 1165 
.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    st1         {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    /* FIX: use '#32' for the post-index immediate, consistent with the
     * rest of this file (the '#' was missing here) */
    st1         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
.endm

/* Solid 8-bit value from w4 replicated into all four store registers */
.macro pixman_composite_src_n_8_init
    mov         v0.s[0], w4
    dup         v3.8b, v0.b[0]
    dup         v2.8b, v0.b[0]
    dup         v1.8b, v0.b[0]
    dup         v0.8b, v0.b[0]
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    st1         {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
.endm

/* Solid r5g6b5 value from w4 replicated into all four store registers */
.macro pixman_composite_src_n_0565_init
    mov         v0.s[0], w4
    dup         v3.4h, v0.h[0]
    dup         v2.4h, v0.h[0]
    dup         v1.4h, v0.h[0]
    dup         v0.4h, v0.h[0]
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
.endm

/* Solid a8r8g8b8 value from w4 replicated into all four store registers */
.macro pixman_composite_src_n_8888_init
    mov         v0.s[0], w4
    dup         v3.2s, v0.s[0]
    dup         v2.2s, v0.s[0]
    dup         v1.2s, v0.s[0]
    dup         v0.2s, v0.s[0]
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

/* x888 -> 8888: force the (deinterleaved) alpha plane to 0xff via v4 */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    orr         v0.8b, v0.8b, v4.8b
    orr         v1.8b, v1.8b, v4.8b
    orr         v2.8b, v2.8b, v4.8b
    orr         v3.8b, v3.8b, v4.8b
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
    fetch_src_pixblock
    orr         v0.8b, v0.8b, v4.8b
    orr         v1.8b, v1.8b, v4.8b
    orr         v2.8b, v2.8b, v4.8b
    orr         v3.8b, v3.8b, v4.8b
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    movi        v4.2s, #0xff, lsl 24    /* 0xff000000 alpha mask */
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {v0, v1, v2, v3} */
    /* mask is in v24 (v25, v26, v27 are unused) */

    /* in */
    umull       v8.8h,  v24.8b, v0.8b
    umull       v9.8h,  v24.8b, v1.8b
    umull       v10.8h, v24.8b, v2.8b
    umull       v11.8h, v24.8b, v3.8b
    ursra       v8.8h,  v8.8h,  #8
    ursra       v9.8h,  v9.8h,  #8
    ursra       v10.8h, v10.8h, #8
    ursra       v11.8h, v11.8h, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail
    rshrn       v28.8b, v8.8h,  #8
    rshrn       v29.8b, v9.8h,  #8
    rshrn       v30.8b, v10.8h, #8
    rshrn       v31.8b, v11.8h, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
    fetch_mask_pixblock
    PF add, PF_X, PF_X, #8
    rshrn       v28.8b, v8.8h, #8
    PF tst, PF_CTL, #0x0F
    rshrn       v29.8b, v9.8h, #8
    PF beq, 10f
    PF add, PF_X, PF_X, #8
10:
    rshrn       v30.8b, v10.8h, #8
    PF beq, 10f
    PF sub, PF_CTL, PF_CTL, #1
10:
    rshrn       v31.8b, v11.8h, #8
    PF cmp, PF_X, ORIG_W
    umull       v8.8h, v24.8b, v0.8b
    PF lsl, DUMMY, PF_X, #mask_bpp_shift
    PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
    umull       v9.8h, v24.8b, v1.8b
    PF ble, 10f
    PF sub, PF_X, PF_X, ORIG_W
10:
    umull       v10.8h, v24.8b, v2.8b
    PF ble, 10f
    PF subs, PF_CTL, PF_CTL, #0x10
10:
    umull       v11.8h, v24.8b, v3.8b
    PF ble, 10f
    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
    PF ldrsb, DUMMY, [PF_MASK]
10:
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    ursra       v8.8h,  v8.8h,  #8
    ursra       v9.8h,  v9.8h,  #8
    ursra       v10.8h, v10.8h, #8
    ursra       v11.8h, v11.8h, #8
.endm

/* Solid color from w4 split into per-component registers v0-v3 */
.macro pixman_composite_src_n_8_8888_init
    mov         v3.s[0], w4
    dup         v0.8b, v3.b[0]
    dup         v1.8b, v3.b[1]
    dup         v2.8b, v3.b[2]
    dup         v3.8b, v3.b[3]
.endm

.macro pixman_composite_src_n_8_8888_cleanup
.endm

/* FIX: dropped the stray trailing ', \' after the last argument; it
 * continued the invocation into the following blank line and passed a
 * dangling empty argument, unlike every other invocation in this file. */
generate_composite_function \
    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8888_init, \
    pixman_composite_src_n_8_8888_cleanup, \
    pixman_composite_src_n_8_8888_process_pixblock_head, \
    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_n_8_8_process_pixblock_head
    umull       v0.8h, v24.8b, v16.8b
    umull       v1.8h, v25.8b, v16.8b
    umull       v2.8h, v26.8b, v16.8b
    umull       v3.8h, v27.8b, v16.8b
    ursra       v0.8h, v0.8h, #8
    ursra       v1.8h, v1.8h, #8
    ursra       v2.8h, v2.8h, #8
    ursra       v3.8h, v3.8h, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail
    rshrn       v28.8b, v0.8h, #8
    rshrn       v29.8b, v1.8h, #8
    rshrn       v30.8b, v2.8h, #8
    rshrn       v31.8b, v3.8h, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
    fetch_mask_pixblock
    PF add, PF_X, PF_X, #8
    rshrn       v28.8b, v0.8h, #8
    PF tst, PF_CTL, #0x0F
    rshrn       v29.8b, v1.8h, #8
    PF beq, 10f
    PF add, PF_X, PF_X, #8
10:
    rshrn       v30.8b, v2.8h, #8
    PF beq, 10f
    PF sub, PF_CTL, PF_CTL, #1
10:
    rshrn       v31.8b, v3.8h, #8
    PF cmp, PF_X, ORIG_W
    umull       v0.8h, v24.8b, v16.8b
    /* FIX: added missing '#' before mask_bpp_shift, matching the
     * identical PF lsl line in src_n_8_8888 above */
    PF lsl, DUMMY, PF_X, #mask_bpp_shift
    PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
    umull       v1.8h, v25.8b, v16.8b
    PF ble, 10f
    PF sub, PF_X, PF_X, ORIG_W
10:
    umull       v2.8h, v26.8b, v16.8b
    PF ble, 10f
    PF subs, PF_CTL, PF_CTL, #0x10
10:
    umull       v3.8h, v27.8b, v16.8b
    PF ble, 10f
    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
    PF ldrsb, DUMMY, [PF_MASK]
10:
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    ursra       v0.8h, v0.8h, #8
    ursra       v1.8h, v1.8h, #8
    ursra       v2.8h, v2.8h, #8
    ursra       v3.8h, v3.8h, #8
.endm

/* Only the alpha byte of the solid color (w4) is needed */
.macro pixman_composite_src_n_8_8_init
    mov         v16.s[0], w4
    dup         v16.8b, v16.b[3]
.endm

.macro pixman_composite_src_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8_init, \
    pixman_composite_src_n_8_8_cleanup, \
    pixman_composite_src_n_8_8_process_pixblock_head, \
    pixman_composite_src_n_8_8_process_pixblock_tail, \
    pixman_composite_src_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {v8, v9, v10, v11} */
    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
    /* and destination data in {v4, v5, v6, v7} */
    /* mask is in v24 (v25, v26, v27 are unused) */

    /* in */
    umull       v12.8h, v24.8b, v8.8b
    umull       v13.8h, v24.8b, v9.8b
    umull       v14.8h, v24.8b, v10.8b
    umull       v15.8h, v24.8b, v11.8b
    urshr       v16.8h, v12.8h, #8
    urshr       v17.8h, v13.8h, #8
    urshr       v18.8h, v14.8h, #8
    urshr       v19.8h, v15.8h, #8
    raddhn      v0.8b, v12.8h, v16.8h
    raddhn      v1.8b, v13.8h, v17.8h
    raddhn      v2.8b, v14.8h, v18.8h
    raddhn      v3.8b, v15.8h, v19.8h
    mvn         v25.8b, v3.8b           /* get inverted alpha */
    /* source:      v0 - blue, v1 - green, v2 - red, v3 - alpha */
    /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
    /* now do alpha blending */
    umull       v12.8h, v25.8b, v4.8b
    umull       v13.8h, v25.8b, v5.8b
    umull       v14.8h, v25.8b, v6.8b
    umull       v15.8h, v25.8b, v7.8b
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    urshr       v16.8h, v12.8h, #8
    urshr       v17.8h, v13.8h, #8
    urshr       v18.8h, v14.8h, #8
    urshr       v19.8h, v15.8h, #8
    raddhn      v28.8b, v16.8h, v12.8h
    raddhn      v29.8b, v17.8h, v13.8h
    raddhn      v30.8b, v18.8h, v14.8h
    raddhn      v31.8b, v19.8h, v15.8h
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    urshr       v16.8h, v12.8h, #8
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    urshr       v17.8h, v13.8h, #8
    fetch_mask_pixblock
    urshr       v18.8h, v14.8h, #8
    PF add, PF_X, PF_X, #8
    urshr       v19.8h, v15.8h, #8
    PF tst, PF_CTL, #0x0F
    raddhn      v28.8b, v16.8h, v12.8h
    PF beq, 10f
    PF add, PF_X, PF_X, #8
10:
    raddhn      v29.8b, v17.8h, v13.8h
    PF beq, 10f
    PF sub, PF_CTL, PF_CTL, #1
10:
    raddhn      v30.8b, v18.8h, v14.8h
    PF cmp, PF_X, ORIG_W
    raddhn      v31.8b, v19.8h, v15.8h
    PF lsl, DUMMY, PF_X, #dst_bpp_shift
    PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
    umull       v16.8h, v24.8b, v8.8b
    PF lsl, DUMMY, PF_X, #mask_bpp_shift
    PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
    umull       v17.8h, v24.8b, v9.8b
    PF ble, 10f
    PF sub, PF_X, PF_X, ORIG_W
10:
    umull       v18.8h, v24.8b, v10.8b
    PF ble, 10f
    PF subs, PF_CTL, PF_CTL, #0x10
10:
    umull       v19.8h, v24.8b, v11.8b
    PF ble, 10f
    PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
    PF ldrsb, DUMMY, [PF_DST]
10:
    uqadd       v28.8b, v0.8b, v28.8b
    PF ble, 10f
    PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
    PF ldrsb, DUMMY, [PF_MASK]
10:
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
    urshr       v12.8h, v16.8h, #8
    urshr       v13.8h, v17.8h, #8
    urshr       v14.8h, v18.8h, #8
    urshr       v15.8h, v19.8h, #8
    raddhn      v0.8b, v16.8h, v12.8h
    raddhn      v1.8b, v17.8h, v13.8h
    raddhn      v2.8b, v18.8h, v14.8h
    raddhn      v3.8b, v19.8h, v15.8h
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    mvn         v25.8b, v3.8b
    umull       v12.8h, v25.8b, v4.8b
    umull       v13.8h, v25.8b, v5.8b
    umull       v14.8h, v25.8b, v6.8b
    umull       v15.8h, v25.8b, v7.8b
.endm

/* Solid color from w4 split into per-component registers v8-v11 */
.macro pixman_composite_over_n_8_8888_init
    mov         v11.s[0], w4
    dup         v8.8b,  v11.b[0]
    dup         v9.8b,  v11.b[1]
    dup         v10.8b, v11.b[2]
    dup         v11.8b, v11.b[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8_process_pixblock_head
    /* multiply mask (v24-v27) by the solid alpha in v8, /255 rounding */
    umull       v0.8h, v24.8b, v8.8b
    umull       v1.8h, v25.8b, v8.8b
    umull       v2.8h, v26.8b, v8.8b
    umull       v3.8h, v27.8b, v8.8b
    urshr       v10.8h, v0.8h, #8
    urshr       v11.8h, v1.8h, #8
    urshr       v12.8h, v2.8h, #8
    urshr       v13.8h, v3.8h, #8
    raddhn      v0.8b, v0.8h, v10.8h
    raddhn      v1.8b, v1.8h, v11.8h
    raddhn      v2.8b, v2.8h, v12.8h
    raddhn      v3.8b, v3.8h, v13.8h
    /* invert and blend with the destination (v4-v7) */
    mvn         v24.8b, v0.8b
    mvn         v25.8b, v1.8b
    mvn         v26.8b, v2.8b
    mvn         v27.8b, v3.8b
    umull       v10.8h, v24.8b, v4.8b
    umull       v11.8h, v25.8b, v5.8b
    umull       v12.8h, v26.8b, v6.8b
    umull       v13.8h, v27.8b, v7.8b
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    urshr       v14.8h, v10.8h, #8
    urshr       v15.8h, v11.8h, #8
    urshr       v16.8h, v12.8h, #8
    urshr       v17.8h, v13.8h, #8
    raddhn      v28.8b, v14.8h, v10.8h
    raddhn      v29.8b, v15.8h, v11.8h
    raddhn      v30.8b, v16.8h, v12.8h
    raddhn      v31.8b, v17.8h, v13.8h
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

/* Only the alpha byte of the solid color (w4) is needed */
.macro pixman_composite_over_n_8_8_init
    mov         v8.s[0], w4
    dup         v8.8b, v8.b[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {v8,  v9,  v10, v11}
     *         dest in          {v4,  v5,  v6,  v7 }
     *         mask in          {v24, v25, v26, v27}
     * output: updated src in   {v0,  v1,  v2,  v3 }
     *         updated mask in  {v24, v25, v26, v3 }
     */
    umull       v0.8h,  v24.8b, v8.8b
    umull       v1.8h,  v25.8b, v9.8b
    umull       v2.8h,  v26.8b, v10.8b
    umull       v3.8h,  v27.8b, v11.8b
    umull       v12.8h, v11.8b, v25.8b
    umull       v13.8h, v11.8b, v24.8b
    umull       v14.8h, v11.8b, v26.8b
    urshr       v15.8h, v0.8h, #8
    urshr       v16.8h, v1.8h, #8
    urshr       v17.8h, v2.8h, #8
    raddhn      v0.8b, v0.8h, v15.8h
    raddhn      v1.8b, v1.8h, v16.8h
    raddhn      v2.8b, v2.8h, v17.8h
    urshr       v15.8h, v13.8h, #8
    urshr       v16.8h, v12.8h, #8
    urshr       v17.8h, v14.8h, #8
    urshr       v18.8h, v3.8h, #8
    raddhn      v24.8b, v13.8h, v15.8h
    raddhn      v25.8b, v12.8h, v16.8h
    raddhn      v26.8b, v14.8h, v17.8h
    raddhn      v3.8b,  v3.8h,  v18.8h
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {v28, v29, v30, v31}
     */
    mvn         v24.8b, v24.8b
    mvn         v25.8b, v25.8b
    mvn         v26.8b, v26.8b
    mvn         v27.8b, v3.8b
    umull       v12.8h, v24.8b, v4.8b
    umull       v13.8h, v25.8b, v5.8b
    umull       v14.8h, v26.8b, v6.8b
    umull       v15.8h, v27.8b, v7.8b
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    urshr       v16.8h, v12.8h, #8
    urshr       v17.8h, v13.8h, #8
    urshr       v18.8h, v14.8h, #8
    urshr       v19.8h, v15.8h, #8
    raddhn      v28.8b, v16.8h, v12.8h
    raddhn      v29.8b, v17.8h, v13.8h
    raddhn      v30.8b, v18.8h, v14.8h
    raddhn      v31.8b, v19.8h, v15.8h
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    urshr       v16.8h, v12.8h, #8
    urshr       v17.8h, v13.8h, #8
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    urshr       v18.8h, v14.8h, #8
    urshr       v19.8h, v15.8h, #8
    raddhn      v28.8b, v16.8h, v12.8h
    raddhn      v29.8b, v17.8h, v13.8h
    raddhn      v30.8b, v18.8h, v14.8h
    raddhn      v31.8b, v19.8h, v15.8h
    fetch_mask_pixblock
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

/* Solid color from w4 split into per-component registers v8-v11 */
.macro pixman_composite_over_n_8888_8888_ca_init
    mov         v13.s[0], w4
    dup         v8.8b,  v13.b[0]
    dup         v9.8b,  v13.b[1]
    dup         v10.8b, v13.b[2]
    dup         v11.8b, v13.b[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

/*
 * OVER with solid a8r8g8b8 source and per-channel (component-alpha) a8r8g8b8
 * mask onto an r5g6b5 destination.  The solid colour is pre-split into the
 * per-channel planes v8-v11 by the _init macro below.
 */
.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
/*
 * 'combine_mask_ca' replacement
 *
 * input:  solid src (n) in {v8, v9, v10, v11}  [B, G, R, A]
 *         mask in {v24, v25, v26}  [B, G, R]
 * output: updated src in {v0, v1, v2 }  [B, G, R]
 *         updated mask in {v24, v25, v26} [B, G, R]
 */
    umull       v0.8h,  v24.8b, v8.8b
    umull       v1.8h,  v25.8b, v9.8b
    umull       v2.8h,  v26.8b, v10.8b
    umull       v12.8h, v11.8b, v24.8b
    umull       v13.8h, v11.8b, v25.8b
    umull       v14.8h, v11.8b, v26.8b
    /* urshr + raddhn pairs implement the usual x/255 rounding:
     * (x + ((x + 128) >> 8) + 128) >> 8 */
    urshr       v15.8h, v0.8h, #8
    urshr       v16.8h, v1.8h, #8
    urshr       v17.8h, v2.8h, #8
    raddhn      v0.8b, v0.8h, v15.8h
    raddhn      v1.8b, v1.8h, v16.8h
    raddhn      v2.8b, v2.8h, v17.8h
    urshr       v19.8h, v12.8h, #8
    urshr       v20.8h, v13.8h, #8
    urshr       v21.8h, v14.8h, #8
    raddhn      v24.8b, v12.8h, v19.8h
    raddhn      v25.8b, v13.8h, v20.8h
/*
 * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
 * and put data into v16 - blue, v17 - green, v18 - red
 */
    mov         v4.d[1], v5.d[0]
    shrn        v17.8b, v4.8h, #3
    shrn        v18.8b, v4.8h, #8
    raddhn      v26.8b, v14.8h, v21.8h
    sli         v4.8h, v4.8h, #5
    sri         v18.8b, v18.8b, #5
    sri         v17.8b, v17.8b, #6
/*
 * 'combine_over_ca' replacement
 *
 * output: updated dest in v16 - blue, v17 - green, v18 - red
 */
    mvn         v24.8b, v24.8b
    mvn         v25.8b, v25.8b
    shrn        v16.8b, v4.8h, #2
    mvn         v26.8b, v26.8b
    umull       v5.8h, v16.8b, v24.8b
    umull       v6.8h, v17.8b, v25.8b
    umull       v7.8h, v18.8b, v26.8b
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
/* ... continue 'combine_over_ca' replacement */
    urshr       v13.8h, v5.8h, #8
    urshr       v14.8h, v6.8h, #8
    urshr       v15.8h, v7.8h, #8
    raddhn      v16.8b, v13.8h, v5.8h
    raddhn      v17.8b, v14.8h, v6.8h
    raddhn      v18.8b, v15.8h, v7.8h
    uqadd       v16.8b, v0.8b, v16.8b
    uqadd       v17.8b, v1.8b, v17.8b
    uqadd       v18.8b, v2.8b, v18.8b
/*
 * convert the results in v16, v17, v18 to r5g6b5 and store
 * them into {v14}
 */
    ushll       v14.8h, v18.8b, #7
    sli         v14.8h, v14.8h, #1
    ushll       v12.8h, v17.8b, #7
    sli         v12.8h, v12.8h, #1
    ushll       v13.8h, v16.8b, #7
    sli         v13.8h, v13.8h, #1
    sri         v14.8h, v12.8h, #5
    sri         v14.8h, v13.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
.endm

/*
 * Software-pipelined tail+head: finishes/stores the previous pixel block
 * while loading and starting the next one (same operations as tail followed
 * by head, interleaved for scheduling).
 */
.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
    urshr       v13.8h, v5.8h, #8
    urshr       v14.8h, v6.8h, #8
    ld1         {v4.8h}, [DST_R], #16
    urshr       v15.8h, v7.8h, #8
    raddhn      v16.8b, v13.8h, v5.8h
    raddhn      v17.8b, v14.8h, v6.8h
    raddhn      v18.8b, v15.8h, v7.8h
    mov         v5.d[0], v4.d[1]
    /* process_pixblock_head */
/*
 * 'combine_mask_ca' replacement
 *
 * input:  solid src (n) in {v8, v9, v10, v11}  [B, G, R, A]
 *         mask in {v24, v25, v26}  [B, G, R]
 * output: updated src in {v0, v1, v2 }  [B, G, R]
 *         updated mask in {v24, v25, v26} [B, G, R]
 */
    uqadd       v16.8b, v0.8b, v16.8b
    uqadd       v17.8b, v1.8b, v17.8b
    uqadd       v18.8b, v2.8b, v18.8b
    umull       v0.8h, v24.8b, v8.8b
    umull       v1.8h, v25.8b, v9.8b
    umull       v2.8h, v26.8b, v10.8b
/*
 * convert the result in v16, v17, v18 to r5g6b5 and store
 * it into {v14}
 */
    ushll       v14.8h, v18.8b, #7
    sli         v14.8h, v14.8h, #1
    ushll       v18.8h, v16.8b, #7
    sli         v18.8h, v18.8h, #1
    ushll       v19.8h, v17.8b, #7
    sli         v19.8h, v19.8h, #1
    umull       v12.8h, v11.8b, v24.8b
    sri         v14.8h, v19.8h, #5
    umull       v13.8h, v11.8b, v25.8b
    umull       v15.8h, v11.8b, v26.8b
    sri         v14.8h, v18.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
    cache_preload 8, 8
    urshr       v16.8h, v0.8h, #8
    urshr       v17.8h, v1.8h, #8
    urshr       v18.8h, v2.8h, #8
    raddhn      v0.8b, v0.8h, v16.8h
    raddhn      v1.8b, v1.8h, v17.8h
    raddhn      v2.8b, v2.8h, v18.8h
    urshr       v19.8h, v12.8h, #8
    urshr       v20.8h, v13.8h, #8
    urshr       v21.8h, v15.8h, #8
    raddhn      v24.8b, v12.8h, v19.8h
    raddhn      v25.8b, v13.8h, v20.8h
/*
 * convert 8 r5g6b5 pixel data from {v4, v5} to planar
 * 8-bit format and put data into v16 - blue, v17 - green,
 * v18 - red
 */
    mov         v4.d[1], v5.d[0]
    shrn        v17.8b, v4.8h, #3
    shrn        v18.8b, v4.8h, #8
    raddhn      v26.8b, v15.8h, v21.8h
    sli         v4.8h, v4.8h, #5
    sri         v17.8b, v17.8b, #6
    sri         v18.8b, v18.8b, #5
/*
 * 'combine_over_ca' replacement
 *
 * output: updated dest in v16 - blue, v17 - green, v18 - red
 */
    mvn         v24.8b, v24.8b
    mvn         v25.8b, v25.8b
    shrn        v16.8b, v4.8h, #2
    mvn         v26.8b, v26.8b
    umull       v5.8h, v16.8b, v24.8b
    umull       v6.8h, v17.8b, v25.8b
    umull       v7.8h, v18.8b, v26.8b
    st1         {v14.8h}, [DST_W], #16
.endm

/* Load the 32-bit solid source from w4 and splat each byte (B,G,R,A)
 * into its own plane register v8-v11. */
.macro pixman_composite_over_n_8888_0565_ca_init
    mov         v13.s[0], w4
    dup         v8.8b, v13.b[0]
    dup         v9.8b, v13.b[1]
    dup         v10.8b, v13.b[2]
    dup         v11.8b, v13.b[3]
.endm

.macro pixman_composite_over_n_8888_0565_ca_cleanup
.endm

generate_composite_function \
    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_0565_ca_init, \
    pixman_composite_over_n_8888_0565_ca_cleanup, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head

/******************************************************************************/

/* IN with solid source onto an a8 destination: dst = dst * srcA.
 * The solid source alpha is splatted into v3 by the _init macro. */
.macro pixman_composite_in_n_8_process_pixblock_head
    /* expecting source data in {v0, v1, v2, v3} */
    /* and destination data in {v4, v5, v6, v7} */
    umull       v8.8h,  v4.8b, v3.8b
    umull       v9.8h,  v5.8b, v3.8b
    umull       v10.8h, v6.8b, v3.8b
    umull       v11.8h, v7.8b, v3.8b
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail
    /* rounding division by 255: (x + ((x + 128) >> 8) + 128) >> 8 */
    urshr       v14.8h, v8.8h, #8
    urshr       v15.8h, v9.8h, #8
    urshr       v12.8h, v10.8h, #8
    urshr       v13.8h, v11.8h, #8
    raddhn      v28.8b, v8.8h, v14.8h
    raddhn      v29.8b, v9.8h, v15.8h
    raddhn      v30.8b, v10.8h, v12.8h
    raddhn      v31.8b, v11.8h, v13.8h
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail_head
    pixman_composite_in_n_8_process_pixblock_tail
    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    cache_preload 32, 32
    pixman_composite_in_n_8_process_pixblock_head
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

/* Splat the alpha byte of the solid source (passed in w4) across v3. */
.macro pixman_composite_in_n_8_init
    mov         v3.s[0], w4
    dup         v3.8b, v3.b[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    24 /* mask_basereg  */

/* ADD with solid source and a8 mask onto a8 dest: dst += srcA * mask. */
.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {v8, v9, v10, v11} */
    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
    /* and destination data in {v4, v5, v6, v7} */
    /* mask is in v24, v25, v26, v27 */
    umull       v0.8h, v24.8b, v11.8b
    umull       v1.8h, v25.8b, v11.8b
    umull       v2.8h, v26.8b, v11.8b
    umull       v3.8h, v27.8b, v11.8b
    urshr       v12.8h, v0.8h, #8
    urshr       v13.8h, v1.8h, #8
    urshr       v14.8h, v2.8h, #8
    urshr       v15.8h, v3.8h, #8
    raddhn      v0.8b, v0.8h, v12.8h
    raddhn      v1.8b, v1.8h, v13.8h
    raddhn      v2.8b, v2.8h, v14.8h
    raddhn      v3.8b, v3.8h, v15.8h
    /* saturating add onto destination */
    uqadd       v28.8b, v0.8b, v4.8b
    uqadd       v29.8b, v1.8b, v5.8b
    uqadd       v30.8b, v2.8b, v6.8b
    uqadd       v31.8b, v3.8b, v7.8b
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

/* Splat the alpha byte of the solid source (w4) into v11. */
.macro pixman_composite_add_n_8_8_init
    mov         v11.s[0], w4
    dup         v11.8b, v11.b[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head

/******************************************************************************/

/* ADD with a8 source, a8 mask, a8 destination: dst += src * mask. */
.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {v0, v1, v2, v3} */
    /* destination data in {v4, v5, v6, v7} */
    /* mask in {v24, v25, v26, v27} */
    umull       v8.8h, v24.8b, v0.8b
    umull       v9.8h, v25.8b, v1.8b
    umull       v10.8h, v26.8b, v2.8b
    umull       v11.8h, v27.8b, v3.8b
    urshr       v0.8h, v8.8h, #8
    urshr       v1.8h, v9.8h, #8
    urshr       v12.8h, v10.8h, #8
    urshr       v13.8h, v11.8h, #8
    raddhn      v0.8b, v0.8h, v8.8h
    raddhn      v1.8b, v1.8h, v9.8h
    raddhn      v2.8b, v12.8h, v10.8h
    raddhn      v3.8b, v13.8h, v11.8h
    uqadd       v28.8b, v0.8b, v4.8b
    uqadd       v29.8b, v1.8b, v5.8b
    uqadd       v30.8b, v2.8b, v6.8b
    uqadd       v31.8b, v3.8b, v7.8b
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    fetch_mask_pixblock
    fetch_src_pixblock
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head

/******************************************************************************/

/* ADD for 32bpp src/mask/dst: dst += src * maskA (per channel, deinterleaved
 * planes; v27 holds the mask alpha plane). */
.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {v0, v1, v2, v3} */
    /* destination data in {v4, v5, v6, v7} */
    /* mask in {v24, v25, v26, v27} */
    umull       v8.8h, v27.8b, v0.8b
    umull       v9.8h, v27.8b, v1.8b
    umull       v10.8h, v27.8b, v2.8b
    umull       v11.8h, v27.8b, v3.8b
    /* 1 cycle bubble */
    ursra       v8.8h, v8.8h, #8
    ursra       v9.8h, v9.8h, #8
    ursra       v10.8h, v10.8h, #8
    ursra       v11.8h, v11.8h, #8
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
    /* 2 cycle bubble */
    rshrn       v28.8b, v8.8h, #8
    rshrn       v29.8b, v9.8h, #8
    rshrn       v30.8b, v10.8h, #8
    rshrn       v31.8b, v11.8h, #8
    uqadd       v28.8b, v4.8b, v28.8b
    uqadd       v29.8b, v5.8b, v29.8b
    uqadd       v30.8b, v6.8b, v30.8b
    uqadd       v31.8b, v7.8b, v31.8b
.endm

/* Pipelined tail+head with loads/stores interleaved between ALU ops. */
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
    rshrn       v28.8b, v8.8h, #8
    fetch_mask_pixblock
    rshrn       v29.8b, v9.8h, #8
    umull       v8.8h, v27.8b, v0.8b
    rshrn       v30.8b, v10.8h, #8
    umull       v9.8h, v27.8b, v1.8b
    rshrn       v31.8b, v11.8h, #8
    umull       v10.8h, v27.8b, v2.8b
    umull       v11.8h, v27.8b, v3.8b
    uqadd       v28.8b, v4.8b, v28.8b
    uqadd       v29.8b, v5.8b, v29.8b
    uqadd       v30.8b, v6.8b, v30.8b
    uqadd       v31.8b, v7.8b, v31.8b
    ursra       v8.8h, v8.8h, #8
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    ursra       v9.8h, v9.8h, #8
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    ursra       v10.8h, v10.8h, #8

    cache_preload 8, 8

    ursra       v11.8h, v11.8h, #8
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    24 /* mask_basereg  */

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    24 /* mask_basereg  */

/******************************************************************************/

generate_composite_function \
    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    27 /* mask_basereg  */

/******************************************************************************/

/* Splat the solid source (w4) channel bytes into planes v0-v3. */
.macro pixman_composite_add_n_8_8888_init
    mov         v3.s[0], w4
    dup         v0.8b, v3.b[0]
    dup         v1.8b, v3.b[1]
    dup         v2.8b, v3.b[2]
    dup         v3.8b, v3.b[3]
.endm

.macro pixman_composite_add_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8888_init, \
    pixman_composite_add_n_8_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    27 /* mask_basereg  */

/******************************************************************************/

/* Splat the alpha byte of the solid mask (w6) into v27. */
.macro pixman_composite_add_8888_n_8888_init
    mov         v27.s[0], w6
    dup         v27.8b, v27.b[3]
.endm

.macro pixman_composite_add_8888_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8888_n_8888_init, \
    pixman_composite_add_8888_n_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    27 /* mask_basereg  */

/******************************************************************************/

/* OUT_REVERSE: dst = dst * (1 - alpha(src IN maskA)), solid mask in v15. */
.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    /* expecting source data in {v0, v1, v2, v3} */
    /* destination data in {v4, v5, v6, v7} */
    /* solid mask is in v15 */

    /* 'in' */
    umull       v11.8h, v15.8b, v3.8b
    umull       v10.8h, v15.8b, v2.8b
    umull       v9.8h, v15.8b, v1.8b
    umull       v8.8h, v15.8b, v0.8b
    urshr       v16.8h, v11.8h, #8
    urshr       v14.8h, v10.8h, #8
    urshr       v13.8h, v9.8h, #8
    urshr       v12.8h, v8.8h, #8
    raddhn      v3.8b, v11.8h, v16.8h
    raddhn      v2.8b, v10.8h, v14.8h
    raddhn      v1.8b, v9.8h, v13.8h
    raddhn      v0.8b, v8.8h, v12.8h
    mvn         v24.8b, v3.8b /* get inverted alpha */
    /* now do alpha blending */
    umull       v8.8h, v24.8b, v4.8b
    umull       v9.8h, v24.8b, v5.8b
    umull       v10.8h, v24.8b, v6.8b
    umull       v11.8h, v24.8b, v7.8b
.endm

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    urshr       v16.8h, v8.8h, #8
    urshr       v17.8h, v9.8h, #8
    urshr       v18.8h, v10.8h, #8
    urshr       v19.8h, v11.8h, #8
    raddhn      v28.8b, v16.8h, v8.8h
    raddhn      v29.8b, v17.8h, v9.8h
    raddhn      v30.8b, v18.8h, v10.8h
    raddhn      v31.8b, v19.8h, v11.8h
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    12 /* mask_basereg  */

/******************************************************************************/

/* OVER: reuse the out_reverse head; the tail adds the masked source. */
.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    uqadd       v28.8b, v0.8b, v28.8b
    uqadd       v29.8b, v1.8b, v29.8b
    uqadd       v30.8b, v2.8b, v30.8b
    uqadd       v31.8b, v3.8b, v31.8b
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

/* Splat the alpha byte of the solid mask (w6) into v15. */
.macro pixman_composite_over_8888_n_8888_init
    mov         v15.s[0], w6
    dup         v15.8b, v15.b[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    12 /* mask_basereg  */

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    12 /* mask_basereg  */

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    12 /* mask_basereg  */

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    15 /* mask_basereg  */

/******************************************************************************/

/* Plain 24bpp copy: nothing to compute, just stream pixels through. */
.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    st3         {v0.8b, v1.8b, v2.8b}, [DST_W], #24
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* 24bpp -> 32bpp with channel reversal: swap the v0 and v2 planes. */
.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
    mov         v31.8b, v2.8b
    mov         v2.8b, v0.8b
    mov         v0.8b, v31.8b
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
    st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
    fetch_src_pixblock
    mov         v31.8b, v2.8b
    mov         v2.8b, v0.8b
    mov         v0.8b, v31.8b
    cache_preload 8, 8
.endm

/* v3 (the 4th output plane) is kept at constant zero. */
.macro pixman_composite_src_0888_8888_rev_init
    eor         v3.8b, v3.8b, v3.8b
.endm

generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* 24bpp (reversed channel order) -> r5g6b5 conversion. */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    ushll       v8.8h, v1.8b, #7
    sli         v8.8h, v8.8h, #1
    ushll       v9.8h, v2.8b, #7
    sli         v9.8h, v9.8h, #1
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    ushll       v14.8h, v0.8b, #7
    sli         v14.8h, v14.8h, #1
    sri         v14.8h, v8.8h, #5
    sri         v14.8h, v9.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
    ushll       v14.8h, v0.8b, #7
    sli         v14.8h, v14.8h, #1
    fetch_src_pixblock
    sri         v14.8h, v8.8h, #5
    sri         v14.8h, v9.8h, #11
    mov         v28.d[0], v14.d[0]
    mov         v29.d[0], v14.d[1]
    ushll       v8.8h, v1.8b, #7
    sli         v8.8h, v8.8h, #1
    st1         {v14.8h}, [DST_W], #16
    ushll       v9.8h, v2.8b, #7
    sli         v9.8h, v9.8h, #1
.endm

generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* pixbuf conversion: multiply the colour planes v0-v2 by alpha (v3). */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    umull       v8.8h, v3.8b, v0.8b
    umull       v9.8h, v3.8b, v1.8b
    umull       v10.8h, v3.8b, v2.8b
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    urshr       v11.8h, v8.8h, #8
    /* swap v3 and v31 using v30 as scratch */
    mov         v30.8b, v31.8b
    mov         v31.8b, v3.8b
    mov         v3.8b, v30.8b
    urshr       v12.8h, v9.8h, #8
    urshr       v13.8h, v10.8h, #8
    raddhn      v30.8b, v11.8h, v8.8h
    raddhn      v29.8b, v12.8h, v9.8h
    raddhn      v28.8b, v13.8h, v10.8h
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
    urshr       v11.8h, v8.8h, #8
    /* swap v3 and v31 using v30 as scratch (same as in the tail macro) */
    mov         v30.8b, v31.8b
    mov         v31.8b, v3.8b
    /* FIX: this was 'mov v3.8b, v31.8b', which copies v3 back onto itself
     * (v31 already holds v3 at this point) and leaves v3/v31 unswapped --
     * inconsistent with the non-pipelined tail macro above and with the
     * rpixbuf variant, both of which use the v30 scratch copy. */
    mov         v3.8b, v30.8b
    urshr       v12.8h, v9.8h, #8
    urshr       v13.8h, v10.8h, #8
    fetch_src_pixblock
    raddhn      v30.8b, v11.8h, v8.8h
    PF add, PF_X, PF_X, #8
    PF tst, PF_CTL, #0xF
    PF beq, 10f
    PF add, PF_X, PF_X, #8
    PF sub, PF_CTL, PF_CTL, #1
10:
    raddhn      v29.8b, v12.8h, v9.8h
    raddhn      v28.8b, v13.8h, v10.8h
    umull       v8.8h, v3.8b, v0.8b
    umull       v9.8h, v3.8b, v1.8b
    umull       v10.8h, v3.8b, v2.8b
    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
    PF cmp, PF_X, ORIG_W
    PF lsl, DUMMY, PF_X, src_bpp_shift
    PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
    PF ble, 10f
    PF sub, PF_X, PF_X, ORIG_W
    PF subs, PF_CTL, PF_CTL, #0x10
    PF ble, 10f
    PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
    PF ldrsb, DUMMY, [PF_SRC]
10:
.endm
generate_composite_function \ 2691 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ 2692 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2693 8, /* number of pixels, processed in a single block */ \ 2694 10, /* prefetch distance */ \ 2695 default_init, \ 2696 default_cleanup, \ 2697 pixman_composite_src_pixbuf_8888_process_pixblock_head, \ 2698 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ 2699 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ 2700 28, /* dst_w_basereg */ \ 2701 0, /* dst_r_basereg */ \ 2702 0, /* src_basereg */ \ 2703 0 /* mask_basereg */ 2704 2705 /******************************************************************************/ 2706 2707 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head 2708 umull v8.8h, v3.8b, v0.8b 2709 umull v9.8h, v3.8b, v1.8b 2710 umull v10.8h, v3.8b, v2.8b 2711 .endm 2712 2713 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail 2714 urshr v11.8h, v8.8h, #8 2715 mov v30.8b, v31.8b 2716 mov v31.8b, v3.8b 2717 mov v3.8b, v30.8b 2718 urshr v12.8h, v9.8h, #8 2719 urshr v13.8h, v10.8h, #8 2720 raddhn v28.8b, v11.8h, v8.8h 2721 raddhn v29.8b, v12.8h, v9.8h 2722 raddhn v30.8b, v13.8h, v10.8h 2723 .endm 2724 2725 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head 2726 urshr v11.8h, v8.8h, #8 2727 mov v30.8b, v31.8b 2728 mov v31.8b, v3.8b 2729 mov v3.8b, v30.8b 2730 urshr v12.8h, v9.8h, #8 2731 urshr v13.8h, v10.8h, #8 2732 fetch_src_pixblock 2733 raddhn v28.8b, v11.8h, v8.8h 2734 PF add, PF_X, PF_X, #8 2735 PF tst, PF_CTL, #0xF 2736 PF beq, 10f 2737 PF add, PF_X, PF_X, #8 2738 PF sub, PF_CTL, PF_CTL, #1 2739 10: 2740 raddhn v29.8b, v12.8h, v9.8h 2741 raddhn v30.8b, v13.8h, v10.8h 2742 umull v8.8h, v3.8b, v0.8b 2743 umull v9.8h, v3.8b, v1.8b 2744 umull v10.8h, v3.8b, v2.8b 2745 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 2746 PF cmp, PF_X, ORIG_W 2747 PF lsl, DUMMY, PF_X, src_bpp_shift 2748 PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY] 2749 PF ble, 10f 2750 PF 
sub, PF_X, PF_X, ORIG_W 2751 PF subs, PF_CTL, PF_CTL, #0x10 2752 PF ble, 10f 2753 PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift 2754 PF ldrsb, DUMMY, [PF_SRC] 2755 10: 2756 .endm 2757 2758 generate_composite_function \ 2759 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ 2760 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2761 8, /* number of pixels, processed in a single block */ \ 2762 10, /* prefetch distance */ \ 2763 default_init, \ 2764 default_cleanup, \ 2765 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ 2766 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ 2767 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ 2768 28, /* dst_w_basereg */ \ 2769 0, /* dst_r_basereg */ \ 2770 0, /* src_basereg */ \ 2771 0 /* mask_basereg */ 2772 2773 /******************************************************************************/ 2774 2775 .macro pixman_composite_over_0565_8_0565_process_pixblock_head 2776 /* mask is in v15 */ 2777 mov v4.d[0], v8.d[0] 2778 mov v4.d[1], v9.d[0] 2779 mov v13.d[0], v10.d[0] 2780 mov v13.d[1], v11.d[0] 2781 convert_0565_to_x888 v4, v2, v1, v0 2782 convert_0565_to_x888 v13, v6, v5, v4 2783 /* source pixel data is in {v0, v1, v2, XX} */ 2784 /* destination pixel data is in {v4, v5, v6, XX} */ 2785 mvn v7.8b, v15.8b 2786 umull v10.8h, v15.8b, v2.8b 2787 umull v9.8h, v15.8b, v1.8b 2788 umull v8.8h, v15.8b, v0.8b 2789 umull v11.8h, v7.8b, v4.8b 2790 umull v12.8h, v7.8b, v5.8b 2791 umull v13.8h, v7.8b, v6.8b 2792 urshr v19.8h, v10.8h, #8 2793 urshr v18.8h, v9.8h, #8 2794 urshr v17.8h, v8.8h, #8 2795 raddhn v2.8b, v10.8h, v19.8h 2796 raddhn v1.8b, v9.8h, v18.8h 2797 raddhn v0.8b, v8.8h, v17.8h 2798 .endm 2799 2800 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail 2801 urshr v17.8h, v11.8h, #8 2802 urshr v18.8h, v12.8h, #8 2803 urshr v19.8h, v13.8h, #8 2804 raddhn v28.8b, v17.8h, v11.8h 2805 raddhn v29.8b, v18.8h, v12.8h 2806 raddhn v30.8b, v19.8h, v13.8h 2807 uqadd v0.8b, v0.8b, 
v28.8b 2808 uqadd v1.8b, v1.8b, v29.8b 2809 uqadd v2.8b, v2.8b, v30.8b 2810 /* 32bpp result is in {v0, v1, v2, XX} */ 2811 convert_8888_to_0565 v2, v1, v0, v14, v30, v13 2812 mov v28.d[0], v14.d[0] 2813 mov v29.d[0], v14.d[1] 2814 .endm 2815 2816 /* TODO: expand macros and do better instructions scheduling */ 2817 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head 2818 fetch_mask_pixblock 2819 pixman_composite_over_0565_8_0565_process_pixblock_tail 2820 fetch_src_pixblock 2821 ld1 {v10.4h, v11.4h}, [DST_R], #16 2822 cache_preload 8, 8 2823 pixman_composite_over_0565_8_0565_process_pixblock_head 2824 st1 {v14.8h}, [DST_W], #16 2825 .endm 2826 2827 generate_composite_function \ 2828 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ 2829 FLAG_DST_READWRITE, \ 2830 8, /* number of pixels, processed in a single block */ \ 2831 5, /* prefetch distance */ \ 2832 default_init_need_all_regs, \ 2833 default_cleanup_need_all_regs, \ 2834 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2835 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2836 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2837 28, /* dst_w_basereg */ \ 2838 10, /* dst_r_basereg */ \ 2839 8, /* src_basereg */ \ 2840 15 /* mask_basereg */ 2841 2842 /******************************************************************************/ 2843 2844 .macro pixman_composite_over_0565_n_0565_init 2845 mov v15.s[0], w6 2846 dup v15.8b, v15.b[3] 2847 .endm 2848 2849 .macro pixman_composite_over_0565_n_0565_cleanup 2850 .endm 2851 2852 generate_composite_function \ 2853 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ 2854 FLAG_DST_READWRITE, \ 2855 8, /* number of pixels, processed in a single block */ \ 2856 5, /* prefetch distance */ \ 2857 pixman_composite_over_0565_n_0565_init, \ 2858 pixman_composite_over_0565_n_0565_cleanup, \ 2859 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2860 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

/*
 * ADD operator: r5g6b5 source, a8 mask, r5g6b5 destination.
 * Head stage: widen src (v8/v9) and dst (v10/v11) from r5g6b5 to planar
 * x888 and multiply the source channels by the mask.
 */
.macro pixman_composite_add_0565_8_0565_process_pixblock_head
    /* mask is in v15 */
    mov v4.d[0], v8.d[0]
    mov v4.d[1], v9.d[0]
    mov v13.d[0], v10.d[0]
    mov v13.d[1], v11.d[0]
    convert_0565_to_x888 v4, v2, v1, v0
    convert_0565_to_x888 v13, v6, v5, v4
    /* source pixel data is in {v0, v1, v2, XX} */
    /* destination pixel data is in {v4, v5, v6, XX} */
    /* src * mask, per channel (16-bit products) */
    umull v9.8h, v15.8b, v2.8b
    umull v8.8h, v15.8b, v1.8b
    umull v7.8h, v15.8b, v0.8b
    /* urshr + raddhn pair implements rounding division by 255:
     * result = (t + ((t + 128) >> 8) + 128) >> 8 */
    urshr v12.8h, v9.8h, #8
    urshr v11.8h, v8.8h, #8
    urshr v10.8h, v7.8h, #8
    raddhn v2.8b, v9.8h, v12.8h
    raddhn v1.8b, v8.8h, v11.8h
    raddhn v0.8b, v7.8h, v10.8h
.endm

/*
 * Tail stage: saturating add of masked source to destination, then pack
 * the 32bpp result back to r5g6b5 in v14 and mirror it into v28/v29 for
 * the store path.
 */
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
    uqadd v0.8b, v0.8b, v4.8b
    uqadd v1.8b, v1.8b, v5.8b
    uqadd v2.8b, v2.8b, v6.8b
    /* 32bpp result is in {v0, v1, v2, XX} */
    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
    mov v28.d[0], v14.d[0]
    mov v29.d[0], v14.d[1]
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
    fetch_mask_pixblock
    pixman_composite_add_0565_8_0565_process_pixblock_tail
    fetch_src_pixblock
    ld1 {v10.4h, v11.4h}, [DST_R], #16
    cache_preload 8, 8
    pixman_composite_add_0565_8_0565_process_pixblock_head
    st1 {v14.8h}, [DST_W], #16
.endm

generate_composite_function \
    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_add_0565_8_0565_process_pixblock_head, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

/*
 * OUT_REVERSE operator: a8 source (used as alpha), r5g6b5 destination.
 * dst = dst * (1 - src_alpha).
 */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
    /* mask is in v15 */
    mov v12.d[0], v10.d[0]
    mov v12.d[1], v11.d[0]
    convert_0565_to_x888 v12, v6, v5, v4
    /* destination pixel data is in {v4, v5, v6, xx} */
    mvn v24.8b, v15.8b /* get inverted alpha */
    /* now do alpha blending */
    umull v8.8h, v24.8b, v4.8b
    umull v9.8h, v24.8b, v5.8b
    umull v10.8h, v24.8b, v6.8b
.endm

/*
 * Tail stage: round the 16-bit products down to 8 bits (divide by 255
 * with rounding via urshr + raddhn), then repack to r5g6b5.
 */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
    urshr v11.8h, v8.8h, #8
    urshr v12.8h, v9.8h, #8
    urshr v13.8h, v10.8h, #8
    raddhn v0.8b, v11.8h, v8.8h
    raddhn v1.8b, v12.8h, v9.8h
    raddhn v2.8b, v13.8h, v10.8h
    /* 32bpp result is in {v0, v1, v2, XX} */
    convert_8888_to_0565 v2, v1, v0, v14, v12, v3
    mov v28.d[0], v14.d[0]
    mov v29.d[0], v14.d[1]
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
    fetch_src_pixblock
    pixman_composite_out_reverse_8_0565_process_pixblock_tail
    ld1 {v10.4h, v11.4h}, [DST_R], #16
    cache_preload 8, 8
    pixman_composite_out_reverse_8_0565_process_pixblock_head
    st1 {v14.8h}, [DST_W], #16
.endm

generate_composite_function \
    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    15, /* src_basereg */ \
    0   /* mask_basereg */

/******************************************************************************/

/*
 * OUT_REVERSE operator: a8 source, a8r8g8b8 destination.
 * dst = dst * (1 - src_alpha), all four channels.
 */
.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
    /* src is in v0 */
    /* destination pixel data is in {v4, v5, v6, v7} */
    mvn v1.8b, v0.8b /* get inverted alpha */
    /* now do alpha blending */
    umull v8.8h, v1.8b, v4.8b
    umull v9.8h, v1.8b, v5.8b
    umull v10.8h, v1.8b, v6.8b
    umull v11.8h, v1.8b, v7.8b
.endm

/* Tail stage: rounding divide by 255 of the four channel products. */
.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
    urshr v14.8h, v8.8h, #8
    urshr v15.8h, v9.8h, #8
    urshr v12.8h, v10.8h, #8
    urshr v13.8h, v11.8h, #8
    raddhn v28.8b, v14.8h, v8.8h
    raddhn v29.8b, v15.8h, v9.8h
    raddhn v30.8b, v12.8h, v10.8h
    raddhn v31.8b, v13.8h, v11.8h
    /* 32bpp result is in {v28, v29, v30, v31} */
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
    fetch_src_pixblock
    pixman_composite_out_reverse_8_8888_process_pixblock_tail
    ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
    cache_preload 8, 8
    pixman_composite_out_reverse_8_8888_process_pixblock_head
    st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
.endm

generate_composite_function \
    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    0   /* mask_basereg */

/******************************************************************************/

/*
 * Nearest-neighbour scaled scanline variants.  These reuse the pixblock
 * head/tail/tail_head stages of the corresponding unscaled fast paths.
 */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    24  /* mask_basereg */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8,  /* src_basereg */ \
    15  /* mask_basereg */

/******************************************************************************/

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */

/*
 * Fetch one a8r8g8b8 pixel pair (top/bottom rows) for the current X,
 * advance X by UX.  X holds a 16.16 fixed-point source coordinate.
 */
.macro bilinear_load_8888 reg1, reg2, tmp
    asr TMP1, X, #16
    add X, X, UX
    add TMP1, TOP, TMP1, lsl #2
    ld1 {\()\reg1\().2s}, [TMP1], STRIDE
    ld1 {\()\reg2\().2s}, [TMP1]
.endm

/*
 * Fetch one r5g6b5 pixel pair (top/bottom rows) and widen it to
 * packed x888 for interpolation.
 */
.macro bilinear_load_0565 reg1, reg2, tmp
    asr TMP1, X, #16
    add X, X, UX
    add TMP1, TOP, TMP1, lsl #1
    ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
    ld1 {\()\reg2\().s}[1], [TMP1]
    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm

/*
 * Load two horizontally adjacent output pixels' source pairs and do the
 * vertical interpolation step: acc = top * WT (v28) + bottom * WB (v29).
 */
.macro bilinear_load_and_vertical_interpolate_two_8888 \
                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 \reg1, \reg2, \tmp1
    umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
    umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
    bilinear_load_8888 \reg3, \reg4, \tmp2
    umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
    umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
.endm

/* Four-pixel variant: simply two back-to-back two-pixel loads. */
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
        \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
        \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm

/*
 * AArch64 has no two-register vzip/vuzp like AArch32; emulate them with
 * zip1/zip2 (uzp1/uzp2) using v31 as scratch, preserving v31 around it.
 */
.macro vzip reg1, reg2
    umov TMP4, v31.d[0]
    zip1 v31.8b, \reg1, \reg2
    zip2 \reg2, \reg1, \reg2
    mov \reg1, v31.8b
    mov v31.d[0], TMP4
.endm

.macro vuzp reg1, reg2
    umov TMP4, v31.d[0]
    uzp1 v31.8b, \reg1, \reg2
    uzp2 \reg2, \reg1, \reg2
    mov \reg1, v31.8b
    mov v31.d[0], TMP4
.endm

/*
 * Same as the two_8888 variant but for r5g6b5 sources: gather four
 * 16bpp pixels as 32-bit lanes, widen to planar x888, then interleave
 * with vzip so each accumulator sees its own pixel pair.
 */
.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
    asr TMP1, X, #16
    add X, X, UX
    add TMP1, TOP, TMP1, lsl #1
    asr TMP2, X, #16
    add X, X, UX
    add TMP2, TOP, TMP2, lsl #1
    ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
    ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
    ld1 {\()\acc2\().s}[1], [TMP1]
    ld1 {\()\acc2\().s}[3], [TMP2]
    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
    vzip \()\reg1\().8b, \()\reg3\().8b
    vzip \()\reg2\().8b, \()\reg4\().8b
    vzip \()\reg3\().8b, \()\reg4\().8b
    vzip \()\reg1\().8b, \()\reg2\().8b
    umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
    umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
    umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
    umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
.endm

/*
 * Four-pixel r5g6b5 variant; loads for the second pixel pair are
 * interleaved with the shuffle work of the first to hide latency.
 */
.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
    asr TMP1, X, #16
    add X, X, UX
    add TMP1, TOP, TMP1, lsl #1
    asr TMP2, X, #16
    add X, X, UX
    add TMP2, TOP, TMP2, lsl #1
    ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
    ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
    ld1 {\()\xacc2\().s}[1], [TMP1]
    ld1 {\()\xacc2\().s}[3], [TMP2]
    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
    asr TMP1, X, #16
    add X, X, UX
    add TMP1, TOP, TMP1, lsl #1
    asr TMP2, X, #16
    add X, X, UX
    add TMP2, TOP, TMP2, lsl #1
    ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
    vzip \()\xreg1\().8b, \()\xreg3\().8b
    ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
    vzip \()\xreg2\().8b, \()\xreg4\().8b
    ld1 {\()\yacc2\().s}[1], [TMP1]
    vzip \()\xreg3\().8b, \()\xreg4\().8b
    ld1 {\()\yacc2\().s}[3], [TMP2]
    vzip \()\xreg1\().8b, \()\xreg2\().8b
    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
    umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
    vzip \()\yreg1\().8b, \()\yreg3\().8b
    umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
    vzip \()\yreg2\().8b, \()\yreg4\().8b
    umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
    vzip \()\yreg3\().8b, \()\yreg4\().8b
    umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
    vzip \()\yreg1\().8b, \()\yreg2\().8b
    umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
    umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
    umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
    umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
.endm

/* Store 4/2/1 interpolated a8r8g8b8 pixels from {v0, v1}. */
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if \numpix == 4
    st1 {v0.2s, v1.2s}, [OUT], #16
.elseif \numpix == 2
    st1 {v0.2s}, [OUT], #8
.elseif \numpix == 1
    st1 {v0.s}[0], [OUT], #4
.else
    .error bilinear_store_8888 \numpix is unsupported
.endif
.endm

/* Deinterleave to planar, pack to r5g6b5 in v1, store 4/2/1 pixels. */
.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp v0.8b, v1.8b
    vuzp v2.8b, v3.8b
    vuzp v1.8b, v3.8b
    vuzp v0.8b, v2.8b
    convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
.if \numpix == 4
    st1 {v1.4h}, [OUT], #8
.elseif \numpix == 2
    st1 {v1.s}[0], [OUT], #4
.elseif \numpix == 1
    st1 {v1.h}[0], [OUT], #2
.else
    .error bilinear_store_0565 \numpix is unsupported
.endif
.endm

/*
 * Interpolate a single pixel: vertical blend with WT/WB (v28/v29), then
 * horizontal blend using the per-pixel weight in v15.h[0].
 */
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
    bilinear_load_\()\src_fmt v0, v1, v2
    umull v2.8h, v0.8b, v28.8b
    umlal v2.8h, v1.8b, v29.8b
    /* 5 cycles bubble */
    ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v0.4s, v2.4h, v15.h[0]
    umlal2 v0.4s, v2.8h, v15.h[0]
    /* 5 cycles bubble */
    shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    xtn v0.8b, v0.8h
    /* 1 cycle bubble */
    bilinear_store_\()\dst_fmt 1, v3, v4
.endm

/*
 * Interpolate two pixels; horizontal weights come from v15.h[0] and
 * v15.h[4], then v15 and the X accumulator v12 are stepped by v13.
 */
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
        v1, v11, v2, v3, v20, v21, v22, v23
    ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v0.4s, v1.4h, v15.h[0]
    umlal2 v0.4s, v1.8h, v15.h[0]
    ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v10.4s, v11.4h, v15.h[4]
    umlal2 v10.4s, v11.8h, v15.h[4]
    shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add v12.8h, v12.8h, v13.8h
    xtn v0.8b, v0.8h
    bilinear_store_\()\dst_fmt 2, v3, v4
.endm

/* Generic four-pixel interpolation (two two-pixel halves, interleaved). */
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
        v1, v11, v14, v20, v16, v17, v22, v23, \
        v3, v9, v24, v25, v26, v27, v18, v19
    prfm PREFETCH_MODE, [TMP1, PF_OFFS]
    sub TMP1, TMP1, STRIDE
    ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v0.4s, v1.4h, v15.h[0]
    umlal2 v0.4s, v1.8h, v15.h[0]
    ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v10.4s, v11.4h, v15.h[4]
    umlal2 v10.4s, v11.8h, v15.h[4]
    ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v2.4s, v3.4h, v15.h[0]
    umlal2 v2.4s, v3.8h, v15.h[0]
    ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
    prfm PREFETCH_MODE, [TMP2, PF_OFFS]
    umlsl v8.4s, v9.4h, v15.h[4]
    umlal2 v8.4s, v9.8h, v15.h[4]
    add v12.8h, v12.8h, v13.8h
    shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    xtn v0.8b, v0.8h
    xtn v1.8b, v2.8h
    add v12.8h, v12.8h, v13.8h
    bilinear_store_\()\dst_fmt 4, v3, v4
.endm

/*
 * Dispatch wrapper: use a hand-scheduled format-specific implementation
 * when one was declared via have_bilinear_interpolate_four_pixels_*,
 * otherwise fall back to the generic macro above.
 */
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
bilinear_interpolate_four_pixels \src_fmt, \dst_fmt 3330 .endif 3331 .endm 3332 3333 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3334 .ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt 3335 bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail 3336 .endif 3337 .endm 3338 3339 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3340 .ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt 3341 bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head 3342 .else 3343 bilinear_interpolate_four_pixels \src_fmt, \dst_fmt 3344 .endif 3345 .endm 3346 3347 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 3348 .ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt 3349 bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head 3350 .else 3351 bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt 3352 bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt 3353 .endif 3354 .endm 3355 3356 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 3357 .ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt 3358 bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail 3359 .else 3360 bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt 3361 .endif 3362 .endm 3363 3364 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 3365 .ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt 3366 bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head 3367 .else 3368 bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt 3369 bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt 3370 .endif 3371 .endm 3372 3373 .set BILINEAR_FLAG_UNROLL_4, 0 3374 .set BILINEAR_FLAG_UNROLL_8, 1 3375 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 3376 3377 /* 3378 * Main template macro for generating NEON optimized bilinear scanline 3379 * functions. 
3380 * 3381 * Bilinear scanline scaler macro template uses the following arguments: 3382 * fname - name of the function to generate 3383 * src_fmt - source color format (8888 or 0565) 3384 * dst_fmt - destination color format (8888 or 0565) 3385 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes 3386 * prefetch_distance - prefetch in the source image by that many 3387 * pixels ahead 3388 */ 3389 3390 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ 3391 src_bpp_shift, dst_bpp_shift, \ 3392 prefetch_distance, flags 3393 3394 pixman_asm_function \fname 3395 OUT .req x0 3396 TOP .req x1 3397 BOTTOM .req x2 3398 WT .req x3 3399 WB .req x4 3400 X .req x5 3401 UX .req x6 3402 WIDTH .req x7 3403 TMP1 .req x8 3404 TMP2 .req x9 3405 PF_OFFS .req x10 3406 TMP3 .req x11 3407 TMP4 .req x12 3408 STRIDE .req x13 3409 3410 sxtw x3, w3 3411 sxtw x4, w4 3412 sxtw x5, w5 3413 sxtw x6, w6 3414 sxtw x7, w7 3415 3416 stp x29, x30, [sp, -16]! 3417 mov x29, sp 3418 sub sp, sp, 112 /* push all registers */ 3419 sub x29, x29, 64 3420 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 3421 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 3422 stp x8, x9, [x29, -80] 3423 stp x10, x11, [x29, -96] 3424 stp x12, x13, [x29, -112] 3425 3426 mov PF_OFFS, #\prefetch_distance 3427 mul PF_OFFS, PF_OFFS, UX 3428 3429 subs STRIDE, BOTTOM, TOP 3430 .unreq BOTTOM 3431 3432 cmp WIDTH, #0 3433 ble 300f 3434 3435 dup v12.8h, w5 3436 dup v13.8h, w6 3437 dup v28.8b, w3 3438 dup v29.8b, w4 3439 mov v25.d[0], v12.d[1] 3440 mov v26.d[0], v13.d[0] 3441 add v25.4h, v25.4h, v26.4h 3442 mov v12.d[1], v25.d[0] 3443 3444 /* ensure good destination alignment */ 3445 cmp WIDTH, #1 3446 blt 100f 3447 tst OUT, #(1 << \dst_bpp_shift) 3448 beq 100f 3449 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) 3450 add v12.8h, v12.8h, v13.8h 3451 bilinear_interpolate_last_pixel \src_fmt, \dst_fmt 3452 sub WIDTH, WIDTH, #1 3453 100: 3454 add v13.8h, v13.8h, v13.8h 3455 ushr v15.8h, v12.8h, #(16 - 
BILINEAR_INTERPOLATION_BITS) 3456 add v12.8h, v12.8h, v13.8h 3457 3458 cmp WIDTH, #2 3459 blt 100f 3460 tst OUT, #(1 << (\dst_bpp_shift + 1)) 3461 beq 100f 3462 bilinear_interpolate_two_pixels \src_fmt, \dst_fmt 3463 sub WIDTH, WIDTH, #2 3464 100: 3465 .if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0 3466 /*********** 8 pixels per iteration *****************/ 3467 cmp WIDTH, #4 3468 blt 100f 3469 tst OUT, #(1 << (\dst_bpp_shift + 2)) 3470 beq 100f 3471 bilinear_interpolate_four_pixels \src_fmt, \dst_fmt 3472 sub WIDTH, WIDTH, #4 3473 100: 3474 subs WIDTH, WIDTH, #8 3475 blt 100f 3476 asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) 3477 bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt 3478 subs WIDTH, WIDTH, #8 3479 blt 500f 3480 1000: 3481 bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt 3482 subs WIDTH, WIDTH, #8 3483 bge 1000b 3484 500: 3485 bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt 3486 100: 3487 tst WIDTH, #4 3488 beq 200f 3489 bilinear_interpolate_four_pixels \src_fmt, \dst_fmt 3490 200: 3491 .else 3492 /*********** 4 pixels per iteration *****************/ 3493 subs WIDTH, WIDTH, #4 3494 blt 100f 3495 asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift) 3496 bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt 3497 subs WIDTH, WIDTH, #4 3498 blt 500f 3499 1000: 3500 bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt 3501 subs WIDTH, WIDTH, #4 3502 bge 1000b 3503 500: 3504 bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt 3505 100: 3506 /****************************************************/ 3507 .endif 3508 /* handle the remaining trailing pixels */ 3509 tst WIDTH, #2 3510 beq 200f 3511 bilinear_interpolate_two_pixels \src_fmt, \dst_fmt 3512 200: 3513 tst WIDTH, #1 3514 beq 300f 3515 bilinear_interpolate_last_pixel \src_fmt, \dst_fmt 3516 300: 3517 sub x29, x29, 64 3518 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32 3519 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32 3520 ldp x8, x9, [x29, -80] 3521 ldp x10, x11, 
[x29, -96] 3522 ldp x12, x13, [x29, -104] 3523 mov sp, x29 3524 ldp x29, x30, [sp], 16 3525 VERIFY_LR 3526 ret 3527 3528 .unreq OUT 3529 .unreq TOP 3530 .unreq WT 3531 .unreq WB 3532 .unreq X 3533 .unreq UX 3534 .unreq WIDTH 3535 .unreq TMP1 3536 .unreq TMP2 3537 .unreq PF_OFFS 3538 .unreq TMP3 3539 .unreq TMP4 3540 .unreq STRIDE 3541 pixman_end_asm_function 3542 3543 .endm 3544 3545 /*****************************************************************************/ 3546 3547 .set have_bilinear_interpolate_four_pixels_8888_8888, 1 3548 3549 .macro bilinear_interpolate_four_pixels_8888_8888_head 3550 asr TMP1, X, #16 3551 add X, X, UX 3552 add TMP1, TOP, TMP1, lsl #2 3553 asr TMP2, X, #16 3554 add X, X, UX 3555 add TMP2, TOP, TMP2, lsl #2 3556 3557 ld1 {v22.2s}, [TMP1], STRIDE 3558 ld1 {v23.2s}, [TMP1] 3559 asr TMP3, X, #16 3560 add X, X, UX 3561 add TMP3, TOP, TMP3, lsl #2 3562 umull v8.8h, v22.8b, v28.8b 3563 umlal v8.8h, v23.8b, v29.8b 3564 3565 ld1 {v22.2s}, [TMP2], STRIDE 3566 ld1 {v23.2s}, [TMP2] 3567 asr TMP4, X, #16 3568 add X, X, UX 3569 add TMP4, TOP, TMP4, lsl #2 3570 umull v9.8h, v22.8b, v28.8b 3571 umlal v9.8h, v23.8b, v29.8b 3572 3573 ld1 {v22.2s}, [TMP3], STRIDE 3574 ld1 {v23.2s}, [TMP3] 3575 umull v10.8h, v22.8b, v28.8b 3576 umlal v10.8h, v23.8b, v29.8b 3577 3578 ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS 3579 umlsl v0.4s, v8.4h, v15.h[0] 3580 umlal2 v0.4s, v8.8h, v15.h[0] 3581 3582 prfm PREFETCH_MODE, [TMP4, PF_OFFS] 3583 ld1 {v16.2s}, [TMP4], STRIDE 3584 ld1 {v17.2s}, [TMP4] 3585 prfm PREFETCH_MODE, [TMP4, PF_OFFS] 3586 umull v11.8h, v16.8b, v28.8b 3587 umlal v11.8h, v17.8b, v29.8b 3588 3589 ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS 3590 umlsl v1.4s, v9.4h, v15.h[4] 3591 .endm 3592 3593 .macro bilinear_interpolate_four_pixels_8888_8888_tail 3594 umlal2 v1.4s, v9.8h, v15.h[4] 3595 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) 3596 ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS 3597 umlsl v2.4s, v10.4h, v15.h[0] 3598 umlal2 
v2.4s, v10.8h, v15.h[0] 3599 ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS 3600 umlsl v3.4s, v11.4h, v15.h[4] 3601 umlal2 v3.4s, v11.8h, v15.h[4] 3602 add v12.8h, v12.8h, v13.8h 3603 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3604 shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3605 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3606 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) 3607 shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3608 xtn v6.8b, v0.8h 3609 xtn v7.8b, v2.8h 3610 add v12.8h, v12.8h, v13.8h 3611 st1 {v6.2s, v7.2s}, [OUT], #16 3612 .endm 3613 3614 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head 3615 asr TMP1, X, #16 3616 add X, X, UX 3617 add TMP1, TOP, TMP1, lsl #2 3618 asr TMP2, X, #16 3619 add X, X, UX 3620 add TMP2, TOP, TMP2, lsl #2 3621 umlal2 v1.4s, v9.8h, v15.h[4] 3622 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) 3623 ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS 3624 umlsl v2.4s, v10.4h, v15.h[0] 3625 umlal2 v2.4s, v10.8h, v15.h[0] 3626 ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS 3627 ld1 {v20.2s}, [TMP1], STRIDE 3628 umlsl v3.4s, v11.4h, v15.h[4] 3629 umlal2 v3.4s, v11.8h, v15.h[4] 3630 ld1 {v21.2s}, [TMP1] 3631 umull v8.8h, v20.8b, v28.8b 3632 umlal v8.8h, v21.8b, v29.8b 3633 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3634 shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3635 shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3636 ld1 {v22.2s}, [TMP2], STRIDE 3637 shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) 3638 add v12.8h, v12.8h, v13.8h 3639 ld1 {v23.2s}, [TMP2] 3640 umull v9.8h, v22.8b, v28.8b 3641 asr TMP3, X, #16 3642 add X, X, UX 3643 add TMP3, TOP, TMP3, lsl #2 3644 asr TMP4, X, #16 3645 add X, X, UX 3646 add TMP4, TOP, TMP4, lsl #2 3647 umlal v9.8h, v23.8b, v29.8b 3648 ld1 {v22.2s}, [TMP3], STRIDE 3649 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) 3650 ld1 {v23.2s}, [TMP3] 3651 umull v10.8h, v22.8b, v28.8b 3652 
    umlal v10.8h, v23.8b, v29.8b
    xtn v6.8b, v0.8h
    ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
    xtn v7.8b, v4.8h
    umlsl v0.4s, v8.4h, v15.h[0]
    umlal2 v0.4s, v8.8h, v15.h[0]
    prfm PREFETCH_MODE, [TMP4, PF_OFFS]
    ld1 {v16.2s}, [TMP4], STRIDE
    add v12.8h, v12.8h, v13.8h
    ld1 {v17.2s}, [TMP4]
    prfm PREFETCH_MODE, [TMP4, PF_OFFS]
    umull v11.8h, v16.8b, v28.8b
    umlal v11.8h, v17.8b, v29.8b
    /* store the four pixels finished by the tail part of this stage */
    st1 {v6.2s, v7.2s}, [OUT], #16
    ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl v1.4s, v9.4h, v15.h[4]
.endm

/*****************************************************************************/

/*
 * Instantiate the bilinear scanline scalers.  Arguments after the format
 * pair are: src_bpp_shift, dst_bpp_shift, prefetch distance, flags.
 */

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4