pixman-arm-neon-asm.S (129297B)
1 /* 2 * Copyright © 2009 Nokia Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 * 23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 */ 25 26 /* 27 * This file contains implementations of NEON optimized pixel processing 28 * functions. There is no full and detailed tutorial, but some functions 29 * (those which are exposing some new or interesting features) are 30 * extensively commented and can be used as examples. 31 * 32 * You may want to have a look at the comments for following functions: 33 * - pixman_composite_over_8888_0565_asm_neon 34 * - pixman_composite_over_n_8_0565_asm_neon 35 */ 36 37 /* Prevent the stack from becoming executable for no reason... 
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.arch armv7a
.object_arch armv4
.fpu neon
.eabi_attribute 10, 0 /* suppress Tag_FP_arch */
.eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

pixman_syntax_unified

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fallback
 * to simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
 * performs OVER compositing operation.
 * Function fast_composite_over_8888_0565
 * from pixman-fast-path.c does the same in C and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * Template macro quite conveniently takes care of emitting all the necessary
 * code for memory reading and writing (including quite tricky cases of
 * handling unaligned leading/trailing pixels), so we only need to deal with
 * the data in NEON registers.
 *
 * NEON registers allocation in general is recommended to be the following:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following registers allocation:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5 - contain loaded destination pixels (they are needed)
 * d28, d29 - place for storing the result (destination pixels)
 */

/*
 * Step one. We need to have some code to do some arithmetics on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. Common template macro can optionally
 * make our life a bit easier by doing R, G, B, A color components
 * deinterleaving for 32bpp pixel formats (and this feature is used in
 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
 * actually use d0 register for blue channel (a vector of eight 8-bit
 * values), d1 register for green, d2 for red and d3 for alpha. This
 * simple conversion can be also done with a few NEON instructions:
 *
 * Packed to planar conversion:
 *   vuzp.8 d0, d1
 *   vuzp.8 d2, d3
 *   vuzp.8 d1, d3
 *   vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 *   vzip.8 d0, d2
 *   vzip.8 d1, d3
 *   vzip.8 d2, d3
 *   vzip.8 d0, d1
 *
 * But pixel can be loaded directly in planar format using VLD4.8 NEON
 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
 * desirable, that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/*
 * OK, now we got almost everything that we need. Using the above two
 * macros, the work can be done right. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code, which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *   head
 *   while (...) {
 *     tail
 *     head
 *   }
 *   tail
 *
 * It may look a bit weird, but this setup allows to hide instruction
 * latencies better and also utilize dual-issue capability more
 * efficiently (make pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *   pixman_composite_over_8888_0565_process_pixblock_tail
 *   vst1.16     {d28, d29}, [DST_W, :128]!
 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
 *   pixman_composite_over_8888_0565_process_pixblock_head
 *   cache_preload 8, 8
 *
 * Now it also got some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the other one with just arithmetics.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in a bulk. Additionally, destination buffer is already
 * 16 bytes aligned here (which is good for performance).
 *
 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
 * are the aliases for ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * destination buffer (DST_R and DST_W).
 *
 * Another new thing is 'cache_preload' macro. It is used for prefetching
 * data into CPU L2 cache and improve performance when dealing with large
 * images which are far larger than cache size. It uses one argument
 * (actually two, but they need to be the same here) - number of pixels
 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
 * details about this macro. Moreover, if good performance is needed
 * the code from this macro needs to be copied into '*_tail_head' macro
 * and mixed with the rest of code for optimal instructions scheduling.
 * We are actually doing it below.
 *
 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from '*_head', '*_tail'
 * and 'cache_preload' macro) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
 * macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!    /* load next 8 r5g6b5 dst pixels */
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    fetch_src_pixblock
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add, PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst, PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne, PF_X, PF_X, #8
    vmvn.8      d3, d3
                                    PF subne, PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp, PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge, PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subsge, PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!  /* store 8 result pixels */
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using 'generate_composite_function' macro
 * to put all the stuff together. We are specifying the name of the function
 * which we want to get, number of bits per pixel for the source, mask and
 * destination (0 if unused, like mask in this case). Next come some bit
 * flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                             and written, for write-only buffer we would use
 *                             FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for 32bpp format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum what can fit into four 64-bit NEON registers).
 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
 *    prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros, these are 'default_init',
 * 'default_cleanup' here which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else) followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we got implemented above.
 *
 * The last part is the NEON registers allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue
       (d3 already holds inverted source alpha from the init macro) */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

/* load the solid source color from the stack and split it into planar
   components: d0 - blue, d1 - green, d2 - red, d3 - inverted alpha */
.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3  /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    /* expand deinterleaved 8-bit channels to 16 bits:
       d1 - green, d2 - red, d0 - blue (alpha is dropped) */
    vshll.u8    q8, d1, #8
    vshll.u8    q14, d2, #8
    vshll.u8    q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    /* pack into r5g6b5: insert green below red, then blue below green */
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne, PF_X, PF_X, #8
                                    PF subne, PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                                    PF cmp, PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
        vst1.16     {d28, d29}, [DST_W, :128]!
                                    PF subge, PF_X, PF_X, ORIG_W
                                    PF subsge, PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_0565_8888_process_pixblock_head
    /* unpack 8 r5g6b5 pixels from q0 into planar 8-bit channels:
       d28 - blue, d29 - green, d30 - red, d31 - alpha (fully opaque) */
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm

/* intentionally empty - all the work is done in the head macro */
.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_process_pixblock_head
    /* saturating add of source and destination bytes */
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

/* intentionally empty - all the work is done in the head macro */
.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    /* note: prefetch steps by 32 here because a block is 32 8bpp pixels */
    fetch_src_pixblock
                                    PF add, PF_X, PF_X, #32
                                    PF tst, PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne, PF_X, PF_X, #32
                                    PF subne, PF_CTL, PF_CTL, #1
        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp, PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge, PF_X, PF_X, ORIG_W
                                    PF subsge, PF_CTL, PF_CTL, #0x10
        vqadd.u8    q14, q0, q2
                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

/* same saturating add as above, but stepping 8 pixels (32bpp) per block */
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne, PF_X, PF_X, #8
                                    PF subne, PF_CTL, PF_CTL, #1
        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp, PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge, PF_X, PF_X, ORIG_W
                                    PF subsge, PF_CTL, PF_CTL, #0x10
        vqadd.u8    q14, q0, q2
                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* OUT_REVERSE: multiply destination channels by inverted source alpha */
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* divide by 255 with rounding and narrow back to 8 bits per channel */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne, PF_X, PF_X, #8
                                    PF subne, PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp, PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge, PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subsge, PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER is OUT_REVERSE plus a saturating add of the source pixels */
.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne, PF_X, PF_X, #8
                                    PF subne, PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp, PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge, PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subsge, PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
        vrshr.u16   q2, q10, #8
        vrshr.u16   q3, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q2, q10
        vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vqadd.u8    q14, q0, q14
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0x0F
                                    PF addne, PF_X, PF_X, #8
                                    PF subne, PF_CTL, PF_CTL, #1
        vqadd.u8    q15, q1, q15
                                    PF cmp, PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                                    PF subge, PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                                    PF subsge, PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* load solid source color from the stack, split it into planar components
   d0-d3 and precompute the inverted alpha in d24 */
.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d24, d3  /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add, PF_X, PF_X, #8
                                    PF tst, PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne, PF_X, PF_X, #8
                                    PF subne, PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp, PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    /* destination is loaded into d0-d3 here (dst_r_basereg = 0) */
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge, PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subsge, PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

/* splat the solid "source" color into d4-d7 - this function swaps the usual
   register roles (src_basereg = 4, dst_r_basereg = 0, see generate call) */
.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
    vmull.u8    q1,  d24, d9
    vmull.u8    q6,  d24, d10
    vmull.u8    q7,  d24, d11
    vshrn.u16   d6,  q2, #8     /* convert DST_R data to 32-bpp (part1) */
    vshrn.u16   d7,  q2, #3
    vsli.u16    q2,  q2, #5
    vrshr.u16   q8,  q0, #8     /* IN for SRC pixels (part2) */
    vrshr.u16   q9,  q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q9
    vraddhn.u16 d2,  q6, q10
    vraddhn.u16 d3,  q7, q11
    vsri.u8     d6,  d6, #5     /* convert DST_R data to 32-bpp (part2) */
    vsri.u8     d7,  d7, #6
    vmvn.8      d3,  d3
    vshrn.u16   d30, q2, #2
    vmull.u8    q8,  d3, d6     /* now do alpha blending */
    vmull.u8    q9,  d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    /* 1 cycle bubble */
    vqadd.u8    q9,  q0, q13
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vshll.u8    q8,  d19, #8
    vshll.u8    q9,  d18, #8
    vsri.u16    q14, q8, #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshrn.u16   d6,  q2, #8
    fetch_mask_pixblock
    vshrn.u16   d7,  q2, #3
    fetch_src_pixblock
    vmull.u8    q6,  d24, d10
        vrshr.u16   q13, q8, #8
        vrshr.u16   q11, q9, #8
        vrshr.u16   q15, q10, #8
        vraddhn.u16 d16, q8, q13
        vraddhn.u16 d27, q9, q11
        vraddhn.u16 d26, q10, q15
        vqadd.u8    d16, d2, d16
    vmull.u8    q1,  d24, d9
        vqadd.u8    q9,  q0, q13
        vshll.u8    q14, d16, #8
    vmull.u8    q0,  d24, d8
        vshll.u8    q8,  d19, #8
        vshll.u8    q9,  d18, #8
        vsri.u16    q14, q8, #5
    vmull.u8    q7,  d24, d11
        vsri.u16    q14, q9, #11

    cache_preload 8, 8

    vsli.u16    q2,  q2, #5
    vrshr.u16   q8,  q0, #8
    vrshr.u16   q9,  q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q9
    vraddhn.u16 d2,  q6, q10
    vraddhn.u16 d3,  q7, q11
    vsri.u8     d6,  d6, #5
    vsri.u8     d7,  d7, #6
    vmvn.8      d3,  d3
    vshrn.u16   d30, q2, #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8,  d3, d6
    vmull.u8    q9,  d3, d7
    vmull.u8    q10, d3, d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs a special initialization of solid mask.
 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save d8-d15 registers which are callee saved according
 * to ABI. These registers are restored from 'cleanup' macro. All the
 * other NEON registers are caller saved, so can be clobbered freely
 * without introducing any problems.
957 */ 958 .macro pixman_composite_over_n_8_0565_init 959 add DUMMY, sp, #ARGS_STACK_OFFSET 960 vpush {d8-d15} 961 vld1.32 {d11[0]}, [DUMMY] 962 vdup.8 d8, d11[0] 963 vdup.8 d9, d11[1] 964 vdup.8 d10, d11[2] 965 vdup.8 d11, d11[3] 966 .endm 967 968 .macro pixman_composite_over_n_8_0565_cleanup 969 vpop {d8-d15} 970 .endm 971 972 generate_composite_function \ 973 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ 974 FLAG_DST_READWRITE, \ 975 8, /* number of pixels, processed in a single block */ \ 976 5, /* prefetch distance */ \ 977 pixman_composite_over_n_8_0565_init, \ 978 pixman_composite_over_n_8_0565_cleanup, \ 979 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 980 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 981 pixman_composite_over_8888_8_0565_process_pixblock_tail_head 982 983 /******************************************************************************/ 984 985 .macro pixman_composite_over_8888_n_0565_init 986 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 987 vpush {d8-d15} 988 vld1.32 {d24[0]}, [DUMMY] 989 vdup.8 d24, d24[3] 990 .endm 991 992 .macro pixman_composite_over_8888_n_0565_cleanup 993 vpop {d8-d15} 994 .endm 995 996 generate_composite_function \ 997 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ 998 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 999 8, /* number of pixels, processed in a single block */ \ 1000 5, /* prefetch distance */ \ 1001 pixman_composite_over_8888_n_0565_init, \ 1002 pixman_composite_over_8888_n_0565_cleanup, \ 1003 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 1004 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 1005 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 1006 28, /* dst_w_basereg */ \ 1007 4, /* dst_r_basereg */ \ 1008 8, /* src_basereg */ \ 1009 24 /* mask_basereg */ 1010 1011 /******************************************************************************/ 1012 1013 .macro pixman_composite_src_0565_0565_process_pixblock_head 1014 .endm 

.macro pixman_composite_src_0565_0565_process_pixblock_tail
    /* empty: nothing to pipeline for a straight copy */
.endm

/* r5g6b5 -> r5g6b5 copy: store the previous 16 pixels, fetch the next
 * 16, and issue prefetches. */
.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

/* Solid 8-bit fill: q0/q1 hold the replicated byte (set up in init),
 * so each iteration is a single 32-byte store. */
.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/*
 * Load the 32-bit solid value and smear its low byte across d0-d3:
 * three insert-shifts double the replicated width (8 -> 16 -> 32 ->
 * 64 bits), then the result is copied to the remaining registers.
 */
.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

/* Solid r5g6b5 fill: one 128-bit store per 16-pixel block. */
.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the 16-bit solid pixel across d0-d3 (16 -> 32 -> 64 bits,
 * then copy to the other registers). */
.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

/* Solid 32bpp fill: one 128-bit store per 8-pixel block. */
.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

/* Replicate the 32-bit solid pixel across d0-d3. */
.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

/* 32bpp -> 32bpp copy: store previous block, fetch next, prefetch. */
.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* x8r8g8b8 -> a8r8g8b8: force the (undefined) alpha byte to 0xFF by
 * ORing with the 0xFF000000 constant prepared in init (q2). */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr        q0, q0, q2
    vorr        q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
1198 fetch_src_pixblock 1199 vorr q0, q0, q2 1200 vorr q1, q1, q2 1201 cache_preload 8, 8 1202 .endm 1203 1204 .macro pixman_composite_src_x888_8888_init 1205 vmov.u8 q2, #0xFF 1206 vshl.u32 q2, q2, #24 1207 .endm 1208 1209 generate_composite_function \ 1210 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ 1211 FLAG_DST_WRITEONLY, \ 1212 8, /* number of pixels, processed in a single block */ \ 1213 10, /* prefetch distance */ \ 1214 pixman_composite_src_x888_8888_init, \ 1215 default_cleanup, \ 1216 pixman_composite_src_x888_8888_process_pixblock_head, \ 1217 pixman_composite_src_x888_8888_process_pixblock_tail, \ 1218 pixman_composite_src_x888_8888_process_pixblock_tail_head, \ 1219 0, /* dst_w_basereg */ \ 1220 0, /* dst_r_basereg */ \ 1221 0, /* src_basereg */ \ 1222 0 /* mask_basereg */ 1223 1224 /******************************************************************************/ 1225 1226 .macro pixman_composite_src_n_8_8888_process_pixblock_head 1227 /* expecting solid source in {d0, d1, d2, d3} */ 1228 /* mask is in d24 (d25, d26, d27 are unused) */ 1229 1230 /* in */ 1231 vmull.u8 q8, d24, d0 1232 vmull.u8 q9, d24, d1 1233 vmull.u8 q10, d24, d2 1234 vmull.u8 q11, d24, d3 1235 vrsra.u16 q8, q8, #8 1236 vrsra.u16 q9, q9, #8 1237 vrsra.u16 q10, q10, #8 1238 vrsra.u16 q11, q11, #8 1239 .endm 1240 1241 .macro pixman_composite_src_n_8_8888_process_pixblock_tail 1242 vrshrn.u16 d28, q8, #8 1243 vrshrn.u16 d29, q9, #8 1244 vrshrn.u16 d30, q10, #8 1245 vrshrn.u16 d31, q11, #8 1246 .endm 1247 1248 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head 1249 fetch_mask_pixblock 1250 PF add, PF_X, PF_X, #8 1251 vrshrn.u16 d28, q8, #8 1252 PF tst, PF_CTL, #0x0F 1253 vrshrn.u16 d29, q9, #8 1254 PF addne, PF_X, PF_X, #8 1255 vrshrn.u16 d30, q10, #8 1256 PF subne, PF_CTL, PF_CTL, #1 1257 vrshrn.u16 d31, q11, #8 1258 PF cmp, PF_X, ORIG_W 1259 vmull.u8 q8, d24, d0 1260 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 1261 vmull.u8 q9, d24, d1 1262 PF subge, PF_X, PF_X, 
ORIG_W 1263 vmull.u8 q10, d24, d2 1264 PF subsge, PF_CTL, PF_CTL, #0x10 1265 vmull.u8 q11, d24, d3 1266 PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 1267 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1268 vrsra.u16 q8, q8, #8 1269 vrsra.u16 q9, q9, #8 1270 vrsra.u16 q10, q10, #8 1271 vrsra.u16 q11, q11, #8 1272 .endm 1273 1274 .macro pixman_composite_src_n_8_8888_init 1275 add DUMMY, sp, #ARGS_STACK_OFFSET 1276 vld1.32 {d3[0]}, [DUMMY] 1277 vdup.8 d0, d3[0] 1278 vdup.8 d1, d3[1] 1279 vdup.8 d2, d3[2] 1280 vdup.8 d3, d3[3] 1281 .endm 1282 1283 .macro pixman_composite_src_n_8_8888_cleanup 1284 .endm 1285 1286 generate_composite_function \ 1287 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ 1288 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 1289 8, /* number of pixels, processed in a single block */ \ 1290 5, /* prefetch distance */ \ 1291 pixman_composite_src_n_8_8888_init, \ 1292 pixman_composite_src_n_8_8888_cleanup, \ 1293 pixman_composite_src_n_8_8888_process_pixblock_head, \ 1294 pixman_composite_src_n_8_8888_process_pixblock_tail, \ 1295 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ 1296 1297 /******************************************************************************/ 1298 1299 .macro pixman_composite_src_n_8_8_process_pixblock_head 1300 vmull.u8 q0, d24, d16 1301 vmull.u8 q1, d25, d16 1302 vmull.u8 q2, d26, d16 1303 vmull.u8 q3, d27, d16 1304 vrsra.u16 q0, q0, #8 1305 vrsra.u16 q1, q1, #8 1306 vrsra.u16 q2, q2, #8 1307 vrsra.u16 q3, q3, #8 1308 .endm 1309 1310 .macro pixman_composite_src_n_8_8_process_pixblock_tail 1311 vrshrn.u16 d28, q0, #8 1312 vrshrn.u16 d29, q1, #8 1313 vrshrn.u16 d30, q2, #8 1314 vrshrn.u16 d31, q3, #8 1315 .endm 1316 1317 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head 1318 fetch_mask_pixblock 1319 PF add, PF_X, PF_X, #8 1320 vrshrn.u16 d28, q0, #8 1321 PF tst, PF_CTL, #0x0F 1322 vrshrn.u16 d29, q1, #8 1323 PF addne, PF_X, PF_X, #8 1324 vrshrn.u16 d30, q2, #8 1325 PF subne, PF_CTL, 
PF_CTL, #1 1326 vrshrn.u16 d31, q3, #8 1327 PF cmp, PF_X, ORIG_W 1328 vmull.u8 q0, d24, d16 1329 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 1330 vmull.u8 q1, d25, d16 1331 PF subge, PF_X, PF_X, ORIG_W 1332 vmull.u8 q2, d26, d16 1333 PF subsge, PF_CTL, PF_CTL, #0x10 1334 vmull.u8 q3, d27, d16 1335 PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 1336 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 1337 vrsra.u16 q0, q0, #8 1338 vrsra.u16 q1, q1, #8 1339 vrsra.u16 q2, q2, #8 1340 vrsra.u16 q3, q3, #8 1341 .endm 1342 1343 .macro pixman_composite_src_n_8_8_init 1344 add DUMMY, sp, #ARGS_STACK_OFFSET 1345 vld1.32 {d16[0]}, [DUMMY] 1346 vdup.8 d16, d16[3] 1347 .endm 1348 1349 .macro pixman_composite_src_n_8_8_cleanup 1350 .endm 1351 1352 generate_composite_function \ 1353 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ 1354 FLAG_DST_WRITEONLY, \ 1355 32, /* number of pixels, processed in a single block */ \ 1356 5, /* prefetch distance */ \ 1357 pixman_composite_src_n_8_8_init, \ 1358 pixman_composite_src_n_8_8_cleanup, \ 1359 pixman_composite_src_n_8_8_process_pixblock_head, \ 1360 pixman_composite_src_n_8_8_process_pixblock_tail, \ 1361 pixman_composite_src_n_8_8_process_pixblock_tail_head 1362 1363 /******************************************************************************/ 1364 1365 .macro pixman_composite_over_n_8_8888_process_pixblock_head 1366 /* expecting deinterleaved source data in {d8, d9, d10, d11} */ 1367 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 1368 /* and destination data in {d4, d5, d6, d7} */ 1369 /* mask is in d24 (d25, d26, d27 are unused) */ 1370 1371 /* in */ 1372 vmull.u8 q6, d24, d8 1373 vmull.u8 q7, d24, d9 1374 vmull.u8 q8, d24, d10 1375 vmull.u8 q9, d24, d11 1376 vrshr.u16 q10, q6, #8 1377 vrshr.u16 q11, q7, #8 1378 vrshr.u16 q12, q8, #8 1379 vrshr.u16 q13, q9, #8 1380 vraddhn.u16 d0, q6, q10 1381 vraddhn.u16 d1, q7, q11 1382 vraddhn.u16 d2, q8, q12 1383 vraddhn.u16 d3, q9, q13 1384 vmvn.8 d25, d3 /* get inverted alpha 
 */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8,  d25, d4
    vmull.u8    q9,  d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

/* Finish the blend: round/narrow dst * (255 - alpha) and saturating-add
 * the masked source; result lands in q14/q15 (d28-d31). */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6,  q10, #8
    vrshr.u16   q7,  q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6,  q10
    vraddhn.u16 d31, q7,  q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* Pipelined tail+head with interleaved advanced-prefetch (PF) ARM
 * instructions; instruction order is deliberate scheduling. */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q15, q9, #8
    fetch_mask_pixblock
    vrshr.u16   q6, q10, #8
    PF add,     PF_X, PF_X, #8
    vrshr.u16   q7, q11, #8
    PF tst,     PF_CTL, #0x0F
    vraddhn.u16 d28, q14, q8
    PF addne,   PF_X, PF_X, #8
    vraddhn.u16 d29, q15, q9
    PF subne,   PF_CTL, PF_CTL, #1
    vraddhn.u16 d30, q6, q10
    PF cmp,     PF_X, ORIG_W
    vraddhn.u16 d31, q7, q11
    PF pld,     [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q6, d24, d8
    PF pld,     [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q7, d24, d9
    PF subge,   PF_X, PF_X, ORIG_W
    vmull.u8    q8, d24, d10
    PF subsge,  PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d24, d11
    PF ldrbge,  DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q14, q0, q14
    PF ldrbge,  DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vqadd.u8    q15, q1, q15
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vmvn.8      d25, d3
    vmull.u8    q8,  d25, d4
    vmull.u8    q9,  d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

/* Load the solid source pixel and split it into d8-d11; saves the
 * callee-saved d8-d15 range, restored in cleanup. */
.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8,  d11[0]
    vdup.8      d9,  d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER of a solid 8-bit value (alpha in d8) through an a8 mask block
 * (d24-d27) onto an a8 destination block (d4-d7). */
.macro pixman_composite_over_n_8_8_process_pixblock_head
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d8
    vmull.u8    q6, d26, d8
    vmull.u8    q7, d27, d8
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vmvn.8      q12, q0     /* inverted (mask IN src) values */
    vmvn.8      q13, q1
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

/* Broadcast the alpha byte of the solid value into d8; d8-d15 are
 * callee saved and restored in cleanup. */
.macro pixman_composite_over_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d8[0]}, [DUMMY]
    vdup.8      d8, d8[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0,  d24, d8        /* src IN componentwise mask */
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q7,  d27, d11
    vmull.u8    q9,  d11, d25       /* mask IN src alpha */
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8,  q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q10
    vraddhn.u16 d2,  q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vrshr.u16   q10, q7,  #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3,  q7,  q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      q12, q12            /* invert the per-component mask */
    vmvn.8      d26, d26
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q6,  q10, #8
    vrshr.u16   q7,  q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6,  q10
    vraddhn.u16 d31, q7,  q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* Pipelined tail+head: completes the previous block, then re-invokes
 * the head macro for the freshly fetched mask/dest data. */
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    fetch_mask_pixblock
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Load the solid source pixel, split into d8-d11 (callee-saved range,
 * restored in cleanup). */
.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8,  d11[0]
    vdup.8      d9,  d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}       [B, G, R]
     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}       [B, G, R]
     */
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q9,  d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8,  q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q10
    vraddhn.u16 d2,  q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
     * and put data into d16 - blue, d17 - green, d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2,  q2, #5
    vsri.u8     d18, d18, #5    /* replicate high bits into the low bits */
    vsri.u8     d17, d17, #6
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12        /* invert the per-component mask */
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q6,  d16, d24
    vmull.u8    q7,  d17, d25
    vmull.u8    q11, d18, d26
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q14, q7,  #8
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d18, q15, q11
    vqadd.u8    q8,  q0, q8
    vqadd.u8    d18, d2, d18
    /*
     * convert the results in d16, d17, d18 to r5g6b5 and store
     * them into {d28, d29}
     */
    vshll.u8    q14, d18, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vsri.u16    q14, q10, #5
    vsri.u16    q14, q15, #11
.endm

/* Pipelined tail+head; inlines the head macro body so its instructions
 * can be interleaved with the tail's - order is deliberate. */
.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vld1.16     {d4, d5}, [DST_R, :128]!
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d22, q15, q11
    /* process_pixblock_head */
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}       [B, G, R]
     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}       [B, G, R]
     */
    vmull.u8    q6, d26, d10
    vqadd.u8    q8, q0, q8
    vmull.u8    q0, d24, d8
    vqadd.u8    d22, d2, d22
    vmull.u8    q1, d25, d9
    /*
     * convert the result in d16, d17, d22 to r5g6b5 and store
     * it into {d28, d29}
     */
    vshll.u8    q14, d22, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vmull.u8    q9, d11, d25
    vsri.u16    q14, q10, #5
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vsri.u16    q14, q15, #11
    cache_preload 8, 8
    vrshr.u16   q8,  q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0,  q0, q8
    vraddhn.u16 d1,  q1, q10
    vraddhn.u16 d2,  q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar
     * 8-bit format and put data into d16 - blue, d17 - green,
     * d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2,  q2, #5
    vsri.u8     d17, d17, #6
    vsri.u8     d18, d18, #5
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q7,  d17, d25
    vmull.u8    q6,  d16, d24
    vmull.u8    q11, d18, d26
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

/* Load the solid source pixel, split into d8-d11 (callee-saved range,
 * restored in cleanup). */
.macro pixman_composite_over_n_8888_0565_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8,  d11[0]
    vdup.8      d9,  d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_0565_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_0565_ca_init, \
    pixman_composite_over_n_8888_0565_ca_cleanup, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head

/******************************************************************************/

/* IN operator: dest multiplied by the solid source's alpha (d3). */
.macro pixman_composite_in_n_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* and destination data in {d4, d5, d6, d7} */
    vmull.u8    q8,  d4, d3
    vmull.u8    q9,  d5, d3
    vmull.u8    q10, d6, d3
    vmull.u8    q11, d7, d3
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q8,  q14
    vraddhn.u16 d29, q9,  q15
    vraddhn.u16 d30, q10, q12
    vraddhn.u16 d31, q11, q13
.endm

/* Simple (non-interleaved) pipelining: tail, reload dest, head, store. */
.macro pixman_composite_in_n_8_process_pixblock_tail_head
    pixman_composite_in_n_8_process_pixblock_tail
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 32, 32
    pixman_composite_in_n_8_process_pixblock_head
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Broadcast the alpha byte of the solid source into d3. */
.macro pixman_composite_in_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/* ADD of a solid value (alpha in d11) through an a8 mask onto an a8
 * dest: mask IN solid, then saturating add with the destination. */
.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2     /* saturating add with dest */
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

/* Broadcast the alpha byte of the solid source into d11 (d8-d15 are
 * callee saved and restored in cleanup). */
.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head

/******************************************************************************/

/* ADD with a8 source, mask and dest: src IN mask, then saturating add
 * with the destination. */
.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8,  d24, d0
    vmull.u8    q9,  d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0,  q8,  #8
    vrshr.u16   q1,  q9,  #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0,  q8
    vraddhn.u16 d1, q1,  q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1957 fetch_mask_pixblock 1958 fetch_src_pixblock 1959 cache_preload 32, 32 1960 pixman_composite_add_8_8_8_process_pixblock_head 1961 .endm 1962 1963 .macro pixman_composite_add_8_8_8_init 1964 .endm 1965 1966 .macro pixman_composite_add_8_8_8_cleanup 1967 .endm 1968 1969 generate_composite_function \ 1970 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ 1971 FLAG_DST_READWRITE, \ 1972 32, /* number of pixels, processed in a single block */ \ 1973 5, /* prefetch distance */ \ 1974 pixman_composite_add_8_8_8_init, \ 1975 pixman_composite_add_8_8_8_cleanup, \ 1976 pixman_composite_add_8_8_8_process_pixblock_head, \ 1977 pixman_composite_add_8_8_8_process_pixblock_tail, \ 1978 pixman_composite_add_8_8_8_process_pixblock_tail_head 1979 1980 /******************************************************************************/ 1981 1982 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head 1983 /* expecting source data in {d0, d1, d2, d3} */ 1984 /* destination data in {d4, d5, d6, d7} */ 1985 /* mask in {d24, d25, d26, d27} */ 1986 vmull.u8 q8, d27, d0 1987 vmull.u8 q9, d27, d1 1988 vmull.u8 q10, d27, d2 1989 vmull.u8 q11, d27, d3 1990 /* 1 cycle bubble */ 1991 vrsra.u16 q8, q8, #8 1992 vrsra.u16 q9, q9, #8 1993 vrsra.u16 q10, q10, #8 1994 vrsra.u16 q11, q11, #8 1995 .endm 1996 1997 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail 1998 /* 2 cycle bubble */ 1999 vrshrn.u16 d28, q8, #8 2000 vrshrn.u16 d29, q9, #8 2001 vrshrn.u16 d30, q10, #8 2002 vrshrn.u16 d31, q11, #8 2003 vqadd.u8 q14, q2, q14 2004 /* 1 cycle bubble */ 2005 vqadd.u8 q15, q3, q15 2006 .endm 2007 2008 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2009 fetch_src_pixblock 2010 vrshrn.u16 d28, q8, #8 2011 fetch_mask_pixblock 2012 vrshrn.u16 d29, q9, #8 2013 vmull.u8 q8, d27, d0 2014 vrshrn.u16 d30, q10, #8 2015 vmull.u8 q9, d27, d1 2016 vrshrn.u16 d31, q11, #8 2017 vmull.u8 q10, d27, d2 2018 vqadd.u8 q14, q2, q14 2019 vmull.u8 q11, d27, d3 2020 vqadd.u8 q15, q3, 
q15 2021 vrsra.u16 q8, q8, #8 2022 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2023 vrsra.u16 q9, q9, #8 2024 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2025 vrsra.u16 q10, q10, #8 2026 2027 cache_preload 8, 8 2028 2029 vrsra.u16 q11, q11, #8 2030 .endm 2031 2032 generate_composite_function \ 2033 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ 2034 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2035 8, /* number of pixels, processed in a single block */ \ 2036 10, /* prefetch distance */ \ 2037 default_init, \ 2038 default_cleanup, \ 2039 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2040 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2041 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2042 2043 generate_composite_function_single_scanline \ 2044 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ 2045 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2046 8, /* number of pixels, processed in a single block */ \ 2047 default_init, \ 2048 default_cleanup, \ 2049 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2050 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2051 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2052 2053 /******************************************************************************/ 2054 2055 generate_composite_function \ 2056 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ 2057 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2058 8, /* number of pixels, processed in a single block */ \ 2059 5, /* prefetch distance */ \ 2060 default_init, \ 2061 default_cleanup, \ 2062 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2063 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2064 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2065 28, /* dst_w_basereg */ \ 2066 4, /* dst_r_basereg */ \ 2067 0, /* src_basereg */ \ 2068 27 /* mask_basereg */ 2069 2070 
/******************************************************************************/ 2071 2072 .macro pixman_composite_add_n_8_8888_init 2073 add DUMMY, sp, #ARGS_STACK_OFFSET 2074 vld1.32 {d3[0]}, [DUMMY] 2075 vdup.8 d0, d3[0] 2076 vdup.8 d1, d3[1] 2077 vdup.8 d2, d3[2] 2078 vdup.8 d3, d3[3] 2079 .endm 2080 2081 .macro pixman_composite_add_n_8_8888_cleanup 2082 .endm 2083 2084 generate_composite_function \ 2085 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ 2086 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2087 8, /* number of pixels, processed in a single block */ \ 2088 5, /* prefetch distance */ \ 2089 pixman_composite_add_n_8_8888_init, \ 2090 pixman_composite_add_n_8_8888_cleanup, \ 2091 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2092 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2093 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2094 28, /* dst_w_basereg */ \ 2095 4, /* dst_r_basereg */ \ 2096 0, /* src_basereg */ \ 2097 27 /* mask_basereg */ 2098 2099 /******************************************************************************/ 2100 2101 .macro pixman_composite_add_8888_n_8888_init 2102 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2103 vld1.32 {d27[0]}, [DUMMY] 2104 vdup.8 d27, d27[3] 2105 .endm 2106 2107 .macro pixman_composite_add_8888_n_8888_cleanup 2108 .endm 2109 2110 generate_composite_function \ 2111 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ 2112 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2113 8, /* number of pixels, processed in a single block */ \ 2114 5, /* prefetch distance */ \ 2115 pixman_composite_add_8888_n_8888_init, \ 2116 pixman_composite_add_8888_n_8888_cleanup, \ 2117 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2118 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2119 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2120 28, /* dst_w_basereg */ \ 2121 4, /* dst_r_basereg */ \ 2122 0, /* src_basereg */ \ 2123 27 /* 
mask_basereg */ 2124 2125 /******************************************************************************/ 2126 2127 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2128 /* expecting source data in {d0, d1, d2, d3} */ 2129 /* destination data in {d4, d5, d6, d7} */ 2130 /* solid mask is in d15 */ 2131 2132 /* 'in' */ 2133 vmull.u8 q8, d15, d3 2134 vmull.u8 q6, d15, d2 2135 vmull.u8 q5, d15, d1 2136 vmull.u8 q4, d15, d0 2137 vrshr.u16 q13, q8, #8 2138 vrshr.u16 q12, q6, #8 2139 vrshr.u16 q11, q5, #8 2140 vrshr.u16 q10, q4, #8 2141 vraddhn.u16 d3, q8, q13 2142 vraddhn.u16 d2, q6, q12 2143 vraddhn.u16 d1, q5, q11 2144 vraddhn.u16 d0, q4, q10 2145 vmvn.8 d24, d3 /* get inverted alpha */ 2146 /* now do alpha blending */ 2147 vmull.u8 q8, d24, d4 2148 vmull.u8 q9, d24, d5 2149 vmull.u8 q10, d24, d6 2150 vmull.u8 q11, d24, d7 2151 .endm 2152 2153 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2154 vrshr.u16 q14, q8, #8 2155 vrshr.u16 q15, q9, #8 2156 vrshr.u16 q12, q10, #8 2157 vrshr.u16 q13, q11, #8 2158 vraddhn.u16 d28, q14, q8 2159 vraddhn.u16 d29, q15, q9 2160 vraddhn.u16 d30, q12, q10 2161 vraddhn.u16 d31, q13, q11 2162 .endm 2163 2164 /* TODO: expand macros and do better instructions scheduling */ 2165 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head 2166 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2167 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2168 fetch_src_pixblock 2169 cache_preload 8, 8 2170 fetch_mask_pixblock 2171 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2172 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
2173 .endm 2174 2175 generate_composite_function_single_scanline \ 2176 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ 2177 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2178 8, /* number of pixels, processed in a single block */ \ 2179 default_init_need_all_regs, \ 2180 default_cleanup_need_all_regs, \ 2181 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ 2182 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ 2183 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ 2184 28, /* dst_w_basereg */ \ 2185 4, /* dst_r_basereg */ \ 2186 0, /* src_basereg */ \ 2187 12 /* mask_basereg */ 2188 2189 /******************************************************************************/ 2190 2191 .macro pixman_composite_over_8888_n_8888_process_pixblock_head 2192 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2193 .endm 2194 2195 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail 2196 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2197 vqadd.u8 q14, q0, q14 2198 vqadd.u8 q15, q1, q15 2199 .endm 2200 2201 /* TODO: expand macros and do better instructions scheduling */ 2202 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head 2203 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2204 pixman_composite_over_8888_n_8888_process_pixblock_tail 2205 fetch_src_pixblock 2206 cache_preload 8, 8 2207 pixman_composite_over_8888_n_8888_process_pixblock_head 2208 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
2209 .endm 2210 2211 .macro pixman_composite_over_8888_n_8888_init 2212 add DUMMY, sp, #48 2213 vpush {d8-d15} 2214 vld1.32 {d15[0]}, [DUMMY] 2215 vdup.8 d15, d15[3] 2216 .endm 2217 2218 .macro pixman_composite_over_8888_n_8888_cleanup 2219 vpop {d8-d15} 2220 .endm 2221 2222 generate_composite_function \ 2223 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ 2224 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2225 8, /* number of pixels, processed in a single block */ \ 2226 5, /* prefetch distance */ \ 2227 pixman_composite_over_8888_n_8888_init, \ 2228 pixman_composite_over_8888_n_8888_cleanup, \ 2229 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2230 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2231 pixman_composite_over_8888_n_8888_process_pixblock_tail_head 2232 2233 /******************************************************************************/ 2234 2235 /* TODO: expand macros and do better instructions scheduling */ 2236 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head 2237 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2238 pixman_composite_over_8888_n_8888_process_pixblock_tail 2239 fetch_src_pixblock 2240 cache_preload 8, 8 2241 fetch_mask_pixblock 2242 pixman_composite_over_8888_n_8888_process_pixblock_head 2243 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
2244 .endm 2245 2246 generate_composite_function \ 2247 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ 2248 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2249 8, /* number of pixels, processed in a single block */ \ 2250 5, /* prefetch distance */ \ 2251 default_init_need_all_regs, \ 2252 default_cleanup_need_all_regs, \ 2253 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2254 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2255 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 2256 28, /* dst_w_basereg */ \ 2257 4, /* dst_r_basereg */ \ 2258 0, /* src_basereg */ \ 2259 12 /* mask_basereg */ 2260 2261 generate_composite_function_single_scanline \ 2262 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ 2263 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2264 8, /* number of pixels, processed in a single block */ \ 2265 default_init_need_all_regs, \ 2266 default_cleanup_need_all_regs, \ 2267 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2268 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2269 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 2270 28, /* dst_w_basereg */ \ 2271 4, /* dst_r_basereg */ \ 2272 0, /* src_basereg */ \ 2273 12 /* mask_basereg */ 2274 2275 /******************************************************************************/ 2276 2277 /* TODO: expand macros and do better instructions scheduling */ 2278 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head 2279 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2280 pixman_composite_over_8888_n_8888_process_pixblock_tail 2281 fetch_src_pixblock 2282 cache_preload 8, 8 2283 fetch_mask_pixblock 2284 pixman_composite_over_8888_n_8888_process_pixblock_head 2285 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
2286 .endm 2287 2288 generate_composite_function \ 2289 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ 2290 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2291 8, /* number of pixels, processed in a single block */ \ 2292 5, /* prefetch distance */ \ 2293 default_init_need_all_regs, \ 2294 default_cleanup_need_all_regs, \ 2295 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2296 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2297 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ 2298 28, /* dst_w_basereg */ \ 2299 4, /* dst_r_basereg */ \ 2300 0, /* src_basereg */ \ 2301 15 /* mask_basereg */ 2302 2303 /******************************************************************************/ 2304 2305 .macro pixman_composite_src_0888_0888_process_pixblock_head 2306 .endm 2307 2308 .macro pixman_composite_src_0888_0888_process_pixblock_tail 2309 .endm 2310 2311 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head 2312 vst3.8 {d0, d1, d2}, [DST_W]! 
2313 fetch_src_pixblock 2314 cache_preload 8, 8 2315 .endm 2316 2317 generate_composite_function \ 2318 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ 2319 FLAG_DST_WRITEONLY, \ 2320 8, /* number of pixels, processed in a single block */ \ 2321 10, /* prefetch distance */ \ 2322 default_init, \ 2323 default_cleanup, \ 2324 pixman_composite_src_0888_0888_process_pixblock_head, \ 2325 pixman_composite_src_0888_0888_process_pixblock_tail, \ 2326 pixman_composite_src_0888_0888_process_pixblock_tail_head, \ 2327 0, /* dst_w_basereg */ \ 2328 0, /* dst_r_basereg */ \ 2329 0, /* src_basereg */ \ 2330 0 /* mask_basereg */ 2331 2332 /******************************************************************************/ 2333 2334 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head 2335 vswp d0, d2 2336 .endm 2337 2338 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail 2339 .endm 2340 2341 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head 2342 vst4.8 {d0, d1, d2, d3}, [DST_W]! 
2343 fetch_src_pixblock 2344 vswp d0, d2 2345 cache_preload 8, 8 2346 .endm 2347 2348 .macro pixman_composite_src_0888_8888_rev_init 2349 veor d3, d3, d3 2350 .endm 2351 2352 generate_composite_function \ 2353 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ 2354 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2355 8, /* number of pixels, processed in a single block */ \ 2356 10, /* prefetch distance */ \ 2357 pixman_composite_src_0888_8888_rev_init, \ 2358 default_cleanup, \ 2359 pixman_composite_src_0888_8888_rev_process_pixblock_head, \ 2360 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ 2361 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ 2362 0, /* dst_w_basereg */ \ 2363 0, /* dst_r_basereg */ \ 2364 0, /* src_basereg */ \ 2365 0 /* mask_basereg */ 2366 2367 /******************************************************************************/ 2368 2369 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head 2370 vshll.u8 q8, d1, #8 2371 vshll.u8 q9, d2, #8 2372 .endm 2373 2374 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail 2375 vshll.u8 q14, d0, #8 2376 vsri.u16 q14, q8, #5 2377 vsri.u16 q14, q9, #11 2378 .endm 2379 2380 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head 2381 vshll.u8 q14, d0, #8 2382 fetch_src_pixblock 2383 vsri.u16 q14, q8, #5 2384 vsri.u16 q14, q9, #11 2385 vshll.u8 q8, d1, #8 2386 vst1.16 {d28, d29}, [DST_W, :128]! 
2387 vshll.u8 q9, d2, #8 2388 .endm 2389 2390 generate_composite_function \ 2391 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ 2392 FLAG_DST_WRITEONLY, \ 2393 8, /* number of pixels, processed in a single block */ \ 2394 10, /* prefetch distance */ \ 2395 default_init, \ 2396 default_cleanup, \ 2397 pixman_composite_src_0888_0565_rev_process_pixblock_head, \ 2398 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ 2399 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ 2400 28, /* dst_w_basereg */ \ 2401 0, /* dst_r_basereg */ \ 2402 0, /* src_basereg */ \ 2403 0 /* mask_basereg */ 2404 2405 /******************************************************************************/ 2406 2407 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head 2408 vmull.u8 q8, d3, d0 2409 vmull.u8 q9, d3, d1 2410 vmull.u8 q10, d3, d2 2411 .endm 2412 2413 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail 2414 vrshr.u16 q11, q8, #8 2415 vswp d3, d31 2416 vrshr.u16 q12, q9, #8 2417 vrshr.u16 q13, q10, #8 2418 vraddhn.u16 d30, q11, q8 2419 vraddhn.u16 d29, q12, q9 2420 vraddhn.u16 d28, q13, q10 2421 .endm 2422 2423 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head 2424 vrshr.u16 q11, q8, #8 2425 vswp d3, d31 2426 vrshr.u16 q12, q9, #8 2427 vrshr.u16 q13, q10, #8 2428 fetch_src_pixblock 2429 vraddhn.u16 d30, q11, q8 2430 PF add, PF_X, PF_X, #8 2431 PF tst, PF_CTL, #0xF 2432 PF addne, PF_X, PF_X, #8 2433 PF subne, PF_CTL, PF_CTL, #1 2434 vraddhn.u16 d29, q12, q9 2435 vraddhn.u16 d28, q13, q10 2436 vmull.u8 q8, d3, d0 2437 vmull.u8 q9, d3, d1 2438 vmull.u8 q10, d3, d2 2439 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2440 PF cmp, PF_X, ORIG_W 2441 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 2442 PF subge, PF_X, PF_X, ORIG_W 2443 PF subsge, PF_CTL, PF_CTL, #0x10 2444 PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
2445 .endm 2446 2447 generate_composite_function \ 2448 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ 2449 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2450 8, /* number of pixels, processed in a single block */ \ 2451 10, /* prefetch distance */ \ 2452 default_init, \ 2453 default_cleanup, \ 2454 pixman_composite_src_pixbuf_8888_process_pixblock_head, \ 2455 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ 2456 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ 2457 28, /* dst_w_basereg */ \ 2458 0, /* dst_r_basereg */ \ 2459 0, /* src_basereg */ \ 2460 0 /* mask_basereg */ 2461 2462 /******************************************************************************/ 2463 2464 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head 2465 vmull.u8 q8, d3, d0 2466 vmull.u8 q9, d3, d1 2467 vmull.u8 q10, d3, d2 2468 .endm 2469 2470 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail 2471 vrshr.u16 q11, q8, #8 2472 vswp d3, d31 2473 vrshr.u16 q12, q9, #8 2474 vrshr.u16 q13, q10, #8 2475 vraddhn.u16 d28, q11, q8 2476 vraddhn.u16 d29, q12, q9 2477 vraddhn.u16 d30, q13, q10 2478 .endm 2479 2480 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head 2481 vrshr.u16 q11, q8, #8 2482 vswp d3, d31 2483 vrshr.u16 q12, q9, #8 2484 vrshr.u16 q13, q10, #8 2485 fetch_src_pixblock 2486 vraddhn.u16 d28, q11, q8 2487 PF add, PF_X, PF_X, #8 2488 PF tst, PF_CTL, #0xF 2489 PF addne, PF_X, PF_X, #8 2490 PF subne, PF_CTL, PF_CTL, #1 2491 vraddhn.u16 d29, q12, q9 2492 vraddhn.u16 d30, q13, q10 2493 vmull.u8 q8, d3, d0 2494 vmull.u8 q9, d3, d1 2495 vmull.u8 q10, d3, d2 2496 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2497 PF cmp, PF_X, ORIG_W 2498 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 2499 PF subge, PF_X, PF_X, ORIG_W 2500 PF subsge, PF_CTL, PF_CTL, #0x10 2501 PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
2502 .endm 2503 2504 generate_composite_function \ 2505 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ 2506 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2507 8, /* number of pixels, processed in a single block */ \ 2508 10, /* prefetch distance */ \ 2509 default_init, \ 2510 default_cleanup, \ 2511 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ 2512 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ 2513 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ 2514 28, /* dst_w_basereg */ \ 2515 0, /* dst_r_basereg */ \ 2516 0, /* src_basereg */ \ 2517 0 /* mask_basereg */ 2518 2519 /******************************************************************************/ 2520 2521 .macro pixman_composite_over_0565_8_0565_process_pixblock_head 2522 /* mask is in d15 */ 2523 convert_0565_to_x888 q4, d2, d1, d0 2524 convert_0565_to_x888 q5, d6, d5, d4 2525 /* source pixel data is in {d0, d1, d2, XX} */ 2526 /* destination pixel data is in {d4, d5, d6, XX} */ 2527 vmvn.8 d7, d15 2528 vmull.u8 q6, d15, d2 2529 vmull.u8 q5, d15, d1 2530 vmull.u8 q4, d15, d0 2531 vmull.u8 q8, d7, d4 2532 vmull.u8 q9, d7, d5 2533 vmull.u8 q13, d7, d6 2534 vrshr.u16 q12, q6, #8 2535 vrshr.u16 q11, q5, #8 2536 vrshr.u16 q10, q4, #8 2537 vraddhn.u16 d2, q6, q12 2538 vraddhn.u16 d1, q5, q11 2539 vraddhn.u16 d0, q4, q10 2540 .endm 2541 2542 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail 2543 vrshr.u16 q14, q8, #8 2544 vrshr.u16 q15, q9, #8 2545 vrshr.u16 q12, q13, #8 2546 vraddhn.u16 d28, q14, q8 2547 vraddhn.u16 d29, q15, q9 2548 vraddhn.u16 d30, q12, q13 2549 vqadd.u8 q0, q0, q14 2550 vqadd.u8 q1, q1, q15 2551 /* 32bpp result is in {d0, d1, d2, XX} */ 2552 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2553 .endm 2554 2555 /* TODO: expand macros and do better instructions scheduling */ 2556 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head 2557 fetch_mask_pixblock 2558 pixman_composite_over_0565_8_0565_process_pixblock_tail 2559 
fetch_src_pixblock 2560 vld1.16 {d10, d11}, [DST_R, :128]! 2561 cache_preload 8, 8 2562 pixman_composite_over_0565_8_0565_process_pixblock_head 2563 vst1.16 {d28, d29}, [DST_W, :128]! 2564 .endm 2565 2566 generate_composite_function \ 2567 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ 2568 FLAG_DST_READWRITE, \ 2569 8, /* number of pixels, processed in a single block */ \ 2570 5, /* prefetch distance */ \ 2571 default_init_need_all_regs, \ 2572 default_cleanup_need_all_regs, \ 2573 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2574 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2575 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2576 28, /* dst_w_basereg */ \ 2577 10, /* dst_r_basereg */ \ 2578 8, /* src_basereg */ \ 2579 15 /* mask_basereg */ 2580 2581 /******************************************************************************/ 2582 2583 .macro pixman_composite_over_0565_n_0565_init 2584 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2585 vpush {d8-d15} 2586 vld1.32 {d15[0]}, [DUMMY] 2587 vdup.8 d15, d15[3] 2588 .endm 2589 2590 .macro pixman_composite_over_0565_n_0565_cleanup 2591 vpop {d8-d15} 2592 .endm 2593 2594 generate_composite_function \ 2595 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ 2596 FLAG_DST_READWRITE, \ 2597 8, /* number of pixels, processed in a single block */ \ 2598 5, /* prefetch distance */ \ 2599 pixman_composite_over_0565_n_0565_init, \ 2600 pixman_composite_over_0565_n_0565_cleanup, \ 2601 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2602 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2603 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2604 28, /* dst_w_basereg */ \ 2605 10, /* dst_r_basereg */ \ 2606 8, /* src_basereg */ \ 2607 15 /* mask_basereg */ 2608 2609 /******************************************************************************/ 2610 2611 .macro pixman_composite_add_0565_8_0565_process_pixblock_head 2612 /* mask is in d15 */ 
2613 convert_0565_to_x888 q4, d2, d1, d0 2614 convert_0565_to_x888 q5, d6, d5, d4 2615 /* source pixel data is in {d0, d1, d2, XX} */ 2616 /* destination pixel data is in {d4, d5, d6, XX} */ 2617 vmull.u8 q6, d15, d2 2618 vmull.u8 q5, d15, d1 2619 vmull.u8 q4, d15, d0 2620 vrshr.u16 q12, q6, #8 2621 vrshr.u16 q11, q5, #8 2622 vrshr.u16 q10, q4, #8 2623 vraddhn.u16 d2, q6, q12 2624 vraddhn.u16 d1, q5, q11 2625 vraddhn.u16 d0, q4, q10 2626 .endm 2627 2628 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail 2629 vqadd.u8 q0, q0, q2 2630 vqadd.u8 q1, q1, q3 2631 /* 32bpp result is in {d0, d1, d2, XX} */ 2632 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2633 .endm 2634 2635 /* TODO: expand macros and do better instructions scheduling */ 2636 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head 2637 fetch_mask_pixblock 2638 pixman_composite_add_0565_8_0565_process_pixblock_tail 2639 fetch_src_pixblock 2640 vld1.16 {d10, d11}, [DST_R, :128]! 2641 cache_preload 8, 8 2642 pixman_composite_add_0565_8_0565_process_pixblock_head 2643 vst1.16 {d28, d29}, [DST_W, :128]! 
2644 .endm 2645 2646 generate_composite_function \ 2647 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ 2648 FLAG_DST_READWRITE, \ 2649 8, /* number of pixels, processed in a single block */ \ 2650 5, /* prefetch distance */ \ 2651 default_init_need_all_regs, \ 2652 default_cleanup_need_all_regs, \ 2653 pixman_composite_add_0565_8_0565_process_pixblock_head, \ 2654 pixman_composite_add_0565_8_0565_process_pixblock_tail, \ 2655 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ 2656 28, /* dst_w_basereg */ \ 2657 10, /* dst_r_basereg */ \ 2658 8, /* src_basereg */ \ 2659 15 /* mask_basereg */ 2660 2661 /******************************************************************************/ 2662 2663 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head 2664 /* mask is in d15 */ 2665 convert_0565_to_x888 q5, d6, d5, d4 2666 /* destination pixel data is in {d4, d5, d6, xx} */ 2667 vmvn.8 d24, d15 /* get inverted alpha */ 2668 /* now do alpha blending */ 2669 vmull.u8 q8, d24, d4 2670 vmull.u8 q9, d24, d5 2671 vmull.u8 q10, d24, d6 2672 .endm 2673 2674 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail 2675 vrshr.u16 q14, q8, #8 2676 vrshr.u16 q15, q9, #8 2677 vrshr.u16 q12, q10, #8 2678 vraddhn.u16 d0, q14, q8 2679 vraddhn.u16 d1, q15, q9 2680 vraddhn.u16 d2, q12, q10 2681 /* 32bpp result is in {d0, d1, d2, XX} */ 2682 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2683 .endm 2684 2685 /* TODO: expand macros and do better instructions scheduling */ 2686 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head 2687 fetch_src_pixblock 2688 pixman_composite_out_reverse_8_0565_process_pixblock_tail 2689 vld1.16 {d10, d11}, [DST_R, :128]! 2690 cache_preload 8, 8 2691 pixman_composite_out_reverse_8_0565_process_pixblock_head 2692 vst1.16 {d28, d29}, [DST_W, :128]! 
2693 .endm 2694 2695 generate_composite_function \ 2696 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ 2697 FLAG_DST_READWRITE, \ 2698 8, /* number of pixels, processed in a single block */ \ 2699 5, /* prefetch distance */ \ 2700 default_init_need_all_regs, \ 2701 default_cleanup_need_all_regs, \ 2702 pixman_composite_out_reverse_8_0565_process_pixblock_head, \ 2703 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ 2704 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ 2705 28, /* dst_w_basereg */ \ 2706 10, /* dst_r_basereg */ \ 2707 15, /* src_basereg */ \ 2708 0 /* mask_basereg */ 2709 2710 /******************************************************************************/ 2711 2712 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head 2713 /* src is in d0 */ 2714 /* destination pixel data is in {d4, d5, d6, d7} */ 2715 vmvn.8 d1, d0 /* get inverted alpha */ 2716 /* now do alpha blending */ 2717 vmull.u8 q8, d1, d4 2718 vmull.u8 q9, d1, d5 2719 vmull.u8 q10, d1, d6 2720 vmull.u8 q11, d1, d7 2721 .endm 2722 2723 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail 2724 vrshr.u16 q14, q8, #8 2725 vrshr.u16 q15, q9, #8 2726 vrshr.u16 q12, q10, #8 2727 vrshr.u16 q13, q11, #8 2728 vraddhn.u16 d28, q14, q8 2729 vraddhn.u16 d29, q15, q9 2730 vraddhn.u16 d30, q12, q10 2731 vraddhn.u16 d31, q13, q11 2732 /* 32bpp result is in {d28, d29, d30, d31} */ 2733 .endm 2734 2735 /* TODO: expand macros and do better instructions scheduling */ 2736 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head 2737 fetch_src_pixblock 2738 pixman_composite_out_reverse_8_8888_process_pixblock_tail 2739 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2740 cache_preload 8, 8 2741 pixman_composite_out_reverse_8_8888_process_pixblock_head 2742 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
.endm /* closes pixman_composite_out_reverse_8_8888_process_pixblock_tail_head */

generate_composite_function \
    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0  /* mask_basereg */

/******************************************************************************/

/* Nearest-neighbour scaled scanline variants. These reuse the pixblock
 * head/tail/tail_head macros of the corresponding unscaled composite
 * operations defined earlier in this file; only the pixel fetching differs.
 */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    24 /* mask_basereg */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    8, /* src_basereg */ \
    24 /* mask_basereg */

generate_composite_function_nearest_scanline \
    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_0565_8_0565_process_pixblock_head, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
    8, /* src_basereg */ \
    15 /* mask_basereg */
/******************************************************************************/

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 *
 * Register conventions visible in this code: X holds the 16.16 fixed-point
 * source x coordinate, UX its per-pixel increment, TOP the top scanline
 * pointer and STRIDE the byte offset from the top to the bottom scanline;
 * d28/d29 hold the top/bottom vertical weights.
 */

/* Load the vertically adjacent pair of a8r8g8b8 pixels for the current X,
 * then advance X by UX. \tmp is unused for this format.
 */
.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16          /* integer part of X */
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2   /* 4 bytes per 8888 pixel */
    vld1.32   {\reg1}, [TMP1], STRIDE   /* top pixel pair */
    vld1.32   {\reg2}, [TMP1]           /* bottom pixel pair */
.endm

/* Same as above for r5g6b5 source; the two pixels are loaded into one
 * register and expanded to x888 via the conversion helper.
 */
.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1   /* 2 bytes per 0565 pixel */
    vld1.32   {\reg2[0]}, [TMP1], STRIDE
    vld1.32   {\reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm

/* Fetch two horizontal positions (top+bottom each) and vertically
 * interpolate with weights d28 (top) and d29 (bottom), accumulating
 * 16-bit results into \acc1 / \acc2.
 */
.macro bilinear_load_and_vertical_interpolate_two_8888 \
                acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 \reg1, \reg2, \tmp1
    vmull.u8  \acc1, \reg1, d28
    vmlal.u8  \acc1, \reg2, d29
    bilinear_load_8888 \reg3, \reg4, \tmp2
    vmull.u8  \acc2, \reg3, d28
    vmlal.u8  \acc2, \reg4, d29
.endm

/* Four-position variant for 8888: simply two back-to-back invocations of
 * the two-position macro (no extra scheduling needed for this format).
 */
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm

/* Two-position variant for 0565 source: loads two top/bottom pixel pairs,
 * converts them to planar x888 and transposes with vzip so that
 * \reg1..\reg4 end up holding the per-pixel channel layout expected by the
 * vertical weighting below.
 */
.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {\acc2lo[1]}, [TMP1]
    vld1.32   {\acc2hi[1]}, [TMP2]
    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
    vzip.u8   \reg1, \reg3
    vzip.u8   \reg2, \reg4
    vzip.u8   \reg3, \reg4
    vzip.u8   \reg1, \reg2
    vmull.u8  \acc1, \reg1, d28         /* top * WT */
    vmlal.u8  \acc1, \reg2, d29         /* + bottom * WB */
    vmull.u8  \acc2, \reg3, d28
    vmlal.u8  \acc2, \reg4, d29
.endm

/* Four-position variant for 0565. Unlike the 8888 case this is written out
 * long-hand so that the loads of the second (y) pair can be interleaved
 * with the conversion/zip work of the first (x) pair to hide latency.
 */
.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {\xacc2lo[1]}, [TMP1]
    vld1.32   {\xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   \xreg1, \xreg3            /* interleave x transpose with y loads */
    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   \xreg2, \xreg4
    vld1.32   {\yacc2lo[1]}, [TMP1]
    vzip.u8   \xreg3, \xreg4
    vld1.32   {\yacc2hi[1]}, [TMP2]
    vzip.u8   \xreg1, \xreg2
    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
    vmull.u8  \xacc1, \xreg1, d28
    vzip.u8   \yreg1, \yreg3
    vmlal.u8  \xacc1, \xreg2, d29
    vzip.u8   \yreg2, \yreg4
    vmull.u8  \xacc2, \xreg3, d28
    vzip.u8   \yreg3, \yreg4
    vmlal.u8  \xacc2, \xreg4, d29
    vzip.u8   \yreg1, \yreg2
    vmull.u8  \yacc1, \yreg1, d28
    vmlal.u8  \yacc1, \yreg2, d29
    vmull.u8  \yacc2, \yreg3, d28
    vmlal.u8  \yacc2, \yreg4, d29
.endm

/* Store \numpix interpolated a8r8g8b8 pixels from d0/d1 to OUT
 * (continues below).
 */
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if \numpix == 4
    vst1.32   {d0, d1}, [OUT, :128]!
.elseif \numpix == 2
    vst1.32   {d0}, [OUT, :64]!
.elseif \numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error bilinear_store_8888 \numpix is unsupported
.endif
.endm

/* Store \numpix pixels to an r5g6b5 OUT buffer: de-interleave the channels
 * from q0/q1, pack to 0565 with the conversion helper and store d2.
 */
.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8 d0, d1
    vuzp.u8 d2, d3
    vuzp.u8 d1, d3
    vuzp.u8 d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
.if \numpix == 4
    vst1.16   {d2}, [OUT, :64]!
.elseif \numpix == 2
    vst1.32   {d2[0]}, [OUT, :32]!
.elseif \numpix == 1
    vst1.16   {d2[0]}, [OUT, :16]!
.else
    .error bilinear_store_0565 \numpix is unsupported
.endif
.endm

/* Bilinearly interpolate a single pixel: vertical weighting with d28/d29,
 * then horizontal weighting with d30 (the "bubble" comments note the
 * pipeline stalls inherent to this unpipelined single-pixel path).
 */
.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
    bilinear_load_\()\src_fmt d0, d1, d2
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_store_\()\dst_fmt 1, q2, q3
.endm

/* Bilinearly interpolate two pixels; also advances the horizontal weight
 * accumulator q12 by q13 and refreshes q15 for the next iteration.
 */
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_store_\()\dst_fmt 2, q2, q3
.endm

/* Generic four-pixel bilinear interpolation (used when no format-specific
 * optimized version exists, see the dispatch macros below).
 */
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9, d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    pld       [TMP2, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_store_\()\dst_fmt 4, q2, q3
.endm

/* Dispatch macros: if a hand-scheduled software-pipelined implementation
 * for this src/dst format pair exists (signalled by a
 * have_bilinear_interpolate_*_pixels_<src>_<dst> symbol), use its
 * head/tail/tail_head parts; otherwise fall back to the generic macro.
 */
.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
.endif
.endm

/* Eight-pixel dispatch: falls back to two four-pixel blocks when no
 * dedicated eight-pixel implementation exists for this format pair.
 */
.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
.else
    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
.else
    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
    bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
.else
    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
.endif
.endm

/* Flags controlling code generation in generate_bilinear_scanline_func */
.set BILINEAR_FLAG_UNROLL_4,          0
.set BILINEAR_FLAG_UNROLL_8,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Main template macro for generating NEON optimized bilinear scanline
 * functions.
 *
 * Bilinear scanline scaler macro template uses the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes
 *  prefetch_distance - prefetch in the source image by that many
 *                      pixels ahead
 */

.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                       src_bpp_shift, dst_bpp_shift, \
                                       prefetch_distance, flags

pixman_asm_function \fname
    /* Register roles for the generated function (r2 is reused: BOTTOM is
     * only needed to compute STRIDE, after which TMP1/TMP2 alias r3/r4).
     */
    OUT       .req    r0
    TOP       .req    r1
    BOTTOM    .req    r2
    WT        .req    r3
    WB        .req    r4
    X         .req    r5
    UX        .req    r6
    WIDTH     .req    ip
    TMP1      .req    r3
    TMP2      .req    r4
    PF_OFFS   .req    r7
    TMP3      .req    r8
    TMP4      .req    r9
    STRIDE    .req    r2

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #\prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}    /* fetch the stack-passed arguments */
    mul       PF_OFFS, PF_OFFS, UX      /* prefetch offset in 16.16 units */

.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}                  /* callee-saved NEON registers */
.endif

    sub       STRIDE, BOTTOM, TOP       /* byte offset top -> bottom line */
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f                        /* nothing to do for width <= 0 */

    /* q12 = X accumulator, q13 = UX step, d28/d29 = vertical weights */
    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26             /* advance the upper lanes by UX */

    /* ensure good destination alignment  */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << \dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13             /* step two pixels at a time now */
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (\dst_bpp_shift + 1))
    beq       0f
    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
    sub       WIDTH, WIDTH, #2
0:
.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (\dst_bpp_shift + 2))
    beq       0f
    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
    sub       WIDTH, WIDTH, #4
0:
    subs      WIDTH, WIDTH, #8
    blt       1f
    /* note: bare src_bpp_shift works here because .altmacro is in effect */
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
    subs      WIDTH, WIDTH, #8
    blt       5f
0:
    bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
    subs      WIDTH, WIDTH, #8
    bge       0b
5:
    bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1:
    tst       WIDTH, #4
    beq       2f
    bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
    subs      WIDTH, WIDTH, #4
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
    subs      WIDTH, WIDTH, #4
    blt       5f
0:
    bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
    subs      WIDTH, WIDTH, #4
    bge       0b
5:
    bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1:
/****************************************************/
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
3:
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif
    pop       {r4, r5, r6, r7, r8, r9}
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
pixman_end_asm_function

.endm

/*****************************************************************************/

/* Hand-scheduled software-pipelined four-pixel path for 8888 -> 8888,
 * selected by the dispatch macros above via this symbol.
 */
.set have_bilinear_interpolate_four_pixels_8888_8888, 1

.macro bilinear_interpolate_four_pixels_8888_8888_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28              /* vertical weighting, pixel 0 */
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28              /* vertical weighting, pixel 1 */
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28             /* vertical weighting, pixel 2 */
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30              /* horizontal weighting, pixel 0 */
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28             /* vertical weighting, pixel 3 */
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_four_pixels_8888_8888_tail
    /* finish horizontal weighting, narrow and store four pixels */
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm /* closes bilinear_interpolate_four_pixels_8888_8888_tail */

/* Pipelined tail+head for the 8888 -> 8888 four-pixel path: the tail of the
 * previous iteration is interleaved with the loads and vertical weighting
 * of the next one to hide NEON latency.
 */
.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d6, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d7, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vst1.32   {d6, d7}, [OUT, :128]!   /* store previous iteration's result */
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

/*****************************************************************************/

/* Hand-scheduled software-pipelined eight-pixel path for 8888 -> 0565.
 * Interpolated pixels are collected in d8-d11 (q4/q5) before being packed
 * to 0565, which is why the generated function must save d8-d15
 * (BILINEAR_FLAG_USE_ALL_NEON_REGS is set at the invocation below).
 */
.set have_bilinear_interpolate_eight_pixels_8888_0565, 1

.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d20}, [TMP1], STRIDE
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    /* second group of four pixels */
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0                    /* first four pixels -> d8/d9 */
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0                   /* second four pixels -> d10/d11 */
    vmovn.u16 d11, q2
    vadd.u16  q12, q12, q13

    /* de-interleave d8-d11 into planar channels and pack to r5g6b5 */
    vuzp.u8   d8, d9
    vuzp.u8   d10, d11
    vuzp.u8   d9, d11
    vuzp.u8   d8, d10
    vshll.u8  q6, d9, #8                /* green */
    vshll.u8  q5, d10, #8               /* blue */
    vshll.u8  q7, d8, #8                /* red */
    vsri.u16  q5, q6, #5
    vsri.u16  q5, q7, #11
    vst1.32   {d10, d11}, [OUT, :128]!
.endm

/* Pipelined tail+head for the 8888 -> 0565 eight-pixel path: the 0565
 * packing and store of the previous eight pixels are interleaved with the
 * loads/weighting of the next eight.
 */
.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vuzp.u8   d8, d9
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.u8   d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    /* second group of four pixels, interleaved with 0565 packing */
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vuzp.u8   d9, d11
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vuzp.u8   d8, d10
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q6, #5
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16  q5, q7, #11
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vst1.32   {d10, d11}, [OUT, :128]!  /* store previous eight pixels */
    vmlsl.u16 q1, d18, d31
.endm
/*****************************************************************************/

/* Instantiate the bilinear scanline scalers for each format combination. */

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4