/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 */

/*
 * This file contains scaled bilinear scanline functions implemented
 * using Siarhei's older bilinear macro template.
 *
 * << General scanline function procedures >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code block.
 * Assume that the symbols (register .req) OUT and MASK are defined by the
 * caller of these macro blocks.
 *
 * Remarks
 *  There can be lots of pipeline stalls inside a code block and between code
 *  blocks. Further optimizations will be done by new macro templates using
 *  the head/tail_head/tail scheme.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.arch armv7a
.object_arch armv4
.fpu neon
.eabi_attribute 10, 0
.eabi_attribute 12, 0
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

pixman_syntax_unified

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */
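/*
 * The load macros below use 16.16 fixed-point arithmetic: the integer part
 * of X selects a horizontally adjacent pixel pair, and the fractional part
 * later becomes the horizontal interpolation weight. A rough C sketch of
 * bilinear_load_8888 (variable names are illustrative only):
 *
 *     uint32_t *p = top + (x >> 16);   // TMP1 = TOP + (X >> 16) * 4
 *     reg1 = *(uint64_t *)p;           // two adjacent pixels, top row
 *     p = (uint32_t *)((char *)p + stride);
 *     reg2 = *(uint64_t *)p;           // the same two pixels, bottom row
 *     x += ux;                         // advance to the next sample position
 *
 * where stride = BOTTOM - TOP in bytes, as set up by the main template.
 */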
.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {\reg1}, [TMP1], STRIDE
    vld1.32   {\reg2}, [TMP1]
.endm

.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {\reg2[0]}, [TMP1], STRIDE
    vld1.32   {\reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm

.macro bilinear_load_and_vertical_interpolate_two_8888 \
                acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 \reg1, \reg2, \tmp1
    vmull.u8  \acc1, \reg1, d28
    vmlal.u8  \acc1, \reg2, d29
    bilinear_load_8888 \reg3, \reg4, \tmp2
    vmull.u8  \acc2, \reg3, d28
    vmlal.u8  \acc2, \reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {\acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {\acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {\acc2lo[1]}, [TMP1]
    vld1.32   {\acc2hi[1]}, [TMP2]
    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
    vzip.u8   \reg1, \reg3
    vzip.u8   \reg2, \reg4
    vzip.u8   \reg3, \reg4
    vzip.u8   \reg1, \reg2
    vmull.u8  \acc1, \reg1, d28
    vmlal.u8  \acc1, \reg2, d29
    vmull.u8  \acc2, \reg3, d28
    vmlal.u8  \acc2, \reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {\xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {\xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {\xacc2lo[1]}, [TMP1]
    vld1.32   {\xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {\yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   \xreg1, \xreg3
    vld1.32   {\yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   \xreg2, \xreg4
    vld1.32   {\yacc2lo[1]}, [TMP1]
    vzip.u8   \xreg3, \xreg4
    vld1.32   {\yacc2hi[1]}, [TMP2]
    vzip.u8   \xreg1, \xreg2
    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
    vmull.u8  \xacc1, \xreg1, d28
    vzip.u8   \yreg1, \yreg3
    vmlal.u8  \xacc1, \xreg2, d29
    vzip.u8   \yreg2, \yreg4
    vmull.u8  \xacc2, \xreg3, d28
    vzip.u8   \yreg3, \yreg4
    vmlal.u8  \xacc2, \xreg4, d29
    vzip.u8   \yreg1, \yreg2
    vmull.u8  \yacc1, \yreg1, d28
    vmlal.u8  \yacc1, \yreg2, d29
    vmull.u8  \yacc2, \yreg3, d28
    vmlal.u8  \yacc2, \yreg4, d29
.endm
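/*
 * In the load_and_vertical_interpolate macros above, d28 and d29 hold the
 * replicated top and bottom row weights (WT and WB, set up by the main
 * template), so after each vmull.u8/vmlal.u8 pair an accumulator holds the
 * vertical blend of one adjacent pixel pair:
 *
 *     acc = top_pixels * wt + bottom_pixels * wb
 *
 * The horizontal pass is applied later by the bilinear_interpolate_*
 * macros further below.
 */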
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if \numpix == 4
    vst1.32   {d0, d1}, [OUT]!
.elseif \numpix == 2
    vst1.32   {d0}, [OUT]!
.elseif \numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error bilinear_store_8888 numpix is unsupported
.endif
.endm

.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8   d0, d1
    vuzp.u8   d2, d3
    vuzp.u8   d1, d3
    vuzp.u8   d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
.if \numpix == 4
    vst1.16   {d2}, [OUT]!
.elseif \numpix == 2
    vst1.32   {d2[0]}, [OUT]!
.elseif \numpix == 1
    vst1.16   {d2[0]}, [OUT]!
.else
    .error bilinear_store_0565 numpix is unsupported
.endif
.endm


/*
 * Macros for loading mask pixels into register 'mask'.
 * The vdup must be done somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask
.endm

.macro bilinear_load_mask_8 numpix, mask
.if \numpix == 4
    vld1.32   {\mask[0]}, [MASK]!
.elseif \numpix == 2
    vld1.16   {\mask[0]}, [MASK]!
.elseif \numpix == 1
    vld1.8    {\mask[0]}, [MASK]!
.else
    .error bilinear_load_mask_8 \numpix is unsupported
.endif
    pld       [MASK, #prefetch_offset]
.endm

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_\()\mask_fmt \numpix, \mask
.endm


/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleave should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.if \numpix == 4
    vld1.32   {\dst0, \dst1}, [OUT]
.elseif \numpix == 2
    vld1.32   {\dst0}, [OUT]
.elseif \numpix == 1
    vld1.32   {\dst0[0]}, [OUT]
.else
    .error bilinear_load_dst_8888 \numpix is unsupported
.endif
    pld       [OUT, #(prefetch_offset * 4)]
.endm

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm

/*
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So we need to duplicate the loaded mask across the whole register.
 *
 * For the two pixel case
 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * Some optimizations are possible here, including for the last pixel case.
 */
.macro bilinear_duplicate_mask_x numpix, mask
.endm

.macro bilinear_duplicate_mask_8 numpix, mask
.if \numpix == 4
    vdup.32   \mask, \mask[0]
.elseif \numpix == 2
    vdup.16   \mask, \mask[0]
.elseif \numpix == 1
    vdup.8    \mask, \mask[0]
.else
    .error bilinear_duplicate_mask_8 is unsupported
.endif
.endm

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
.endm
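/*
 * Note on dispatch: macros such as bilinear_load_mask and
 * bilinear_duplicate_mask use \() to splice their format argument into a
 * macro name, so e.g.
 *
 *     bilinear_duplicate_mask 8, 2, d4
 *
 * expands to 'bilinear_duplicate_mask_8 2, d4', while the 'x' (no mask)
 * variants resolve to empty macros and generate no code at all.
 */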
/*
 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
 * Interleaving should be done when a mask is enabled or the operator is
 * 'over'.
 */
.macro bilinear_interleave src0, src1, dst0, dst1
    vuzp.8    \src0, \src1
    vuzp.8    \dst0, \dst1
    vuzp.8    \src0, \src1
    vuzp.8    \dst0, \dst1
.endm

.macro bilinear_interleave_src_dst_x_src \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_x_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \dst0, \dst1
.endm

.macro bilinear_interleave_src_dst_x_add \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_8_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \dst0, \dst1
.endm

.macro bilinear_interleave_src_dst_8_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \dst0, \dst1
.endm

.macro bilinear_interleave_src_dst_8_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \dst0, \dst1
.endm

.macro bilinear_interleave_src_dst \
                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm


/*
 * Macros for applying masks to src pixels (see the combine_mask_u() function).
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
.macro bilinear_apply_mask_to_src_x \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm

.macro bilinear_apply_mask_to_src_8 \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    vmull.u8  \tmp01, \src0, \mask
    vmull.u8  \tmp23, \src1, \mask
    /* bubbles */
    vrshr.u16 \tmp45, \tmp01, #8
    vrshr.u16 \tmp67, \tmp23, #8
    /* bubbles */
    vraddhn.u16 \src0, \tmp45, \tmp01
    vraddhn.u16 \src1, \tmp67, \tmp23
.endm

.macro bilinear_apply_mask_to_src \
                mask_fmt, numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_\()\mask_fmt \
                \numpix, \src0, \src1, \src01, \mask, \
                \tmp01, \tmp23, \tmp45, \tmp67
.endm
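/*
 * The vmull/vrshr/vraddhn sequence above is the usual NEON idiom for a
 * rounding division of the 16-bit product by 255:
 *
 *     t    = src * mask              ; vmull.u8
 *     tmp  = (t + 128) >> 8          ; vrshr.u16
 *     src' = (t + tmp + 128) >> 8    ; vraddhn.u16
 *
 * which matches t / 255 rounded to nearest for 8-bit inputs, mirroring what
 * combine_mask_u() does in the C code.
 */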
/*
 * Macros for combining src and destination pixels.
 * Whether to interleave depends on the operator 'op'.
 */
.macro bilinear_combine_src \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

.macro bilinear_combine_over \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vdup.32   \tmp8, \src1[1]
    /* bubbles */
    vmvn.8    \tmp8, \tmp8
    /* bubbles */
    vmull.u8  \tmp01, \dst0, \tmp8
    /* bubbles */
    vmull.u8  \tmp23, \dst1, \tmp8
    /* bubbles */
    vrshr.u16 \tmp45, \tmp01, #8
    vrshr.u16 \tmp67, \tmp23, #8
    /* bubbles */
    vraddhn.u16 \dst0, \tmp45, \tmp01
    vraddhn.u16 \dst1, \tmp67, \tmp23
    /* bubbles */
    vqadd.u8  \src01, \dst01, \src01
.endm

.macro bilinear_combine_add \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vqadd.u8  \src01, \dst01, \src01
.endm

.macro bilinear_combine \
                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_\()\op \
                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
                \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
.endm
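/*
 * bilinear_combine_over above implements the OVER operator on premultiplied
 * pixels:
 *
 *     dst' = src + dst * (255 - alpha(src)) / 255
 *
 * At this point the pixels are in interleaved rrrr gggg bbbb aaaa form, so
 * 'vdup.32 \tmp8, \src1[1]' broadcasts the four source alpha bytes, vmvn.8
 * inverts them to 255 - alpha, the vmull/vrshr/vraddhn sequence performs
 * the same rounding division by 255 as in the mask application above, and
 * the final vqadd.u8 adds the source with saturation.
 */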
/*
 * Macros for final deinterleaving of destination pixels if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
    vuzp.8    \dst0, \dst1
    /* bubbles */
    vuzp.8    \dst0, \dst1
.endm

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm


.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_\()\src_fmt d0, d1, d2
    bilinear_load_mask \mask_fmt, 1, d4
    bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    bilinear_duplicate_mask \mask_fmt, 1, d4
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_interleave_src_dst \
                \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                \mask_fmt, 1, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                \op, 1, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
    bilinear_store_\()\dst_fmt 1, q2, q3
.endm

.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask \mask_fmt, 2, d4
    bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask \mask_fmt, 2, d4
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_interleave_src_dst \
                \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                \mask_fmt, 2, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                \op, 2, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
    bilinear_store_\()\dst_fmt 2, q2, q3
.endm

.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9,  d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    bilinear_load_mask \mask_fmt, 4, d22
    bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
    pld       [TMP1, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask \mask_fmt, 4, d22
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
                \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
                \mask_fmt, 4, d0, d1, q0, d22, \
                q3, q8, q9, q10
    bilinear_combine \
                \op, 4, d0, d1, q0, d2, d3, q1, \
                q3, q8, q9, q10, d23
    bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
    bilinear_store_\()\dst_fmt 4, q2, q3
.endm
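/*
 * A note on the arithmetic in the interpolation macros above: d30/d31 hold
 * the horizontal weights wx = X >> (16 - BILINEAR_INTERPOLATION_BITS)
 * (computed into q15 by the callers), and each vshll/vmlsl/vmlal triple
 * evaluates, per pixel:
 *
 *     result = vleft * ((1 << BITS) - wx) + vright * wx
 *
 * where BITS = BILINEAR_INTERPOLATION_BITS and vleft/vright are the
 * vertically interpolated left and right source pixels. Assuming the usual
 * pixman setup, where the vertical weights also sum to (1 << BITS), the
 * single narrowing vshrn.u32 #(2 * BITS) then removes the combined vertical
 * and horizontal weight scaling.
 */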
.set BILINEAR_FLAG_USE_MASK,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Main template macro for generating NEON optimized bilinear scanline
 * functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src/dst_bpp_shift - (1 << bpp_shift) is the size of a src/dst pixel
 *                      in bytes
 *  process_last_pixel - code block that interpolates one pixel and does
 *                       not update the horizontal weight
 *  process_two_pixels - code block that interpolates two pixels and
 *                       updates the horizontal weight
 *  process_four_pixels - code block that interpolates four pixels and
 *                        updates the horizontal weight
 *  process_pixblock_head - head part of the middle loop
 *  process_pixblock_tail - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head part of the middle loop
 *  pixblock_size     - number of pixels processed in a single middle loop
 *  prefetch_distance - prefetch in the source image by that many pixels
 *                      ahead
 */
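/*
 * Judging by the register and stack assignments below, the generated
 * functions follow the usual pixman scanline prototypes (a sketch; see
 * pixman-arm-common.h for the authoritative declarations):
 *
 *     fname (out, top, bottom, wt, wb, x, ux, width)        <- no mask
 *     fname (out, mask, top, bottom, wt, wb, x, ux, width)  <- with mask
 *
 * with the first four arguments in r0-r3 and the remainder fetched from
 * the stack via 'mov ip, sp' + ldmia.
 */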
.macro generate_bilinear_scanline_func \
        fname, \
        src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
        bilinear_process_last_pixel, \
        bilinear_process_two_pixels, \
        bilinear_process_four_pixels, \
        bilinear_process_pixblock_head, \
        bilinear_process_pixblock_tail, \
        bilinear_process_pixblock_tail_head, \
        pixblock_size, \
        prefetch_distance, \
        flags

pixman_asm_function \fname
.if \pixblock_size == 8
.elseif \pixblock_size == 4
.else
    .error unsupported pixblock size
.endif

.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    OUT     .req    r0
    TOP     .req    r1
    BOTTOM  .req    r2
    WT      .req    r3
    WB      .req    r4
    X       .req    r5
    UX      .req    r6
    WIDTH   .req    ip
    TMP1    .req    r3
    TMP2    .req    r4
    PF_OFFS .req    r7
    TMP3    .req    r8
    TMP4    .req    r9
    STRIDE  .req    r2

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #\prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
.else
    OUT     .req    r0
    MASK    .req    r1
    TOP     .req    r2
    BOTTOM  .req    r3
    WT      .req    r4
    WB      .req    r5
    X       .req    r6
    UX      .req    r7
    WIDTH   .req    ip
    TMP1    .req    r4
    TMP2    .req    r5
    PF_OFFS .req    r8
    TMP3    .req    r9
    TMP4    .req    r10
    STRIDE  .req    r3

    .set prefetch_offset, \prefetch_distance

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    mov       PF_OFFS, #\prefetch_distance
    ldmia     ip, {WT, WB, X, UX, WIDTH}
.endif

    mul       PF_OFFS, PF_OFFS, UX

.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << \dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    \bilinear_process_last_pixel
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (\dst_bpp_shift + 1))
    beq       0f
    \bilinear_process_two_pixels
    sub       WIDTH, WIDTH, #2
0:
.if \pixblock_size == 8
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (\dst_bpp_shift + 2))
    beq       0f
    \bilinear_process_four_pixels
    sub       WIDTH, WIDTH, #4
0:
.endif
    subs      WIDTH, WIDTH, #\pixblock_size
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
    \bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #\pixblock_size
    blt       5f
0:
    \bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #\pixblock_size
    bge       0b
5:
    \bilinear_process_pixblock_tail
1:
.if \pixblock_size == 8
    tst       WIDTH, #4
    beq       2f
    \bilinear_process_four_pixels
2:
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    \bilinear_process_two_pixels
2:
    tst       WIDTH, #1
    beq       3f
    \bilinear_process_last_pixel
3:
.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif

.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop       {r4, r5, r6, r7, r8, r9}
.else
    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
.endif
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
    .unreq    MASK
.endif

pixman_end_asm_function

.endm
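/*
 * Note on the loop structure of generate_bilinear_scanline_func above: the
 * prologue processes up to 1 + 2 (+ 4 when pixblock_size == 8) leading
 * pixels, testing one destination address bit at a time, so that OUT is
 * suitably aligned for the :128-annotated stores used by the pipelined
 * pixblock code; the epilogue then mops up the remaining
 * 0..pixblock_size-1 pixels with the four/two/last pixel blocks. Roughly:
 *
 *     while (out is unaligned && width > 0) process 1/2/4 pixels;
 *     head;  while (width >= pixblock_size) tail_head;  tail;
 *     handle (width & 4), (width & 2), (width & 1) trailing pixels;
 */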
/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
.endm

/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head
.endm

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head
.endm

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.endm

/* over_8888_8888 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over
.endm
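/*
 * The src_* variants above reuse the generic four-pixel block for their
 * pixblock code, so their tail_head is simply tail followed by head. The
 * over_8888_8888 pixblock macros below provide genuinely pipelined code
 * instead: _head starts the loads and vertical interpolation for a block,
 * _tail finishes the horizontal pass and the OVER blend, and _tail_head
 * interleaves the tail of block N with the head of block N + 1 to hide
 * load and multiply latencies.
 */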
.macro bilinear_over_8888_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vdup.32   d4, d7[1]
    vmvn.8    d4, d4
    vmull.u8  q11, d2, d4
    vmull.u8  q2, d3, d4
    vrshr.u16 q1, q11, #8
    vrshr.u16 q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vqadd.u8  q3, q1, q3
    vuzp.8    d6, d7
    vuzp.8    d6, d7
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail_head
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vmlsl.u16 q2, d20, d30
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, PF_OFFS]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vmovn.u16 d7, q2
    vld1.32   {d22}, [TMP3], STRIDE
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vdup.32   d4, d7[1]
    vld1.32   {d23}, [TMP3]
    vmvn.8    d4, d4
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmull.u8  q11, d2, d4
    vmull.u8  q2, d3, d4
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vrshr.u16 q1, q11, #8
    vmlal.u16 q0, d17, d30
    vrshr.u16 q8, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q8, q2
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vqadd.u8  q3, q1, q3
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.8    d6, d7
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vuzp.8    d6, d7
    vmlsl.u16 q1, d18, d31
    vadd.u16  q12, q12, q13
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vld1.32   {d3}, [TMP2]
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    pld       [TMP4, PF_OFFS]
    vld1.32   {d4}, [TMP4], STRIDE
    vld1.32   {d5}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q3, d2, d28
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    pld       [MASK, #prefetch_offset]
    vadd.u16  q12, q12, q13
    vmovn.u16 d16, q0
.endm
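/*
 * In this masked OVER pipeline the head above also folds in the mask fetch:
 * a single vld1.32 {d22[0]}, [MASK]! pulls in four a8 mask bytes per
 * pixblock. The tail below then expands them with vdup.32, applies them to
 * the source with the usual rounding division by 255 (here via vrsra.u16
 * followed by vrshrn.u16), and performs the OVER blend against the
 * destination.
 */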
.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vdup.32   d22, d22[0]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d17, q9
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, PF_OFFS]
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vdup.32   d22, d17[1]
    vmvn.8    d22, d22
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vrshr.u16 q9, q10, #8
    vrshr.u16 q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vqadd.u8  q9, q8, q9
    vuzp.8    d18, d19
    vuzp.8    d18, d19
    vst1.32   {d18, d19}, [OUT, :128]!
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d3}, [TMP2]
    vdup.32   d22, d22[0]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmovn.u16 d17, q9
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vld1.32   {d2}, [TMP3], STRIDE
    vdup.32   d22, d17[1]
    vld1.32   {d3}, [TMP3]
    vmvn.8    d22, d22
    pld       [TMP4, PF_OFFS]
    vld1.32   {d4}, [TMP4], STRIDE
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vld1.32   {d5}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q3, d2, d28
    vrshr.u16 q9, q10, #8
    vrshr.u16 q15, q11, #8
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q15, q11
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vqadd.u8  q9, q8, q9
    vld1.32   {d22[0]}, [MASK]!
    vuzp.8    d18, d19
    vadd.u16  q12, q12, q13
    vuzp.8    d18, d19
    vmovn.u16 d16, q0
    vst1.32   {d18, d19}, [OUT, :128]!
.endm

/* add_8888_8888 */
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head
.endm

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
.endm


/* Bilinear scanline functions */
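/*
 * Each generate_bilinear_scanline_func invocation below emits one scanline
 * function; for example, the first one produces
 * pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, operating on
 * 4-pixel blocks with a prefetch distance of 28 pixels.
 */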
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    8888, 0565, 2, 1, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    0565, 8888, 1, 2, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    0565, 0565, 1, 1, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK