/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 */

/*
 * This file contains scaled bilinear scanline functions implemented
 * using Siarhei's older bilinear macro template.
 *
 * << General scanline function procedures >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code block.
 * Assume that the symbols (register .req) OUT and MASK are defined at the
 * caller of these macro blocks.
 *
 * Remarks
 * There can be lots of pipeline stalls inside a code block and between code
 * blocks. Further optimizations will be done by new macro templates using
 * the head/tail_head/tail scheme.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.arch armv8-a
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arma64-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */
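
/*
 * As a reading aid, here is a rough C sketch of the arithmetic that these
 * building blocks implement.  This is an illustration only, not code used
 * by pixman; the helper name bilinear_one_pixel is hypothetical.  X and UX
 * are 16.16 fixed-point; the vertical weights wt/wb as well as the
 * horizontal weight pair sum to (1 << BILINEAR_INTERPOLATION_BITS), which
 * is assumed to be 7 as in pixman-private.h:
 *
 *   #include <stdint.h>
 *
 *   #define BITS 7   // assumed value of BILINEAR_INTERPOLATION_BITS
 *
 *   static uint32_t
 *   bilinear_one_pixel (const uint32_t *top, const uint32_t *bottom,
 *                       int32_t x, uint32_t wt, uint32_t wb)
 *   {
 *       const uint8_t *t = (const uint8_t *)&top[x >> 16];
 *       const uint8_t *b = (const uint8_t *)&bottom[x >> 16];
 *       // horizontal weight: top BITS of the fractional part of x
 *       uint32_t d = ((uint32_t)x >> (16 - BITS)) & ((1 << BITS) - 1);
 *       uint32_t res = 0;
 *
 *       for (int c = 0; c < 4; c++)      // per a8r8g8b8 channel
 *       {
 *           // vertical pass: umull/umlal with v28 = wt, v29 = wb
 *           uint32_t left  = t[c] * wt + b[c] * wb;
 *           uint32_t right = t[c + 4] * wt + b[c + 4] * wb;
 *           // horizontal pass: ushll/umlsl/umlal2 with v15 = d, then shrn
 *           uint32_t v = (left << BITS) - left * d + right * d;
 *           res |= ((v >> (2 * BITS)) & 0xff) << (c * 8);
 *       }
 *       return res;
 *   }
 */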

.macro bilinear_load_8888 reg1, reg2, tmp
    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #2
    ld1       {\()\reg1\().2s}, [TMP1], STRIDE
    ld1       {\()\reg2\().2s}, [TMP1]
.endm

.macro bilinear_load_0565 reg1, reg2, tmp
    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #1
    ld1       {\()\reg2\().s}[0], [TMP1], STRIDE
    ld1       {\()\reg2\().s}[1], [TMP1]
    convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
.endm

.macro bilinear_load_and_vertical_interpolate_two_8888 \
                acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 \reg1, \reg2, \tmp1
    umull     \()\acc1\().8h, \()\reg1\().8b, v28.8b
    umlal     \()\acc1\().8h, \()\reg2\().8b, v29.8b
    bilinear_load_8888 \reg3, \reg4, \tmp2
    umull     \()\acc2\().8h, \()\reg3\().8b, v28.8b
    umlal     \()\acc2\().8h, \()\reg4\().8b, v29.8b
.endm

.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
.endm

.macro vzip reg1, reg2
    zip1      v24.8b, \reg1, \reg2
    zip2      \reg2, \reg1, \reg2
    mov       \reg1, v24.8b
.endm

.macro vuzp reg1, reg2
    uzp1      v24.8b, \reg1, \reg2
    uzp2      \reg2, \reg1, \reg2
    mov       \reg1, v24.8b
.endm

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #1
    asr       WTMP2, X, #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, lsl #1
    ld1       {\()\acc2\().s}[0], [TMP1], STRIDE
    ld1       {\()\acc2\().s}[2], [TMP2], STRIDE
    ld1       {\()\acc2\().s}[1], [TMP1]
    ld1       {\()\acc2\().s}[3], [TMP2]
    convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
    vzip      \()\reg1\().8b, \()\reg3\().8b
    vzip      \()\reg2\().8b, \()\reg4\().8b
    vzip      \()\reg3\().8b, \()\reg4\().8b
    vzip      \()\reg1\().8b, \()\reg2\().8b
    umull     \()\acc1\().8h, \()\reg1\().8b, v28.8b
    umlal     \()\acc1\().8h, \()\reg2\().8b, v29.8b
    umull     \()\acc2\().8h, \()\reg3\().8b, v28.8b
    umlal     \()\acc2\().8h, \()\reg4\().8b, v29.8b
.endm

.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #1
    asr       WTMP2, X, #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, lsl #1
    ld1       {\()\xacc2\().s}[0], [TMP1], STRIDE
    ld1       {\()\xacc2\().s}[2], [TMP2], STRIDE
    ld1       {\()\xacc2\().s}[1], [TMP1]
    ld1       {\()\xacc2\().s}[3], [TMP2]
    convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #1
    asr       WTMP2, X, #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, lsl #1
    ld1       {\()\yacc2\().s}[0], [TMP1], STRIDE
    vzip      \()\xreg1\().8b, \()\xreg3\().8b
    ld1       {\()\yacc2\().s}[2], [TMP2], STRIDE
    vzip      \()\xreg2\().8b, \()\xreg4\().8b
    ld1       {\()\yacc2\().s}[1], [TMP1]
    vzip      \()\xreg3\().8b, \()\xreg4\().8b
    ld1       {\()\yacc2\().s}[3], [TMP2]
    vzip      \()\xreg1\().8b, \()\xreg2\().8b
    convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
    umull     \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
    vzip      \()\yreg1\().8b, \()\yreg3\().8b
    umlal     \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
    vzip      \()\yreg2\().8b, \()\yreg4\().8b
    umull     \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
    vzip      \()\yreg3\().8b, \()\yreg4\().8b
    umlal     \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
    vzip      \()\yreg1\().8b, \()\yreg2\().8b
    umull     \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
    umlal     \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
    umull     \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
    umlal     \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
.endm

.macro bilinear_store_8888 numpix, tmp1, tmp2
.if \numpix == 4
    st1       {v0.2s, v1.2s}, [OUT], #16
.elseif \numpix == 2
    st1       {v0.2s}, [OUT], #8
.elseif \numpix == 1
    st1       {v0.s}[0], [OUT], #4
.else
    .error bilinear_store_8888 \numpix is unsupported
.endif
.endm

.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp      v0.8b, v1.8b
    vuzp      v2.8b, v3.8b
    vuzp      v1.8b, v3.8b
    vuzp      v0.8b, v2.8b
    convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
.if \numpix == 4
    st1       {v1.4h}, [OUT], #8
.elseif \numpix == 2
    st1       {v1.s}[0], [OUT], #4
.elseif \numpix == 1
    st1       {v1.h}[0], [OUT], #2
.else
    .error bilinear_store_0565 \numpix is unsupported
.endif
.endm


/*
 * Macros for loading mask pixels into register 'mask'.
 * The dup must be done somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask
.endm

.macro bilinear_load_mask_8 numpix, mask
.if \numpix == 4
    ld1       {\()\mask\().s}[0], [MASK], #4
.elseif \numpix == 2
    ld1       {\()\mask\().h}[0], [MASK], #2
.elseif \numpix == 1
    ld1       {\()\mask\().b}[0], [MASK], #1
.else
    .error bilinear_load_mask_8 \numpix is unsupported
.endif
    prfum     PREFETCH_MODE, [MASK, #(prefetch_offset)]
.endm

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_\mask_fmt \numpix, \mask
.endm


/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleave should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.if \numpix == 4
    ld1       {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
.elseif \numpix == 2
    ld1       {\()\dst0\().2s}, [OUT]
.elseif \numpix == 1
    ld1       {\()\dst0\().s}[0], [OUT]
.else
    .error bilinear_load_dst_8888 \numpix is unsupported
.endif
    mov       \()\dst01\().d[0], \()\dst0\().d[0]
    mov       \()\dst01\().d[1], \()\dst1\().d[0]
    prfm      PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
.endm

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm

/*
 * Macros for duplicating the partially loaded mask to fill a whole register.
 * We will apply the mask to interleaved source pixels, that is
 *	(r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *	(b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So, we need to duplicate the loaded mask across the whole register.
 *
 * For the two pixel case
 *	(r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *	(b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * We can do some optimizations for this, including the last pixel cases.
 */
.macro bilinear_duplicate_mask_x numpix, mask
.endm

.macro bilinear_duplicate_mask_8 numpix, mask
.if \numpix == 4
    dup       \()\mask\().2s, \()\mask\().s[0]
.elseif \numpix == 2
    dup       \()\mask\().4h, \()\mask\().h[0]
.elseif \numpix == 1
    dup       \()\mask\().8b, \()\mask\().b[0]
.else
    .error bilinear_duplicate_mask_8 \numpix is unsupported
.endif
.endm

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
.endm

/*
 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
 * Interleaving should be done when the mask is enabled or the operator is
 * 'over'.
 */
.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
    vuzp      \()\src0\().8b, \()\src1\().8b
    vuzp      \()\dst0\().8b, \()\dst1\().8b
    vuzp      \()\src0\().8b, \()\src1\().8b
    vuzp      \()\dst0\().8b, \()\dst1\().8b
    mov       \()\src01\().d[1], \()\src1\().d[0]
    mov       \()\src01\().d[0], \()\src0\().d[0]
    mov       \()\dst01\().d[1], \()\dst1\().d[0]
    mov       \()\dst01\().d[0], \()\dst0\().d[0]
.endm

.macro bilinear_interleave_src_dst_x_src \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_x_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm

.macro bilinear_interleave_src_dst_x_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm

.macro bilinear_interleave_src_dst_8_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm

.macro bilinear_interleave_src_dst_8_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm

.macro bilinear_interleave_src_dst_8_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm

.macro bilinear_interleave_src_dst \
                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
.endm


/*
 * Macros for applying masks to src pixels. (See the combine_mask_u() function.)
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
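
/*
 * The umull + urshr #8 + raddhn sequence used below is an exact rounding
 * division by 255, the same trick as pixman's C combine code (MUL_UN8 /
 * DIV_ONE_UN8).  A minimal one-channel C sketch (illustrative helper
 * name, not part of pixman):
 *
 *   #include <stdint.h>
 *
 *   static uint8_t
 *   mul_div_255 (uint8_t x, uint8_t m)
 *   {
 *       uint32_t t = (uint32_t)x * m;            // umull
 *       uint32_t u = (t + 128) >> 8;             // urshr #8
 *       return (uint8_t)((t + u + 128) >> 8);    // raddhn
 *   }
 */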
.macro bilinear_apply_mask_to_src_x \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm

.macro bilinear_apply_mask_to_src_8 \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    umull     \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
    umull     \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
    /* bubbles */
    urshr     \()\tmp45\().8h, \()\tmp01\().8h, #8
    urshr     \()\tmp67\().8h, \()\tmp23\().8h, #8
    /* bubbles */
    raddhn    \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
    raddhn    \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
    mov       \()\src01\().d[0], \()\src0\().d[0]
    mov       \()\src01\().d[1], \()\src1\().d[0]
.endm

.macro bilinear_apply_mask_to_src \
                mask_fmt, numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_\()\mask_fmt \
                \numpix, \src0, \src1, \src01, \mask, \
                \tmp01, \tmp23, \tmp45, \tmp67
.endm


/*
 * Macros for combining src and destination pixels.
 * Whether to interleave depends on the operator 'op'.
 */
.macro bilinear_combine_src \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

.macro bilinear_combine_over \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    dup       \()\tmp8\().2s, \()\src1\().s[1]
    /* bubbles */
    mvn       \()\tmp8\().8b, \()\tmp8\().8b
    /* bubbles */
    umull     \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
    /* bubbles */
    umull     \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
    /* bubbles */
    urshr     \()\tmp45\().8h, \()\tmp01\().8h, #8
    urshr     \()\tmp67\().8h, \()\tmp23\().8h, #8
    /* bubbles */
    raddhn    \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
    raddhn    \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
    mov       \()\dst01\().d[0], \()\dst0\().d[0]
    mov       \()\dst01\().d[1], \()\dst1\().d[0]
    /* bubbles */
    uqadd     \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
    uqadd     \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
    mov       \()\src01\().d[0], \()\src0\().d[0]
    mov       \()\src01\().d[1], \()\src1\().d[0]
.endm

.macro bilinear_combine_add \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    uqadd     \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
    uqadd     \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
    mov       \()\src01\().d[0], \()\src0\().d[0]
    mov       \()\src01\().d[1], \()\src1\().d[0]
.endm

.macro bilinear_combine \
                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_\()\op \
                \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
                \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
.endm
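
/*
 * For the 'over' operator the sequence above computes the usual
 * dst' = src + dst * (255 - src_alpha) / 255, using the same rounding
 * division as in the mask case plus a final saturating add (uqadd).
 * One-channel C sketch (illustration only, helper name hypothetical):
 *
 *   #include <stdint.h>
 *
 *   static uint8_t
 *   over_channel (uint8_t s, uint8_t d, uint8_t sa)
 *   {
 *       uint32_t t = (uint32_t)d * (255 - sa);    // mvn + umull
 *       t = (t + ((t + 128) >> 8) + 128) >> 8;    // urshr + raddhn
 *       t += s;                                   // uqadd, saturating
 *       return t > 255 ? 255 : (uint8_t)t;
 *   }
 */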

/*
 * Macros for final deinterleaving of destination pixels if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
    vuzp      \()\dst0\().8b, \()\dst1\().8b
    /* bubbles */
    vuzp      \()\dst0\().8b, \()\dst1\().8b
    mov       \()\dst01\().d[0], \()\dst0\().d[0]
    mov       \()\dst01\().d[1], \()\dst1\().d[0]
.endm

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
.endm

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
.endm


.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_\()\src_fmt v0, v1, v2
    bilinear_load_mask \mask_fmt, 1, v4
    bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
    umull     v2.8h, v0.8b, v28.8b
    umlal     v2.8h, v1.8b, v29.8b
    /* 5 cycles bubble */
    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v0.4s, v2.4h, v15.h[0]
    umlal2    v0.4s, v2.8h, v15.h[0]
    /* 5 cycles bubble */
    bilinear_duplicate_mask \mask_fmt, 1, v4
    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    xtn       v0.8b, v0.8h
    /* 1 cycle bubble */
    bilinear_interleave_src_dst \
                \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
    bilinear_apply_mask_to_src \
                \mask_fmt, 1, v0, v1, v0, v4, \
                v3, v8, v10, v11
    bilinear_combine \
                \op, 1, v0, v1, v0, v18, v19, v9, \
                v3, v8, v10, v11, v5
    bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
    bilinear_store_\()\dst_fmt 1, v17, v18
.endm

.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
                v1, v11, v18, v19, v20, v21, v22, v23
    bilinear_load_mask \mask_fmt, 2, v4
    bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v0.4s, v1.4h, v15.h[0]
    umlal2    v0.4s, v1.8h, v15.h[0]
    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v10.4s, v11.4h, v15.h[4]
    umlal2    v10.4s, v11.8h, v15.h[4]
    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask \mask_fmt, 2, v4
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add       v12.8h, v12.8h, v13.8h
    xtn       v0.8b, v0.8h
    bilinear_interleave_src_dst \
                \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
    bilinear_apply_mask_to_src \
                \mask_fmt, 2, v0, v1, v0, v4, \
                v3, v8, v10, v11
    bilinear_combine \
                \op, 2, v0, v1, v0, v18, v19, v9, \
                v3, v8, v10, v11, v5
    bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
    bilinear_store_\()\dst_fmt 2, v16, v17
.endm

.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
                v1, v11, v4, v5, v6, v7, v22, v23, \
                v3, v9, v16, v17, v20, v21, v18, v19
    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    prfm      PREFETCH_MODE, [TMP1, PF_OFFS]
    ushll     v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v0.4s, v1.4h, v15.h[0]
    umlal2    v0.4s, v1.8h, v15.h[0]
    ushll     v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v10.4s, v11.4h, v15.h[4]
    umlal2    v10.4s, v11.8h, v15.h[4]
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    ushll     v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v2.4s, v3.4h, v15.h[0]
    umlal2    v2.4s, v3.8h, v15.h[0]
    ushll     v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v8.4s, v9.4h, v15.h[4]
    umlal2    v8.4s, v9.8h, v15.h[4]
    add       v12.8h, v12.8h, v13.8h
    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2     v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2     v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_load_mask \mask_fmt, 4, v4
    bilinear_duplicate_mask \mask_fmt, 4, v4
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    xtn       v0.8b, v0.8h
    xtn       v1.8b, v2.8h
    add       v12.8h, v12.8h, v13.8h
    bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
    bilinear_interleave_src_dst \
                \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
    bilinear_apply_mask_to_src \
                \mask_fmt, 4, v0, v1, v0, v4, \
                v6, v8, v9, v10
    bilinear_combine \
                \op, 4, v0, v1, v0, v2, v3, v1, \
                v6, v8, v9, v10, v23
    bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
    bilinear_store_\()\dst_fmt 4, v6, v7
.endm

.set BILINEAR_FLAG_USE_MASK,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Main template macro for generating NEON optimized bilinear scanline
 * functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname                      - name of the function to generate
 *  src_fmt                    - source color format (8888 or 0565)
 *  dst_fmt                    - destination color format (8888 or 0565)
 *  src/dst_bpp_shift          - (1 << bpp_shift) is the size of a src/dst
 *                               pixel in bytes
 *  process_last_pixel         - code block that interpolates one pixel and
 *                               does not update the horizontal weight
 *  process_two_pixels         - code block that interpolates two pixels and
 *                               updates the horizontal weight
 *  process_four_pixels        - code block that interpolates four pixels and
 *                               updates the horizontal weight
 *  process_pixblock_head      - head part of the middle loop
 *  process_pixblock_tail      - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head of the middle loop
 *  pixblock_size              - number of pixels processed in a single
 *                               middle loop iteration
 *  prefetch_distance          - prefetch in the source image by that many
 *                               pixels ahead
 */
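
/*
 * The generated function has, roughly, the following shape (C-like
 * pseudocode sketch only; the process_* calls stand for the code-block
 * arguments described above, and pixel_size is 1 << dst_bpp_shift):
 *
 *   if (width <= 0)
 *       return;
 *   // peel 1/2(/4) pixels until OUT is aligned for the block stores
 *   if (width >= 1 && (out & pixel_size))     { process_last_pixel ();  width -= 1; }
 *   if (width >= 2 && (out & 2 * pixel_size)) { process_two_pixels ();  width -= 2; }
 *   if (pixblock_size == 8 &&
 *       width >= 4 && (out & 4 * pixel_size)) { process_four_pixels (); width -= 4; }
 *   // software-pipelined middle loop
 *   if (width >= pixblock_size)
 *   {
 *       process_pixblock_head ();
 *       width -= pixblock_size;
 *       while (width >= pixblock_size)
 *       {
 *           process_pixblock_tail_head ();
 *           width -= pixblock_size;
 *       }
 *       process_pixblock_tail ();
 *   }
 *   // trailing pixels
 *   if (pixblock_size == 8 && (width & 4)) process_four_pixels ();
 *   if (width & 2) process_two_pixels ();
 *   if (width & 1) process_last_pixel ();
 */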

.macro generate_bilinear_scanline_func \
        fname, \
        src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
        bilinear_process_last_pixel, \
        bilinear_process_two_pixels, \
        bilinear_process_four_pixels, \
        bilinear_process_pixblock_head, \
        bilinear_process_pixblock_tail, \
        bilinear_process_pixblock_tail_head, \
        pixblock_size, \
        prefetch_distance, \
        flags

pixman_asm_function \fname
.if \pixblock_size == 8
.elseif \pixblock_size == 4
.else
    .error unsupported pixblock size
.endif

.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    OUT     .req x0
    TOP     .req x1
    BOTTOM  .req x2
    WT      .req x3
    WWT     .req w3
    WB      .req x4
    WWB     .req w4
    X       .req w5
    UX      .req w6
    WIDTH   .req x7
    TMP1    .req x10
    WTMP1   .req w10
    TMP2    .req x11
    WTMP2   .req w11
    PF_OFFS .req x12
    TMP3    .req x13
    WTMP3   .req w13
    TMP4    .req x14
    WTMP4   .req w14
    STRIDE  .req x15
    DUMMY   .req x30

    stp       x29, x30, [sp, -16]!
    mov       x29, sp
    sub       sp, sp, 112
    sub       x29, x29, 64
    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    stp       x10, x11, [x29, -80]
    stp       x12, x13, [x29, -96]
    stp       x14, x15, [x29, -112]
.else
    OUT     .req x0
    MASK    .req x1
    TOP     .req x2
    BOTTOM  .req x3
    WT      .req x4
    WWT     .req w4
    WB      .req x5
    WWB     .req w5
    X       .req w6
    UX      .req w7
    WIDTH   .req x8
    TMP1    .req x10
    WTMP1   .req w10
    TMP2    .req x11
    WTMP2   .req w11
    PF_OFFS .req x12
    TMP3    .req x13
    WTMP3   .req w13
    TMP4    .req x14
    WTMP4   .req w14
    STRIDE  .req x15
    DUMMY   .req x30

    .set prefetch_offset, \prefetch_distance

    stp       x29, x30, [sp, -16]!
    mov       x29, sp
    sub       x29, x29, 64
    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    stp       x10, x11, [x29, -80]
    stp       x12, x13, [x29, -96]
    stp       x14, x15, [x29, -112]
    str       x8, [x29, -120]
    ldr       w8, [x29, 16]
    sub       sp, sp, 120
.endif

    mov       WTMP1, #\prefetch_distance
    umull     PF_OFFS, WTMP1, UX

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       300f

    dup       v12.8h, X
    dup       v13.8h, UX
    dup       v28.8b, WWT
    dup       v29.8b, WWB
    mov       v25.d[0], v12.d[1]
    mov       v26.d[0], v13.d[0]
    add       v25.4h, v25.4h, v26.4h
    mov       v12.d[1], v25.d[0]

    /* ensure good destination alignment */
    cmp       WIDTH, #1
    blt       100f
    tst       OUT, #(1 << \dst_bpp_shift)
    beq       100f
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add       v12.8h, v12.8h, v13.8h
    \bilinear_process_last_pixel
    sub       WIDTH, WIDTH, #1
100:
    add       v13.8h, v13.8h, v13.8h
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add       v12.8h, v12.8h, v13.8h

    cmp       WIDTH, #2
    blt       100f
    tst       OUT, #(1 << (\dst_bpp_shift + 1))
    beq       100f
    \bilinear_process_two_pixels
    sub       WIDTH, WIDTH, #2
100:
.if \pixblock_size == 8
    cmp       WIDTH, #4
    blt       100f
    tst       OUT, #(1 << (\dst_bpp_shift + 2))
    beq       100f
    \bilinear_process_four_pixels
    sub       WIDTH, WIDTH, #4
100:
.endif
    subs      WIDTH, WIDTH, #\pixblock_size
    blt       100f
    asr       PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
    \bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #\pixblock_size
    blt       500f
0:
    \bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #\pixblock_size
    bge       0b
500:
    \bilinear_process_pixblock_tail
100:
.if \pixblock_size == 8
    tst       WIDTH, #4
    beq       200f
    \bilinear_process_four_pixels
200:
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       200f
    \bilinear_process_two_pixels
200:
    tst       WIDTH, #1
    beq       300f
    \bilinear_process_last_pixel
300:

.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
    sub       x29, x29, 64
    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp       x10, x11, [x29, -80]
    ldp       x12, x13, [x29, -96]
    ldp       x14, x15, [x29, -112]
    mov       sp, x29
    ldp       x29, x30, [sp], 16
.else
    sub       x29, x29, 64
    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    ldp       x10, x11, [x29, -80]
    ldp       x12, x13, [x29, -96]
    ldp       x14, x15, [x29, -112]
    ldr       x8, [x29, -120]
    mov       sp, x29
    ldp       x29, x30, [sp], 16
.endif
    VERIFY_LR
    ret

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WWT
    .unreq    WB
    .unreq    WWB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    WTMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
    .unreq    MASK
.endif

pixman_end_asm_function

.endm
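
/*
 * The functions generated below are bound to C fast paths through the
 * PIXMAN_ARM_BIND_SCALED_BILINEAR_* macros.  Assuming the prototypes in
 * pixman-arm-common.h, a masked variant is declared roughly as follows
 * (illustrative sketch; check that header for the authoritative form):
 *
 *   void
 *   pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon (
 *       uint32_t       *dst,     // OUT    = x0
 *       const uint8_t  *mask,    // MASK   = x1
 *       const uint32_t *top,     // TOP    = x2
 *       const uint32_t *bottom,  // BOTTOM = x3
 *       int             wt,      // WT = x4, vertical weight of the top row
 *       int             wb,      // WB = x5, vertical weight of the bottom row
 *       pixman_fixed_t  x,       // X  = w6, 16.16 position in the source
 *       pixman_fixed_t  ux,      // UX = w7, 16.16 step per destination pixel
 *       int             width);  // WIDTH, ninth argument, passed on the stack
 *
 * which matches the register assignments and the 'ldr w8, [x29, 16]' in the
 * prologue above.
 */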

/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
.endm

/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head
.endm

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head
.endm

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.endm

/* over_8888_8888 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_pixblock_head
    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #2
    asr       WTMP2, X, #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, lsl #2

    ld1       {v22.2s}, [TMP1], STRIDE
    ld1       {v23.2s}, [TMP1]
    asr       WTMP3, X, #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, lsl #2
    umull     v8.8h, v22.8b, v28.8b
    umlal     v8.8h, v23.8b, v29.8b

    ld1       {v22.2s}, [TMP2], STRIDE
    ld1       {v23.2s}, [TMP2]
    asr       WTMP4, X, #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, lsl #2
    umull     v9.8h, v22.8b, v28.8b
    umlal     v9.8h, v23.8b, v29.8b

    ld1       {v22.2s}, [TMP3], STRIDE
    ld1       {v23.2s}, [TMP3]
    umull     v10.8h, v22.8b, v28.8b
    umlal     v10.8h, v23.8b, v29.8b

    ushll     v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v0.4s, v8.4h, v15.h[0]
    umlal2    v0.4s, v8.8h, v15.h[0]

    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
    ld1       {v16.2s}, [TMP4], STRIDE
    ld1       {v17.2s}, [TMP4]
    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
    umull     v11.8h, v16.8b, v28.8b
    umlal     v11.8h, v17.8b, v29.8b

    ushll     v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v1.4s, v9.4h, v15.h[4]
    umlal2    v1.4s, v9.8h, v15.h[4]
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add       v12.8h, v12.8h, v13.8h
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail
    ushll     v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v2.4s, v10.4h, v15.h[0]
    umlal2    v2.4s, v10.8h, v15.h[0]
    ushll     v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v3.4s, v11.4h, v15.h[4]
    umlal2    v3.4s, v11.8h, v15.h[4]
    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2     v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    shrn2     v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    xtn       v6.8b, v0.8h
    xtn       v7.8b, v2.8h
    ld1       {v2.2s, v3.2s}, [OUT]
    prfm      PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
    vuzp      v6.8b, v7.8b
    vuzp      v2.8b, v3.8b
    vuzp      v6.8b, v7.8b
    vuzp      v2.8b, v3.8b
    dup       v4.2s, v7.s[1]
    mvn       v4.8b, v4.8b
    umull     v11.8h, v2.8b, v4.8b
    umull     v2.8h, v3.8b, v4.8b
    urshr     v1.8h, v11.8h, #8
    urshr     v10.8h, v2.8h, #8
    raddhn    v3.8b, v10.8h, v2.8h
    raddhn    v2.8b, v1.8h, v11.8h
    uqadd     v6.8b, v2.8b, v6.8b
    uqadd     v7.8b, v3.8b, v7.8b
    vuzp      v6.8b, v7.8b
    vuzp      v6.8b, v7.8b
    add       v12.8h, v12.8h, v13.8h
    st1       {v6.2s, v7.2s}, [OUT], #16
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail_head
    ushll     v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
    asr       WTMP1, X, #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, lsl #2
    umlsl     v2.4s, v10.4h, v15.h[0]
    asr       WTMP2, X, #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, lsl #2
    umlal2    v2.4s, v10.8h, v15.h[0]
    ushll     v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
    ld1       {v20.2s}, [TMP1], STRIDE
    umlsl     v3.4s, v11.4h, v15.h[4]
    umlal2    v3.4s, v11.8h, v15.h[4]
    ld1       {v21.2s}, [TMP1]
    umull     v8.8h, v20.8b, v28.8b
    umlal     v8.8h, v21.8b, v29.8b
    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn2     v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    ld1       {v22.2s}, [TMP2], STRIDE
    shrn2     v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
    xtn       v6.8b, v0.8h
    ld1       {v23.2s}, [TMP2]
    umull     v9.8h, v22.8b, v28.8b
    asr       WTMP3, X, #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, lsl #2
    asr       WTMP4, X, #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, lsl #2
    umlal     v9.8h, v23.8b, v29.8b
    xtn       v7.8b, v2.8h
    ld1       {v2.2s, v3.2s}, [OUT]
    prfm      PREFETCH_MODE, [OUT, PF_OFFS]
    ld1       {v22.2s}, [TMP3], STRIDE
    vuzp      v6.8b, v7.8b
    vuzp      v2.8b, v3.8b
    vuzp      v6.8b, v7.8b
    vuzp      v2.8b, v3.8b
    dup       v4.2s, v7.s[1]
    ld1       {v23.2s}, [TMP3]
    mvn       v4.8b, v4.8b
    umull     v10.8h, v22.8b, v28.8b
    umlal     v10.8h, v23.8b, v29.8b
    umull     v11.8h, v2.8b, v4.8b
    umull     v2.8h, v3.8b, v4.8b
    ushll     v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
    umlsl     v0.4s, v8.4h, v15.h[0]
    urshr     v1.8h, v11.8h, #8
    umlal2    v0.4s, v8.8h, v15.h[0]
    urshr     v8.8h, v2.8h, #8
    raddhn    v3.8b, v8.8h, v2.8h
    raddhn    v2.8b, v1.8h, v11.8h
    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
    ld1       {v16.2s}, [TMP4], STRIDE
    uqadd     v6.8b, v2.8b, v6.8b
    uqadd     v7.8b, v3.8b, v7.8b
    ld1       {v17.2s}, [TMP4]
    prfm      PREFETCH_MODE, [TMP4, PF_OFFS]
    umull     v11.8h, v16.8b, v28.8b
    umlal     v11.8h, v17.8b, v29.8b
    vuzp      v6.8b, v7.8b
    ushll     v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
    vuzp      v6.8b, v7.8b
    umlsl     v1.4s, v9.4h, v15.h[4]
    add       v12.8h, v12.8h, v13.8h
    umlal2    v1.4s, v9.8h, v15.h[4]
    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
    add       v12.8h, v12.8h, v13.8h
    st1       {v6.2s, v7.2s}, [OUT], #16
.endm

/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_head
    bilinear_over_8888_8_8888_process_four_pixels
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    bilinear_over_8888_8_8888_process_pixblock_tail
    bilinear_over_8888_8_8888_process_pixblock_head
.endm

/* add_8888_8888 */
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head
.endm

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
.endm


/* Bilinear scanline functions */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    8888, 0565, 2, 1, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    0565, 8888, 1, 2, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    0565, 0565, 1, 1, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK