pixman-vmx.c (64570B)
1 /* 2 * Copyright © 2007 Luca Barbato 3 * 4 * Permission to use, copy, modify, distribute, and sell this software and its 5 * documentation for any purpose is hereby granted without fee, provided that 6 * the above copyright notice appear in all copies and that both that 7 * copyright notice and this permission notice appear in supporting 8 * documentation, and that the name of Luca Barbato not be used in advertising or 9 * publicity pertaining to distribution of the software without specific, 10 * written prior permission. Luca Barbato makes no representations about the 11 * suitability of this software for any purpose. It is provided "as is" 12 * without express or implied warranty. 13 * 14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS 15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 21 * SOFTWARE. 22 * 23 * Author: Luca Barbato (lu_zero@gentoo.org) 24 * 25 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell 26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include <pixman-config.h> 30 #endif 31 #include "pixman-combine32.h" 32 #include "pixman-inlines.h" 33 #include "pixman-private.h" 34 #include <altivec.h> 35 36 static const vector unsigned char vzero = (const vector unsigned char){0}; 37 static vector unsigned char mask_ff000000; 38 39 static force_inline vector unsigned char 40 splat_alpha (vector unsigned char pix) 41 { 42 const vector unsigned char sel = (vector unsigned char){ 43 #ifdef WORDS_BIGENDIAN 44 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 45 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C, 46 #else 47 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07, 48 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F, 49 #endif 50 }; 51 52 return vec_perm (pix, pix, sel); 53 } 54 55 static force_inline vector unsigned char 56 splat_pixel (vector unsigned char pix) 57 { 58 const vector unsigned char sel = (vector unsigned char){ 59 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 60 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 61 }; 62 63 return vec_perm (pix, pix, sel); 64 } 65 66 static force_inline vector unsigned short 67 create_mask_16_128 (uint32_t mask) 68 { 69 return (vector unsigned short){mask, mask, mask, mask, 70 mask, mask, mask, mask}; 71 } 72 73 static force_inline vector unsigned int 74 create_mask_32_128 (uint32_t mask) 75 { 76 return (vector unsigned int){mask, mask, mask, mask}; 77 } 78 79 static force_inline vector unsigned char 80 unpacklo_128_16x8 (vector unsigned char data1, vector unsigned char data2) 81 { 82 #ifdef WORDS_BIGENDIAN 83 return vec_mergel (data2, data1); 84 #else 85 return vec_mergel (data1, data2); 86 #endif 87 } 88 89 static force_inline vector unsigned char 90 unpackhi_128_16x8 (vector unsigned char data1, vector unsigned char data2) 91 { 92 #ifdef WORDS_BIGENDIAN 93 return vec_mergeh (data2, data1); 94 #else 95 return vec_mergeh (data1, data2); 96 #endif 97 } 98 99 static force_inline void 100 unpack_128_2x128 (vector unsigned char data1, 101 vector unsigned char data2, 102 vector unsigned char *data_lo, 103 vector unsigned char *data_hi) 104 { 105 *data_lo = unpacklo_128_16x8 (data1, data2); 106 *data_hi = unpackhi_128_16x8 (data1, data2); 107 } 108 109 static force_inline vector unsigned char 110 pix_multiply (vector unsigned char a, vector unsigned char b) 111 { 112 const vector unsigned char sel = (vector unsigned char){ 113 #ifdef WORDS_BIGENDIAN 114 0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 115 0x08, 0x18, 0x0a, 0x1a, 0x0c, 0x1c, 0x0e, 0x1e, 116 #else 117 0x01, 0x11, 0x03, 0x13, 0x05, 0x15, 0x07, 0x17, 118 0x09, 0x19, 0x0b, 0x1b, 0x0d, 0x1d, 0x0f, 0x1f, 119 #endif 120 }; 121 vector unsigned short e = vec_mule (a, b); 122 vector unsigned short o = vec_mulo (a, b); 123 124 e = vec_adds (e, create_mask_16_128 (128)); 125 o = vec_adds (o, create_mask_16_128 (128)); 126 127 e = vec_adds (e, vec_sr (e, vec_splat_u16 (8))); 128 o = vec_adds (o, vec_sr (o, vec_splat_u16 (8))); 129 130 return (vector unsigned char)vec_perm (e, o, sel); 131 } 132 133 static force_inline vector unsigned char 134 pix_add (vector unsigned char a, vector unsigned char b) 135 { 136 return vec_adds (a, b); 137 } 138 139 static force_inline vector unsigned char 140 pix_add_mul (vector unsigned char x, 141 vector unsigned char a, 142 vector unsigned char y, 143 vector unsigned char b) 144 { 145 vector unsigned char t1, t2; 146 147 t1 = pix_multiply (x, a); 148 t2 = pix_multiply (y, b); 149 150 return pix_add (t1, t2); 151 } 152 153 static force_inline vector unsigned char 154 negate (vector unsigned char src) 155 { 156 return vec_nor (src, src); 157 } 158 159 /* dest*~srca + src */ 160 static force_inline vector unsigned char 161 over (vector unsigned char src, 162 vector unsigned char srca, 163 vector unsigned char dest) 164 { 165 return vec_adds (src, pix_multiply (dest, negate (srca))); 166 } 167 168 /* in == pix_multiply */ 169 static force_inline vector unsigned char 170 in_over (vector unsigned char src, 171 vector unsigned char srca, 172 vector unsigned char mask, 173 vector unsigned char dest) 174 { 175 return over (pix_multiply (src, mask), pix_multiply (srca, mask), dest); 176 } 177 178 #ifdef WORDS_BIGENDIAN 179 180 #define COMPUTE_SHIFT_MASK(source) source##_mask = vec_lvsl (0, source); 181 182 #define COMPUTE_SHIFT_MASKS(dest, source) source##_mask = vec_lvsl (0, source); 183 184 #define COMPUTE_SHIFT_MASKC(dest, source, mask) \ 185 mask##_mask = vec_lvsl (0, mask); \ 186 source##_mask = vec_lvsl (0, source); 187 188 #define LOAD_VECTOR(source) \ 189 do \ 190 { \ 191 vector unsigned char tmp1, tmp2; \ 192 tmp1 = (typeof (tmp1))vec_ld (0, source); \ 193 tmp2 = (typeof (tmp2))vec_ld (15, source); \ 194 v##source = (typeof (v##source))vec_perm (tmp1, tmp2, source##_mask); \ 195 } while (0) 196 197 #define LOAD_VECTORS(dest, source) \ 198 do \ 199 { \ 200 LOAD_VECTOR (source); \ 201 v##dest = (typeof (v##dest))vec_ld (0, dest); \ 202 } while (0) 203 204 #define LOAD_VECTORSC(dest, source, mask) \ 205 do \ 206 { \ 207 LOAD_VECTORS (dest, source); \ 208 LOAD_VECTOR (mask); \ 209 } while (0) 210 211 #define DECLARE_SRC_MASK_VAR vector unsigned char src_mask 212 #define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask 213 214 #else 215 216 /* Now the COMPUTE_SHIFT_{MASK, MASKS, MASKC} below are just no-op. 217 * They are defined that way because little endian altivec can do unaligned 218 * reads natively and have no need for constructing the permutation pattern 219 * variables. 220 */ 221 #define COMPUTE_SHIFT_MASK(source) 222 223 #define COMPUTE_SHIFT_MASKS(dest, source) 224 225 #define COMPUTE_SHIFT_MASKC(dest, source, mask) 226 227 #define LOAD_VECTOR(source) v##source = (typeof (v##source))vec_xl (0, source); 228 229 #define LOAD_VECTORS(dest, source) \ 230 LOAD_VECTOR (source); \ 231 LOAD_VECTOR (dest); 232 233 #define LOAD_VECTORSC(dest, source, mask) \ 234 LOAD_VECTORS (dest, source); \ 235 LOAD_VECTOR (mask); 236 237 #define DECLARE_SRC_MASK_VAR 238 #define DECLARE_MASK_MASK_VAR 239 240 #endif /* WORDS_BIGENDIAN */ 241 242 #define LOAD_VECTORSM(dest, source, mask) \ 243 LOAD_VECTORSC (dest, source, mask); \ 244 v##source = pix_multiply (v##source, splat_alpha (v##mask)); 245 246 #define STORE_VECTOR(dest) vec_st ((vector unsigned int)v##dest, 0, dest); 247 248 /* load 4 pixels from a 16-byte boundary aligned address */ 249 static force_inline vector unsigned char 250 load_128_aligned (const uint32_t *src) 251 { 252 return *((vector unsigned char *)src); 253 } 254 255 /* load 4 pixels from a unaligned address */ 256 static force_inline vector unsigned char 257 load_128_unaligned (const uint32_t *src) 258 { 259 vector unsigned char vsrc; 260 DECLARE_SRC_MASK_VAR; 261 262 COMPUTE_SHIFT_MASK (src); 263 LOAD_VECTOR (src); 264 265 return vsrc; 266 } 267 268 /* save 4 pixels on a 16-byte boundary aligned address */ 269 static force_inline void 270 save_128_aligned (uint32_t *data, vector unsigned char vdata) 271 { 272 STORE_VECTOR (data) 273 } 274 275 static force_inline int 276 is_opaque (vector unsigned char x) 277 { 278 return vec_all_eq (vec_and (x, mask_ff000000), mask_ff000000); 279 } 280 281 static force_inline int 282 is_zero (vector unsigned char x) 283 { 284 return vec_all_eq (x, vzero); 285 } 286 287 static force_inline int 288 is_transparent (vector unsigned char x) 289 { 290 return vec_all_eq (vec_and (x, mask_ff000000), vzero); 291 } 292 293 static force_inline uint32_t 294 core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst) 295 { 296 uint32_t a; 297 298 a = ALPHA_8 (src); 299 300 if (a == 0xff) 301 { 302 return src; 303 } 304 else if (src) 305 { 306 UN8x4_MUL_UN8_ADD_UN8x4 (dst, (~a & MASK), src); 307 } 308 309 return dst; 310 } 311 312 static force_inline uint32_t 313 combine1 (const uint32_t *ps, const uint32_t *pm) 314 { 315 uint32_t s = *ps; 316 317 if (pm) 318 UN8x4_MUL_UN8 (s, ALPHA_8 (*pm)); 319 320 return s; 321 } 322 323 static force_inline vector unsigned char 324 combine4 (const uint32_t *ps, const uint32_t *pm) 325 { 326 vector unsigned char src, msk; 327 328 if (pm) 329 { 330 msk = load_128_unaligned (pm); 331 332 if (is_transparent (msk)) 333 return vzero; 334 } 335 336 src = load_128_unaligned (ps); 337 338 if (pm) 339 src = pix_multiply (src, msk); 340 341 return src; 342 } 343 344 static void 345 vmx_combine_over_u_no_mask (uint32_t *dest, const uint32_t *src, int width) 346 { 347 vector unsigned char vdest, vsrc; 348 DECLARE_SRC_MASK_VAR; 349 350 while (width && ((uintptr_t)dest & 15)) 351 { 352 uint32_t s = *src++; 353 uint32_t d = *dest; 354 uint32_t ia = ALPHA_8 (~s); 355 356 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); 357 358 *dest++ = d; 359 width--; 360 } 361 362 COMPUTE_SHIFT_MASKS (dest, src); 363 364 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 365 for (int i = width / 4; i > 0; i--) 366 { 367 368 LOAD_VECTORS (dest, src); 369 370 vdest = over (vsrc, splat_alpha (vsrc), vdest); 371 372 STORE_VECTOR (dest); 373 374 src += 4; 375 dest += 4; 376 } 377 378 for (int i = width % 4; --i >= 0;) 379 { 380 uint32_t s = src[i]; 381 uint32_t d = dest[i]; 382 uint32_t ia = ALPHA_8 (~s); 383 384 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); 385 386 dest[i] = d; 387 } 388 } 389 390 static void 391 vmx_combine_over_u_mask (uint32_t *dest, 392 const uint32_t *src, 393 const uint32_t *mask, 394 int width) 395 { 396 vector unsigned char vdest, vsrc, vmask; 397 DECLARE_SRC_MASK_VAR; 398 DECLARE_MASK_MASK_VAR; 399 400 while (width && ((uintptr_t)dest & 15)) 401 { 402 uint32_t m = ALPHA_8 (*mask++); 403 uint32_t s = *src++; 404 uint32_t d = *dest; 405 uint32_t ia; 406 407 UN8x4_MUL_UN8 (s, m); 408 409 ia = ALPHA_8 (~s); 410 411 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); 412 *dest++ = d; 413 width--; 414 } 415 416 COMPUTE_SHIFT_MASKC (dest, src, mask); 417 418 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 419 for (int i = width / 4; i > 0; i--) 420 { 421 LOAD_VECTORSM (dest, src, mask); 422 423 vdest = over (vsrc, splat_alpha (vsrc), vdest); 424 425 STORE_VECTOR (dest); 426 427 src += 4; 428 dest += 4; 429 mask += 4; 430 } 431 432 for (int i = width % 4; --i >= 0;) 433 { 434 uint32_t m = ALPHA_8 (mask[i]); 435 uint32_t s = src[i]; 436 uint32_t d = dest[i]; 437 uint32_t ia; 438 439 UN8x4_MUL_UN8 (s, m); 440 441 ia = ALPHA_8 (~s); 442 443 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); 444 dest[i] = d; 445 } 446 } 447 448 static void 449 vmx_combine_over_u (pixman_implementation_t *imp, 450 pixman_op_t op, 451 uint32_t *dest, 452 const uint32_t *src, 453 const uint32_t *mask, 454 int width) 455 { 456 if (mask) 457 vmx_combine_over_u_mask (dest, src, mask, width); 458 else 459 vmx_combine_over_u_no_mask (dest, src, width); 460 } 461 462 static void 463 vmx_combine_over_reverse_u_no_mask (uint32_t *dest, 464 const uint32_t *src, 465 int width) 466 { 467 vector unsigned char vdest, vsrc; 468 DECLARE_SRC_MASK_VAR; 469 470 while (width && ((uintptr_t)dest & 15)) 471 { 472 uint32_t s = *src++; 473 uint32_t d = *dest; 474 uint32_t ia = ALPHA_8 (~d); 475 476 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); 477 *dest++ = s; 478 width--; 479 } 480 481 COMPUTE_SHIFT_MASKS (dest, src); 482 483 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 484 for (int i = width / 4; i > 0; i--) 485 { 486 487 LOAD_VECTORS (dest, src); 488 489 vdest = over (vdest, splat_alpha (vdest), vsrc); 490 491 STORE_VECTOR (dest); 492 493 src += 4; 494 dest += 4; 495 } 496 497 for (int i = width % 4; --i >= 0;) 498 { 499 uint32_t s = src[i]; 500 uint32_t d = dest[i]; 501 uint32_t ia = ALPHA_8 (~dest[i]); 502 503 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); 504 dest[i] = s; 505 } 506 } 507 508 static void 509 vmx_combine_over_reverse_u_mask (uint32_t *dest, 510 const uint32_t *src, 511 const uint32_t *mask, 512 int width) 513 { 514 vector unsigned char vdest, vsrc, vmask; 515 DECLARE_SRC_MASK_VAR; 516 DECLARE_MASK_MASK_VAR; 517 518 while (width && ((uintptr_t)dest & 15)) 519 { 520 uint32_t m = ALPHA_8 (*mask++); 521 uint32_t s = *src++; 522 uint32_t d = *dest; 523 uint32_t ia = ALPHA_8 (~d); 524 525 UN8x4_MUL_UN8 (s, m); 526 527 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); 528 *dest++ = s; 529 width--; 530 } 531 532 COMPUTE_SHIFT_MASKC (dest, src, mask); 533 534 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 535 for (int i = width / 4; i > 0; i--) 536 { 537 538 LOAD_VECTORSM (dest, src, mask); 539 540 vdest = over (vdest, splat_alpha (vdest), vsrc); 541 542 STORE_VECTOR (dest); 543 544 src += 4; 545 dest += 4; 546 mask += 4; 547 } 548 549 for (int i = width % 4; --i >= 0;) 550 { 551 uint32_t m = ALPHA_8 (mask[i]); 552 uint32_t s = src[i]; 553 uint32_t d = dest[i]; 554 uint32_t ia = ALPHA_8 (~dest[i]); 555 556 UN8x4_MUL_UN8 (s, m); 557 558 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); 559 dest[i] = s; 560 } 561 } 562 563 static void 564 vmx_combine_over_reverse_u (pixman_implementation_t *imp, 565 pixman_op_t op, 566 uint32_t *dest, 567 const uint32_t *src, 568 const uint32_t *mask, 569 int width) 570 { 571 if (mask) 572 vmx_combine_over_reverse_u_mask (dest, src, mask, width); 573 else 574 vmx_combine_over_reverse_u_no_mask (dest, src, width); 575 } 576 577 static void 578 vmx_combine_in_u_no_mask (uint32_t *dest, const uint32_t *src, int width) 579 { 580 vector unsigned char vdest, vsrc; 581 DECLARE_SRC_MASK_VAR; 582 583 while (width && ((uintptr_t)dest & 15)) 584 { 585 uint32_t s = *src++; 586 uint32_t a = ALPHA_8 (*dest); 587 588 UN8x4_MUL_UN8 (s, a); 589 *dest++ = s; 590 width--; 591 } 592 593 COMPUTE_SHIFT_MASKS (dest, src); 594 595 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 596 for (int i = width / 4; i > 0; i--) 597 { 598 LOAD_VECTORS (dest, src); 599 600 vdest = pix_multiply (vsrc, splat_alpha (vdest)); 601 602 STORE_VECTOR (dest); 603 604 src += 4; 605 dest += 4; 606 } 607 608 for (int i = width % 4; --i >= 0;) 609 { 610 uint32_t s = src[i]; 611 uint32_t a = ALPHA_8 (dest[i]); 612 613 UN8x4_MUL_UN8 (s, a); 614 dest[i] = s; 615 } 616 } 617 618 static void 619 vmx_combine_in_u_mask (uint32_t *dest, 620 const uint32_t *src, 621 const uint32_t *mask, 622 int width) 623 { 624 vector unsigned char vdest, vsrc, vmask; 625 DECLARE_SRC_MASK_VAR; 626 DECLARE_MASK_MASK_VAR; 627 628 while (width && ((uintptr_t)dest & 15)) 629 { 630 uint32_t m = ALPHA_8 (*mask++); 631 uint32_t s = *src++; 632 uint32_t a = ALPHA_8 (*dest); 633 634 UN8x4_MUL_UN8 (s, m); 635 UN8x4_MUL_UN8 (s, a); 636 637 *dest++ = s; 638 width--; 639 } 640 641 COMPUTE_SHIFT_MASKC (dest, src, mask); 642 643 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 644 for (int i = width / 4; i > 0; i--) 645 { 646 LOAD_VECTORSM (dest, src, mask); 647 648 vdest = pix_multiply (vsrc, splat_alpha (vdest)); 649 650 STORE_VECTOR (dest); 651 652 src += 4; 653 dest += 4; 654 mask += 4; 655 } 656 657 for (int i = width % 4; --i >= 0;) 658 { 659 uint32_t m = ALPHA_8 (mask[i]); 660 uint32_t s = src[i]; 661 uint32_t a = ALPHA_8 (dest[i]); 662 663 UN8x4_MUL_UN8 (s, m); 664 UN8x4_MUL_UN8 (s, a); 665 666 dest[i] = s; 667 } 668 } 669 670 static void 671 vmx_combine_in_u (pixman_implementation_t *imp, 672 pixman_op_t op, 673 uint32_t *dest, 674 const uint32_t *src, 675 const uint32_t *mask, 676 int width) 677 { 678 if (mask) 679 vmx_combine_in_u_mask (dest, src, mask, width); 680 else 681 vmx_combine_in_u_no_mask (dest, src, width); 682 } 683 684 static void 685 vmx_combine_in_reverse_u_no_mask (uint32_t *dest, 686 const uint32_t *src, 687 int width) 688 { 689 vector unsigned char vdest, vsrc; 690 DECLARE_SRC_MASK_VAR; 691 692 while (width && ((uintptr_t)dest & 15)) 693 { 694 uint32_t d = *dest; 695 uint32_t a = ALPHA_8 (*src++); 696 697 UN8x4_MUL_UN8 (d, a); 698 699 *dest++ = d; 700 width--; 701 } 702 703 COMPUTE_SHIFT_MASKS (dest, src); 704 705 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 706 for (int i = width / 4; i > 0; i--) 707 { 708 LOAD_VECTORS (dest, src); 709 710 vdest = pix_multiply (vdest, splat_alpha (vsrc)); 711 712 STORE_VECTOR (dest); 713 714 src += 4; 715 dest += 4; 716 } 717 718 for (int i = width % 4; --i >= 0;) 719 { 720 uint32_t d = dest[i]; 721 uint32_t a = ALPHA_8 (src[i]); 722 723 UN8x4_MUL_UN8 (d, a); 724 725 dest[i] = d; 726 } 727 } 728 729 static void 730 vmx_combine_in_reverse_u_mask (uint32_t *dest, 731 const uint32_t *src, 732 const uint32_t *mask, 733 int width) 734 { 735 vector unsigned char vdest, vsrc, vmask; 736 DECLARE_SRC_MASK_VAR; 737 DECLARE_MASK_MASK_VAR; 738 739 while (width && ((uintptr_t)dest & 15)) 740 { 741 uint32_t m = ALPHA_8 (*mask++); 742 uint32_t d = *dest; 743 uint32_t a = *src++; 744 745 UN8x4_MUL_UN8 (a, m); 746 a = ALPHA_8 (a); 747 UN8x4_MUL_UN8 (d, a); 748 749 *dest++ = d; 750 width--; 751 } 752 753 COMPUTE_SHIFT_MASKC (dest, src, mask); 754 755 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 756 for (int i = width / 4; i > 0; i--) 757 { 758 LOAD_VECTORSM (dest, src, mask); 759 760 vdest = pix_multiply (vdest, splat_alpha (vsrc)); 761 762 STORE_VECTOR (dest); 763 764 src += 4; 765 dest += 4; 766 mask += 4; 767 } 768 769 for (int i = width % 4; --i >= 0;) 770 { 771 uint32_t m = ALPHA_8 (mask[i]); 772 uint32_t d = dest[i]; 773 uint32_t a = src[i]; 774 775 UN8x4_MUL_UN8 (a, m); 776 a = ALPHA_8 (a); 777 UN8x4_MUL_UN8 (d, a); 778 779 dest[i] = d; 780 } 781 } 782 783 static void 784 vmx_combine_in_reverse_u (pixman_implementation_t *imp, 785 pixman_op_t op, 786 uint32_t *dest, 787 const uint32_t *src, 788 const uint32_t *mask, 789 int width) 790 { 791 if (mask) 792 vmx_combine_in_reverse_u_mask (dest, src, mask, width); 793 else 794 vmx_combine_in_reverse_u_no_mask (dest, src, width); 795 } 796 797 static void 798 vmx_combine_out_u_no_mask (uint32_t *dest, const uint32_t *src, int width) 799 { 800 vector unsigned char vdest, vsrc; 801 DECLARE_SRC_MASK_VAR; 802 803 while (width && ((uintptr_t)dest & 15)) 804 { 805 uint32_t s = *src++; 806 uint32_t a = ALPHA_8 (~(*dest)); 807 808 UN8x4_MUL_UN8 (s, a); 809 810 *dest++ = s; 811 width--; 812 } 813 814 COMPUTE_SHIFT_MASKS (dest, src); 815 816 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 817 for (int i = width / 4; i > 0; i--) 818 { 819 LOAD_VECTORS (dest, src); 820 821 vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); 822 823 STORE_VECTOR (dest); 824 825 src += 4; 826 dest += 4; 827 } 828 829 for (int i = width % 4; --i >= 0;) 830 { 831 uint32_t s = src[i]; 832 uint32_t a = ALPHA_8 (~dest[i]); 833 834 UN8x4_MUL_UN8 (s, a); 835 836 dest[i] = s; 837 } 838 } 839 840 static void 841 vmx_combine_out_u_mask (uint32_t *dest, 842 const uint32_t *src, 843 const uint32_t *mask, 844 int width) 845 { 846 vector unsigned char vdest, vsrc, vmask; 847 DECLARE_SRC_MASK_VAR; 848 DECLARE_MASK_MASK_VAR; 849 850 while (width && ((uintptr_t)dest & 15)) 851 { 852 uint32_t m = ALPHA_8 (*mask++); 853 uint32_t s = *src++; 854 uint32_t a = ALPHA_8 (~(*dest)); 855 856 UN8x4_MUL_UN8 (s, m); 857 UN8x4_MUL_UN8 (s, a); 858 859 *dest++ = s; 860 width--; 861 } 862 863 COMPUTE_SHIFT_MASKC (dest, src, mask); 864 865 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 866 for (int i = width / 4; i > 0; i--) 867 { 868 LOAD_VECTORSM (dest, src, mask); 869 870 vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); 871 872 STORE_VECTOR (dest); 873 874 src += 4; 875 dest += 4; 876 mask += 4; 877 } 878 879 for (int i = width % 4; --i >= 0;) 880 { 881 uint32_t m = ALPHA_8 (mask[i]); 882 uint32_t s = src[i]; 883 uint32_t a = ALPHA_8 (~dest[i]); 884 885 UN8x4_MUL_UN8 (s, m); 886 UN8x4_MUL_UN8 (s, a); 887 888 dest[i] = s; 889 } 890 } 891 892 static void 893 vmx_combine_out_u (pixman_implementation_t *imp, 894 pixman_op_t op, 895 uint32_t *dest, 896 const uint32_t *src, 897 const uint32_t *mask, 898 int width) 899 { 900 if (mask) 901 vmx_combine_out_u_mask (dest, src, mask, width); 902 else 903 vmx_combine_out_u_no_mask (dest, src, width); 904 } 905 906 static void 907 vmx_combine_out_reverse_u_no_mask (uint32_t *dest, 908 const uint32_t *src, 909 int width) 910 { 911 vector unsigned char vdest, vsrc; 912 DECLARE_SRC_MASK_VAR; 913 914 while (width && ((uintptr_t)dest & 15)) 915 { 916 uint32_t d = *dest; 917 uint32_t a = ALPHA_8 (~(*src++)); 918 919 UN8x4_MUL_UN8 (d, a); 920 921 *dest++ = d; 922 width--; 923 } 924 925 COMPUTE_SHIFT_MASKS (dest, src); 926 927 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 928 for (int i = width / 4; i > 0; i--) 929 { 930 931 LOAD_VECTORS (dest, src); 932 933 vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); 934 935 STORE_VECTOR (dest); 936 937 src += 4; 938 dest += 4; 939 } 940 941 for (int i = width % 4; --i >= 0;) 942 { 943 uint32_t d = dest[i]; 944 uint32_t a = ALPHA_8 (~src[i]); 945 946 UN8x4_MUL_UN8 (d, a); 947 948 dest[i] = d; 949 } 950 } 951 952 static void 953 vmx_combine_out_reverse_u_mask (uint32_t *dest, 954 const uint32_t *src, 955 const uint32_t *mask, 956 int width) 957 { 958 vector unsigned char vdest, vsrc, vmask; 959 DECLARE_SRC_MASK_VAR; 960 DECLARE_MASK_MASK_VAR; 961 962 while (width && ((uintptr_t)dest & 15)) 963 { 964 uint32_t m = ALPHA_8 (*mask++); 965 uint32_t d = *dest; 966 uint32_t a = *src++; 967 968 UN8x4_MUL_UN8 (a, m); 969 a = ALPHA_8 (~a); 970 UN8x4_MUL_UN8 (d, a); 971 972 *dest++ = d; 973 width--; 974 } 975 976 COMPUTE_SHIFT_MASKC (dest, src, mask); 977 978 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 979 for (int i = width / 4; i > 0; i--) 980 { 981 LOAD_VECTORSM (dest, src, mask); 982 983 vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); 984 985 STORE_VECTOR (dest); 986 987 src += 4; 988 dest += 4; 989 mask += 4; 990 } 991 992 for (int i = width % 4; --i >= 0;) 993 { 994 uint32_t m = ALPHA_8 (mask[i]); 995 uint32_t d = dest[i]; 996 uint32_t a = src[i]; 997 998 UN8x4_MUL_UN8 (a, m); 999 a = ALPHA_8 (~a); 1000 UN8x4_MUL_UN8 (d, a); 1001 1002 dest[i] = d; 1003 } 1004 } 1005 1006 static void 1007 vmx_combine_out_reverse_u (pixman_implementation_t *imp, 1008 pixman_op_t op, 1009 uint32_t *dest, 1010 const uint32_t *src, 1011 const uint32_t *mask, 1012 int width) 1013 { 1014 if (mask) 1015 vmx_combine_out_reverse_u_mask (dest, src, mask, width); 1016 else 1017 vmx_combine_out_reverse_u_no_mask (dest, src, width); 1018 } 1019 1020 static void 1021 vmx_combine_atop_u_no_mask (uint32_t *dest, const uint32_t *src, int width) 1022 { 1023 vector unsigned char vdest, vsrc; 1024 DECLARE_SRC_MASK_VAR; 1025 1026 while (width && ((uintptr_t)dest & 15)) 1027 { 1028 uint32_t s = *src++; 1029 uint32_t d = *dest; 1030 uint32_t dest_a = ALPHA_8 (d); 1031 uint32_t src_ia = ALPHA_8 (~s); 1032 1033 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); 1034 1035 *dest++ = s; 1036 width--; 1037 } 1038 1039 COMPUTE_SHIFT_MASKS (dest, src); 1040 1041 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1042 for (int i = width / 4; i > 0; i--) 1043 { 1044 LOAD_VECTORS (dest, src); 1045 1046 vdest = pix_add_mul (vsrc, splat_alpha (vdest), vdest, 1047 splat_alpha (negate (vsrc))); 1048 1049 STORE_VECTOR (dest); 1050 1051 src += 4; 1052 dest += 4; 1053 } 1054 1055 for (int i = width % 4; --i >= 0;) 1056 { 1057 uint32_t s = src[i]; 1058 uint32_t d = dest[i]; 1059 uint32_t dest_a = ALPHA_8 (d); 1060 uint32_t src_ia = ALPHA_8 (~s); 1061 1062 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); 1063 1064 dest[i] = s; 1065 } 1066 } 1067 1068 static void 1069 vmx_combine_atop_u_mask (uint32_t *dest, 1070 const uint32_t *src, 1071 const uint32_t *mask, 1072 int width) 1073 { 1074 vector unsigned char vdest, vsrc, vmask; 1075 DECLARE_SRC_MASK_VAR; 1076 DECLARE_MASK_MASK_VAR; 1077 1078 while (width && ((uintptr_t)dest & 15)) 1079 { 1080 uint32_t m = ALPHA_8 (*mask++); 1081 uint32_t s = *src++; 1082 uint32_t d = *dest; 1083 uint32_t dest_a = ALPHA_8 (d); 1084 uint32_t src_ia; 1085 1086 UN8x4_MUL_UN8 (s, m); 1087 1088 src_ia = ALPHA_8 (~s); 1089 1090 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); 1091 1092 *dest++ = s; 1093 width--; 1094 } 1095 1096 COMPUTE_SHIFT_MASKC (dest, src, mask); 1097 1098 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1099 for (int i = width / 4; i > 0; i--) 1100 { 1101 LOAD_VECTORSM (dest, src, mask); 1102 1103 vdest = pix_add_mul (vsrc, splat_alpha (vdest), vdest, 1104 splat_alpha (negate (vsrc))); 1105 1106 STORE_VECTOR (dest); 1107 1108 src += 4; 1109 dest += 4; 1110 mask += 4; 1111 } 1112 1113 for (int i = width % 4; --i >= 0;) 1114 { 1115 uint32_t m = ALPHA_8 (mask[i]); 1116 uint32_t s = src[i]; 1117 uint32_t d = dest[i]; 1118 uint32_t dest_a = ALPHA_8 (d); 1119 uint32_t src_ia; 1120 1121 UN8x4_MUL_UN8 (s, m); 1122 1123 src_ia = ALPHA_8 (~s); 1124 1125 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); 1126 1127 dest[i] = s; 1128 } 1129 } 1130 1131 static void 1132 vmx_combine_atop_u (pixman_implementation_t *imp, 1133 pixman_op_t op, 1134 uint32_t *dest, 1135 const uint32_t *src, 1136 const uint32_t *mask, 1137 int width) 1138 { 1139 if (mask) 1140 vmx_combine_atop_u_mask (dest, src, mask, width); 1141 else 1142 vmx_combine_atop_u_no_mask (dest, src, width); 1143 } 1144 1145 static void 1146 vmx_combine_atop_reverse_u_no_mask (uint32_t *dest, 1147 const uint32_t *src, 1148 int width) 1149 { 1150 vector unsigned char vdest, vsrc; 1151 DECLARE_SRC_MASK_VAR; 1152 1153 while (width && ((uintptr_t)dest & 15)) 1154 { 1155 uint32_t s = *src++; 1156 uint32_t d = *dest; 1157 uint32_t src_a = ALPHA_8 (s); 1158 uint32_t dest_ia = ALPHA_8 (~d); 1159 1160 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); 1161 1162 *dest++ = s; 1163 width--; 1164 } 1165 1166 COMPUTE_SHIFT_MASKS (dest, src); 1167 1168 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1169 for (int i = width / 4; i > 0; i--) 1170 { 1171 LOAD_VECTORS (dest, src); 1172 1173 vdest = pix_add_mul (vdest, splat_alpha (vsrc), vsrc, 1174 splat_alpha (negate (vdest))); 1175 1176 STORE_VECTOR (dest); 1177 1178 src += 4; 1179 dest += 4; 1180 } 1181 1182 for (int i = width % 4; --i >= 0;) 1183 { 1184 uint32_t s = src[i]; 1185 uint32_t d = dest[i]; 1186 uint32_t src_a = ALPHA_8 (s); 1187 uint32_t dest_ia = ALPHA_8 (~d); 1188 1189 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); 1190 1191 dest[i] = s; 1192 } 1193 } 1194 1195 static void 1196 vmx_combine_atop_reverse_u_mask (uint32_t *dest, 1197 const uint32_t *src, 1198 const uint32_t *mask, 1199 int width) 1200 { 1201 vector unsigned char vdest, vsrc, vmask; 1202 DECLARE_SRC_MASK_VAR; 1203 DECLARE_MASK_MASK_VAR; 1204 1205 while (width && ((uintptr_t)dest & 15)) 1206 { 1207 uint32_t m = ALPHA_8 (*mask++); 1208 uint32_t s = *src++; 1209 uint32_t d = *dest; 1210 uint32_t src_a; 1211 uint32_t dest_ia = ALPHA_8 (~d); 1212 1213 UN8x4_MUL_UN8 (s, m); 1214 1215 src_a = ALPHA_8 (s); 1216 1217 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); 1218 1219 *dest++ = s; 1220 width--; 1221 } 1222 1223 COMPUTE_SHIFT_MASKC (dest, src, mask); 1224 1225 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1226 for (int i = width / 4; i > 0; i--) 1227 { 1228 LOAD_VECTORSM (dest, src, mask); 1229 1230 vdest = pix_add_mul (vdest, splat_alpha (vsrc), vsrc, 1231 splat_alpha (negate (vdest))); 1232 1233 STORE_VECTOR (dest); 1234 1235 src += 4; 1236 dest += 4; 1237 mask += 4; 1238 } 1239 1240 for (int i = width % 4; --i >= 0;) 1241 { 1242 uint32_t m = ALPHA_8 (mask[i]); 1243 uint32_t s = src[i]; 1244 uint32_t d = dest[i]; 1245 uint32_t src_a; 1246 uint32_t dest_ia = ALPHA_8 (~d); 1247 1248 UN8x4_MUL_UN8 (s, m); 1249 1250 src_a = ALPHA_8 (s); 1251 1252 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); 1253 1254 dest[i] = s; 1255 } 1256 } 1257 1258 static void 1259 vmx_combine_atop_reverse_u (pixman_implementation_t *imp, 1260 pixman_op_t op, 1261 uint32_t *dest, 1262 const uint32_t *src, 1263 const uint32_t *mask, 1264 int width) 1265 { 1266 if (mask) 1267 vmx_combine_atop_reverse_u_mask (dest, src, mask, width); 1268 else 1269 vmx_combine_atop_reverse_u_no_mask (dest, src, width); 1270 } 1271 1272 static void 1273 vmx_combine_xor_u_no_mask (uint32_t *dest, const uint32_t *src, int width) 1274 { 1275 vector unsigned char vdest, vsrc; 1276 DECLARE_SRC_MASK_VAR; 1277 1278 while (width && ((uintptr_t)dest & 15)) 1279 { 1280 uint32_t s = *src++; 1281 uint32_t d = *dest; 1282 uint32_t src_ia = ALPHA_8 (~s); 1283 uint32_t dest_ia = ALPHA_8 (~d); 1284 1285 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); 1286 1287 *dest++ = s; 1288 width--; 1289 } 1290 1291 COMPUTE_SHIFT_MASKS (dest, src); 1292 1293 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1294 for (int i = width / 4; i > 0; i--) 1295 { 1296 LOAD_VECTORS (dest, src); 1297 1298 vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), vdest, 1299 splat_alpha (negate (vsrc))); 1300 1301 STORE_VECTOR (dest); 1302 1303 src += 4; 1304 dest += 4; 1305 } 1306 1307 for (int i = width % 4; --i >= 0;) 1308 { 1309 uint32_t s = src[i]; 1310 uint32_t d = dest[i]; 1311 uint32_t src_ia = ALPHA_8 (~s); 1312 uint32_t dest_ia = ALPHA_8 (~d); 1313 1314 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); 1315 1316 dest[i] = s; 1317 } 1318 } 1319 1320 static void 1321 vmx_combine_xor_u_mask (uint32_t *dest, 1322 const uint32_t *src, 1323 const uint32_t *mask, 1324 int width) 1325 { 1326 vector unsigned char vdest, vsrc, vmask; 1327 DECLARE_SRC_MASK_VAR; 1328 DECLARE_MASK_MASK_VAR; 1329 1330 while (width && ((uintptr_t)dest & 15)) 1331 { 1332 uint32_t m = ALPHA_8 (*mask++); 1333 uint32_t s = *src++; 1334 uint32_t d = *dest; 1335 uint32_t src_ia; 1336 uint32_t dest_ia = ALPHA_8 (~d); 1337 1338 UN8x4_MUL_UN8 (s, m); 1339 1340 src_ia = ALPHA_8 (~s); 1341 1342 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); 1343 1344 *dest++ = s; 1345 width--; 1346 } 1347 1348 COMPUTE_SHIFT_MASKC (dest, src, mask); 1349 1350 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1351 for (int i = width / 4; i > 0; i--) 1352 { 1353 LOAD_VECTORSM (dest, src, mask); 1354 1355 vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), vdest, 1356 splat_alpha (negate (vsrc))); 1357 1358 STORE_VECTOR (dest); 1359 1360 src += 4; 1361 dest += 4; 1362 mask += 4; 1363 } 1364 1365 for (int i = width % 4; --i >= 0;) 1366 { 1367 uint32_t m = ALPHA_8 (mask[i]); 1368 uint32_t s = src[i]; 1369 uint32_t d = dest[i]; 1370 uint32_t src_ia; 1371 uint32_t dest_ia = ALPHA_8 (~d); 1372 1373 UN8x4_MUL_UN8 (s, m); 1374 1375 src_ia = ALPHA_8 (~s); 1376 1377 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); 1378 1379 dest[i] = s; 1380 } 1381 } 1382 1383 static void 1384 vmx_combine_xor_u (pixman_implementation_t *imp, 1385 pixman_op_t op, 1386 uint32_t *dest, 1387 const uint32_t *src, 1388 const uint32_t *mask, 1389 int width) 1390 { 1391 if (mask) 1392 vmx_combine_xor_u_mask (dest, src, mask, width); 1393 else 1394 vmx_combine_xor_u_no_mask (dest, src, width); 1395 } 1396 1397 static void 1398 vmx_combine_add_u_no_mask (uint32_t *dest, const uint32_t *src, int width) 1399 { 1400 vector unsigned char vdest, vsrc; 1401 DECLARE_SRC_MASK_VAR; 1402 1403 while (width && ((uintptr_t)dest & 15)) 1404 { 1405 uint32_t s = *src++; 1406 uint32_t d = *dest; 1407 1408 UN8x4_ADD_UN8x4 (d, s); 1409 1410 *dest++ = d; 1411 width--; 1412 } 1413 1414 COMPUTE_SHIFT_MASKS (dest, src); 1415 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1416 for (int i = width / 4; i > 0; i--) 1417 { 1418 LOAD_VECTORS (dest, src); 1419 1420 vdest = pix_add (vsrc, vdest); 1421 1422 STORE_VECTOR (dest); 1423 1424 src += 4; 1425 dest += 4; 1426 } 1427 1428 for (int i = width % 4; --i >= 0;) 1429 { 1430 uint32_t s = src[i]; 1431 uint32_t d = dest[i]; 1432 1433 UN8x4_ADD_UN8x4 (d, s); 1434 1435 dest[i] = d; 1436 } 1437 } 1438 1439 static void 1440 vmx_combine_add_u_mask (uint32_t *dest, 1441 const uint32_t *src, 1442 const uint32_t *mask, 1443 int width) 1444 { 1445 vector unsigned char vdest, vsrc, vmask; 1446 DECLARE_SRC_MASK_VAR; 1447 DECLARE_MASK_MASK_VAR; 1448 1449 while (width && ((uintptr_t)dest & 15)) 1450 { 1451 uint32_t m = ALPHA_8 (*mask++); 1452 uint32_t s = *src++; 1453 uint32_t d = *dest; 1454 1455 UN8x4_MUL_UN8 (s, m); 1456 UN8x4_ADD_UN8x4 (d, s); 1457 1458 *dest++ = d; 1459 width--; 1460 } 1461 1462 COMPUTE_SHIFT_MASKC (dest, src, mask); 1463 1464 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1465 for (int i = width / 4; i > 0; i--) 1466 { 1467 LOAD_VECTORSM (dest, src, mask); 1468 1469 vdest = pix_add (vsrc, vdest); 1470 1471 STORE_VECTOR (dest); 1472 1473 src += 4; 1474 dest += 4; 1475 mask += 4; 1476 } 1477 1478 for (int i = width % 4; --i >= 0;) 1479 { 1480 uint32_t m = ALPHA_8 (mask[i]); 1481 uint32_t s = src[i]; 1482 uint32_t d = dest[i]; 1483 1484 UN8x4_MUL_UN8 (s, m); 1485 UN8x4_ADD_UN8x4 (d, s); 1486 1487 dest[i] = d; 1488 } 1489 } 1490 1491 static void 1492 vmx_combine_add_u (pixman_implementation_t *imp, 1493 pixman_op_t op, 1494 uint32_t *dest, 1495 const uint32_t *src, 1496 const uint32_t *mask, 1497 int width) 1498 { 1499 if (mask) 1500 vmx_combine_add_u_mask (dest, src, mask, width); 1501 else 1502 vmx_combine_add_u_no_mask (dest, src, width); 1503 } 1504 1505 static void 1506 vmx_combine_src_ca (pixman_implementation_t *imp, 1507 pixman_op_t op, 1508 uint32_t *dest, 1509 const uint32_t *src, 1510 const uint32_t *mask, 1511 int width) 1512 { 1513 vector unsigned char vdest, vsrc, vmask; 1514 DECLARE_SRC_MASK_VAR; 1515 DECLARE_MASK_MASK_VAR; 1516 1517 while (width && ((uintptr_t)dest & 15)) 1518 { 1519 uint32_t a = *mask++; 1520 uint32_t s = *src++; 1521 1522 UN8x4_MUL_UN8x4 (s, a); 1523 1524 *dest++ = s; 1525 width--; 1526 } 1527 1528 COMPUTE_SHIFT_MASKC (dest, src, mask); 1529 1530 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1531 for (int i = width / 4; i > 0; i--) 1532 { 1533 LOAD_VECTORSC (dest, src, mask); 1534 1535 vdest = pix_multiply (vsrc, vmask); 1536 1537 STORE_VECTOR (dest); 1538 1539 mask += 4; 1540 src += 4; 1541 dest += 4; 1542 } 1543 1544 for (int i = width % 4; --i >= 0;) 1545 { 1546 uint32_t a = mask[i]; 1547 uint32_t s = src[i]; 1548 1549 UN8x4_MUL_UN8x4 (s, a); 1550 1551 dest[i] = s; 1552 } 1553 } 1554 1555 static void 1556 vmx_combine_over_ca (pixman_implementation_t *imp, 1557 pixman_op_t op, 1558 uint32_t *dest, 1559 const uint32_t *src, 1560 const uint32_t *mask, 1561 int width) 1562 { 1563 vector unsigned char vdest, vsrc, vmask; 1564 DECLARE_SRC_MASK_VAR; 1565 DECLARE_MASK_MASK_VAR; 1566 1567 while (width && ((uintptr_t)dest & 15)) 1568 { 1569 uint32_t a = *mask++; 1570 uint32_t s = *src++; 1571 uint32_t d = *dest; 1572 uint32_t sa = ALPHA_8 (s); 1573 1574 UN8x4_MUL_UN8x4 (s, a); 1575 UN8x4_MUL_UN8 (a, sa); 1576 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s); 1577 1578 *dest++ = d; 1579 width--; 1580 } 1581 1582 COMPUTE_SHIFT_MASKC (dest, src, mask); 1583 1584 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1585 for (int i = width / 4; i > 0; i--) 1586 { 1587 LOAD_VECTORSC (dest, src, mask); 1588 1589 vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest); 1590 1591 STORE_VECTOR (dest); 1592 1593 mask += 4; 1594 src += 4; 1595 dest += 4; 1596 } 1597 1598 for (int i = width % 4; --i >= 0;) 1599 { 1600 uint32_t a = mask[i]; 1601 uint32_t s = src[i]; 1602 uint32_t d = dest[i]; 1603 uint32_t sa = ALPHA_8 (s); 1604 1605 UN8x4_MUL_UN8x4 (s, a); 1606 UN8x4_MUL_UN8 (a, sa); 1607 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s); 1608 1609 dest[i] = d; 1610 } 1611 } 1612 1613 static void 1614 vmx_combine_over_reverse_ca (pixman_implementation_t *imp, 1615 pixman_op_t op, 1616 uint32_t *dest, 1617 const uint32_t *src, 1618 const uint32_t *mask, 1619 int width) 1620 { 1621 vector unsigned char vdest, vsrc, vmask; 1622 DECLARE_SRC_MASK_VAR; 1623 DECLARE_MASK_MASK_VAR; 1624 1625 while (width && ((uintptr_t)dest & 15)) 1626 { 1627 uint32_t a = *mask++; 1628 uint32_t s = *src++; 1629 uint32_t d = *dest; 1630 uint32_t ida = ALPHA_8 (~d); 1631 1632 UN8x4_MUL_UN8x4 (s, a); 1633 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d); 1634 1635 *dest++ = s; 1636 width--; 1637 } 1638 1639 COMPUTE_SHIFT_MASKC (dest, src, mask); 1640 1641 /* printf("%s\n",__PRETTY_FUNCTION__); */ 1642 for (int i = width / 4; i > 0; i--) 1643 { 1644 LOAD_VECTORSC (dest, src, mask); 1645 1646 vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask)); 1647 1648 STORE_VECTOR (dest); 1649 1650 mask += 4; 1651 src += 4; 1652 dest += 4; 1653 } 1654 1655 for (int i = width % 4; --i >= 0;) 1656 { 1657 uint32_t a = mask[i]; 1658 uint32_t s = src[i]; 1659 uint32_t d = dest[i]; 1660 uint32_t ida = ALPHA_8 (~d); 1661 1662 UN8x4_MUL_UN8x4 (s, a); 1663 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d); 1664 1665 dest[i] = s; 1666 } 1667 } 1668 1669 static void 1670 vmx_combine_in_ca (pixman_implementation_t *imp, 1671 pixman_op_t op, 1672 uint32_t *dest, 1673 const uint32_t *src, 1674 const uint32_t *mask, 1675 int width) 1676 { 1677 vector unsigned char vdest, vsrc, vmask; 1678 DECLARE_SRC_MASK_VAR; 1679 DECLARE_MASK_MASK_VAR; 1680 1681 while (width && ((uintptr_t)dest & 15)) 1682 { 1683 uint32_t a = *mask++; 1684 uint32_t s = *src++; 1685 uint32_t da = ALPHA_8 (*dest); 1686 1687 UN8x4_MUL_UN8x4 (s, a); 1688 UN8x4_MUL_UN8 (s, da); 1689 1690 *dest++ = s; 1691 width--; 1692 } 1693 1694 COMPUTE_SHIFT_MASKC (dest, src, mask); 1695 1696 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1697 for (int i = width / 4; i > 0; i--) 1698 { 1699 LOAD_VECTORSC (dest, src, mask); 1700 1701 vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); 1702 1703 STORE_VECTOR (dest); 1704 1705 src += 4; 1706 dest += 4; 1707 mask += 4; 1708 } 1709 1710 for (int i = width % 4; --i >= 0;) 1711 { 1712 uint32_t a = mask[i]; 1713 uint32_t s = src[i]; 1714 uint32_t da = ALPHA_8 (dest[i]); 1715 1716 UN8x4_MUL_UN8x4 (s, a); 1717 UN8x4_MUL_UN8 (s, da); 1718 1719 dest[i] = s; 1720 } 1721 } 1722 1723 static void 1724 vmx_combine_in_reverse_ca (pixman_implementation_t *imp, 1725 pixman_op_t op, 1726 uint32_t *dest, 1727 const uint32_t *src, 1728 const uint32_t *mask, 1729 int width) 1730 { 1731 vector unsigned char vdest, vsrc, vmask; 1732 DECLARE_SRC_MASK_VAR; 1733 DECLARE_MASK_MASK_VAR; 1734 1735 while (width && ((uintptr_t)dest & 15)) 1736 { 1737 uint32_t a = *mask++; 1738 uint32_t d = *dest; 1739 uint32_t sa = ALPHA_8 (*src++); 1740 1741 UN8x4_MUL_UN8 (a, sa); 1742 UN8x4_MUL_UN8x4 (d, a); 1743 1744 *dest++ = d; 1745 width--; 1746 } 1747 1748 COMPUTE_SHIFT_MASKC (dest, src, mask); 1749 1750 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1751 for (int i = width / 4; i > 0; i--) 1752 { 1753 1754 LOAD_VECTORSC (dest, src, mask); 1755 1756 vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc))); 1757 1758 STORE_VECTOR (dest); 1759 1760 src += 4; 1761 dest += 4; 1762 mask += 4; 1763 } 1764 1765 for (int i = width % 4; --i >= 0;) 1766 { 1767 uint32_t a = mask[i]; 1768 uint32_t d = dest[i]; 1769 uint32_t sa = ALPHA_8 (src[i]); 1770 1771 UN8x4_MUL_UN8 (a, sa); 1772 UN8x4_MUL_UN8x4 (d, a); 1773 1774 dest[i] = d; 1775 } 1776 } 1777 1778 static void 1779 vmx_combine_out_ca (pixman_implementation_t *imp, 1780 pixman_op_t op, 1781 uint32_t *dest, 1782 const uint32_t *src, 1783 const uint32_t *mask, 1784 int width) 1785 { 1786 vector unsigned char vdest, vsrc, vmask; 1787 DECLARE_SRC_MASK_VAR; 1788 DECLARE_MASK_MASK_VAR; 1789 1790 while (width && ((uintptr_t)dest & 15)) 1791 { 1792 uint32_t a = *mask++; 1793 uint32_t s = *src++; 1794 uint32_t d = *dest; 1795 uint32_t da = ALPHA_8 (~d); 1796 1797 UN8x4_MUL_UN8x4 (s, a); 1798 UN8x4_MUL_UN8 (s, da); 1799 1800 *dest++ = s; 1801 width--; 1802 } 1803 1804 COMPUTE_SHIFT_MASKC (dest, src, mask); 1805 1806 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1807 for (int i = width / 4; i > 0; i--) 1808 { 1809 LOAD_VECTORSC (dest, src, mask); 1810 1811 vdest = pix_multiply (pix_multiply (vsrc, vmask), 1812 splat_alpha (negate (vdest))); 1813 1814 STORE_VECTOR (dest); 1815 1816 src += 4; 1817 dest += 4; 1818 mask += 4; 1819 } 1820 1821 for (int i = width % 4; --i >= 0;) 1822 { 1823 uint32_t a = mask[i]; 1824 uint32_t s = src[i]; 1825 uint32_t d = dest[i]; 1826 uint32_t da = ALPHA_8 (~d); 1827 1828 UN8x4_MUL_UN8x4 (s, a); 1829 UN8x4_MUL_UN8 (s, da); 1830 1831 dest[i] = s; 1832 } 1833 } 1834 1835 static void 1836 vmx_combine_out_reverse_ca (pixman_implementation_t *imp, 1837 pixman_op_t op, 1838 uint32_t *dest, 1839 const uint32_t *src, 1840 const uint32_t *mask, 1841 int width) 1842 { 1843 vector unsigned char vdest, vsrc, vmask; 1844 DECLARE_SRC_MASK_VAR; 1845 DECLARE_MASK_MASK_VAR; 1846 1847 while (width && ((uintptr_t)dest & 15)) 1848 { 1849 uint32_t a = *mask++; 1850 uint32_t s = *src++; 1851 uint32_t d = *dest; 1852 uint32_t sa = ALPHA_8 (s); 1853 1854 UN8x4_MUL_UN8 (a, sa); 1855 UN8x4_MUL_UN8x4 (d, ~a); 1856 1857 *dest++ = d; 1858 width--; 1859 } 1860 1861 COMPUTE_SHIFT_MASKC (dest, src, mask); 1862 1863 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1864 for (int i = width / 4; i > 0; i--) 1865 { 1866 LOAD_VECTORSC (dest, src, mask); 1867 1868 vdest = pix_multiply ( 1869 vdest, negate (pix_multiply (vmask, splat_alpha (vsrc)))); 1870 1871 STORE_VECTOR (dest); 1872 1873 src += 4; 1874 dest += 4; 1875 mask += 4; 1876 } 1877 1878 for (int i = width % 4; --i >= 0;) 1879 { 1880 uint32_t a = mask[i]; 1881 uint32_t s = src[i]; 1882 uint32_t d = dest[i]; 1883 uint32_t sa = ALPHA_8 (s); 1884 1885 UN8x4_MUL_UN8 (a, sa); 1886 UN8x4_MUL_UN8x4 (d, ~a); 1887 1888 dest[i] = d; 1889 } 1890 } 1891 1892 static void 1893 vmx_combine_atop_ca (pixman_implementation_t *imp, 1894 pixman_op_t op, 1895 uint32_t *dest, 1896 const uint32_t *src, 1897 const uint32_t *mask, 1898 int width) 1899 { 1900 vector unsigned char vdest, vsrc, vmask, vsrca; 1901 DECLARE_SRC_MASK_VAR; 1902 DECLARE_MASK_MASK_VAR; 1903 1904 while (width && ((uintptr_t)dest & 15)) 1905 { 1906 uint32_t a = *mask++; 1907 uint32_t s = *src++; 1908 uint32_t d = *dest; 1909 uint32_t sa = ALPHA_8 (s); 1910 uint32_t da = ALPHA_8 (d); 1911 1912 UN8x4_MUL_UN8x4 (s, a); 1913 UN8x4_MUL_UN8 (a, sa); 1914 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); 1915 1916 *dest++ = d; 1917 width--; 1918 } 1919 1920 COMPUTE_SHIFT_MASKC (dest, src, mask); 1921 1922 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1923 for (int i = width / 4; i > 0; i--) 1924 { 1925 LOAD_VECTORSC (dest, src, mask); 1926 1927 vsrca = splat_alpha (vsrc); 1928 1929 vsrc = pix_multiply (vsrc, vmask); 1930 vmask = pix_multiply (vmask, vsrca); 1931 1932 vdest = pix_add_mul (vsrc, splat_alpha (vdest), negate (vmask), vdest); 1933 1934 STORE_VECTOR (dest); 1935 1936 src += 4; 1937 dest += 4; 1938 mask += 4; 1939 } 1940 1941 for (int i = width % 4; --i >= 0;) 1942 { 1943 uint32_t a = mask[i]; 1944 uint32_t s = src[i]; 1945 uint32_t d = dest[i]; 1946 uint32_t sa = ALPHA_8 (s); 1947 uint32_t da = ALPHA_8 (d); 1948 1949 UN8x4_MUL_UN8x4 (s, a); 1950 UN8x4_MUL_UN8 (a, sa); 1951 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); 1952 1953 dest[i] = d; 1954 } 1955 } 1956 1957 static void 1958 vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, 1959 pixman_op_t op, 1960 uint32_t *dest, 1961 const uint32_t *src, 1962 const uint32_t *mask, 1963 int width) 1964 { 1965 vector unsigned char vdest, vsrc, vmask; 1966 DECLARE_SRC_MASK_VAR; 1967 DECLARE_MASK_MASK_VAR; 1968 1969 while (width && ((uintptr_t)dest & 15)) 1970 { 1971 uint32_t a = *mask++; 1972 uint32_t s = *src++; 1973 uint32_t d = *dest; 1974 uint32_t sa = ALPHA_8 (s); 1975 uint32_t da = ALPHA_8 (~d); 1976 1977 UN8x4_MUL_UN8x4 (s, a); 1978 UN8x4_MUL_UN8 (a, sa); 1979 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da); 1980 1981 *dest++ = d; 1982 width--; 1983 } 1984 1985 COMPUTE_SHIFT_MASKC (dest, src, mask); 1986 1987 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 1988 for (int i = width / 4; i > 0; i--) 1989 { 1990 LOAD_VECTORSC (dest, src, mask); 1991 1992 vdest = pix_add_mul (vdest, pix_multiply (vmask, splat_alpha (vsrc)), 1993 pix_multiply (vsrc, vmask), 1994 negate (splat_alpha (vdest))); 1995 1996 STORE_VECTOR (dest); 1997 1998 src += 4; 1999 dest += 4; 2000 mask += 4; 2001 } 2002 2003 for (int i = width % 4; --i >= 0;) 2004 { 2005 uint32_t a = mask[i]; 2006 uint32_t s = src[i]; 2007 uint32_t d = dest[i]; 2008 uint32_t sa = ALPHA_8 (s); 2009 uint32_t da = ALPHA_8 (~d); 2010 2011 UN8x4_MUL_UN8x4 (s, a); 2012 UN8x4_MUL_UN8 (a, sa); 2013 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da); 2014 2015 dest[i] = d; 2016 } 2017 } 2018 2019 static void 2020 vmx_combine_xor_ca (pixman_implementation_t *imp, 2021 pixman_op_t op, 2022 uint32_t *dest, 2023 const uint32_t *src, 2024 const uint32_t *mask, 2025 int width) 2026 { 2027 vector unsigned char vdest, vsrc, vmask; 2028 DECLARE_SRC_MASK_VAR; 2029 DECLARE_MASK_MASK_VAR; 2030 2031 while (width && ((uintptr_t)dest & 15)) 2032 { 2033 uint32_t a = *mask++; 2034 uint32_t s = *src++; 2035 uint32_t d = *dest; 2036 uint32_t sa = ALPHA_8 (s); 2037 uint32_t da = ALPHA_8 (~d); 2038 2039 UN8x4_MUL_UN8x4 (s, a); 2040 UN8x4_MUL_UN8 (a, sa); 2041 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); 2042 2043 *dest++ = d; 2044 width--; 2045 } 2046 2047 COMPUTE_SHIFT_MASKC (dest, src, mask); 2048 2049 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 2050 for (int i = width / 4; i > 0; i--) 2051 { 2052 LOAD_VECTORSC (dest, src, mask); 2053 2054 vdest = pix_add_mul ( 2055 vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))), 2056 pix_multiply (vsrc, vmask), negate (splat_alpha (vdest))); 2057 2058 STORE_VECTOR (dest); 2059 2060 src += 4; 2061 dest += 4; 2062 mask += 4; 2063 } 2064 2065 for (int i = width % 4; --i >= 0;) 2066 { 2067 uint32_t a = mask[i]; 2068 uint32_t s = src[i]; 2069 uint32_t d = dest[i]; 2070 uint32_t sa = ALPHA_8 (s); 2071 uint32_t da = ALPHA_8 (~d); 2072 2073 UN8x4_MUL_UN8x4 (s, a); 2074 UN8x4_MUL_UN8 (a, sa); 2075 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); 2076 2077 dest[i] = d; 2078 } 2079 } 2080 2081 static void 2082 vmx_combine_add_ca (pixman_implementation_t *imp, 2083 pixman_op_t op, 2084 uint32_t *dest, 2085 const uint32_t *src, 2086 const uint32_t *mask, 2087 int width) 2088 { 2089 vector unsigned char vdest, vsrc, vmask; 2090 DECLARE_SRC_MASK_VAR; 2091 DECLARE_MASK_MASK_VAR; 2092 2093 while (width && ((uintptr_t)dest & 15)) 2094 { 2095 uint32_t a = *mask++; 2096 uint32_t s = *src++; 2097 uint32_t d = *dest; 2098 2099 UN8x4_MUL_UN8x4 (s, a); 2100 UN8x4_ADD_UN8x4 (s, d); 2101 2102 *dest++ = s; 2103 width--; 2104 } 2105 2106 COMPUTE_SHIFT_MASKC (dest, src, mask); 2107 2108 /* printf ("%s\n",__PRETTY_FUNCTION__); */ 2109 for (int i = width / 4; i > 0; i--) 2110 { 2111 LOAD_VECTORSC (dest, src, mask); 2112 2113 vdest = pix_add (pix_multiply (vsrc, vmask), vdest); 2114 2115 STORE_VECTOR (dest); 2116 2117 src += 4; 2118 dest += 4; 2119 mask += 4; 2120 } 2121 2122 for (int i = width % 4; --i >= 0;) 2123 { 2124 uint32_t a = mask[i]; 2125 uint32_t s = src[i]; 2126 uint32_t d = dest[i]; 2127 2128 UN8x4_MUL_UN8x4 (s, a); 2129 UN8x4_ADD_UN8x4 (s, d); 2130 2131 dest[i] = s; 2132 } 2133 } 2134 2135 static void 2136 vmx_composite_over_n_8_8888 (pixman_implementation_t *imp, 2137 pixman_composite_info_t *info) 2138 { 2139 PIXMAN_COMPOSITE_ARGS (info); 2140 uint32_t src, srca; 2141 uint32_t *dst_line, *dst; 2142 uint8_t *mask_line; 2143 int dst_stride, mask_stride; 2144 int32_t w; 2145 uint32_t m, d, s, ia; 2146 2147 vector unsigned char vsrc, valpha, vmask, vdst; 2148 2149 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2150 2151 srca = ALPHA_8 (src); 2152 if (src == 0) 2153 return; 2154 2155 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2156 dst_line, 1); 2157 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, 2158 mask_line, 1); 2159 2160 vsrc = (vector unsigned char)create_mask_32_128 (src); 2161 valpha = splat_alpha (vsrc); 2162 2163 while (height--) 2164 { 2165 const uint8_t *pm = mask_line; 2166 dst = dst_line; 2167 dst_line += dst_stride; 2168 mask_line += mask_stride; 2169 w = width; 2170 2171 while (w && (uintptr_t)dst & 15) 2172 { 2173 s = src; 2174 m = *pm++; 2175 2176 if (m) 2177 { 2178 d = *dst; 2179 UN8x4_MUL_UN8 (s, m); 2180 ia = ALPHA_8 (~s); 2181 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); 2182 *dst = d; 2183 } 2184 2185 w--; 2186 dst++; 2187 } 2188 2189 while (w >= 4) 2190 { 2191 m = *((uint32_t *)pm); 2192 2193 if (srca == 0xff && m == 0xffffffff) 2194 { 2195 save_128_aligned (dst, vsrc); 2196 } 2197 else if (m) 2198 { 2199 vmask = splat_pixel ( 2200 (vector unsigned char)create_mask_32_128 (m)); 2201 2202 /* dst is 16-byte aligned */ 2203 vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst)); 2204 2205 save_128_aligned (dst, vdst); 2206 } 2207 2208 w -= 4; 2209 dst += 4; 2210 pm += 4; 2211 } 2212 2213 while (w) 2214 { 2215 s = src; 2216 m = *pm++; 2217 2218 if (m) 2219 { 2220 d = *dst; 2221 UN8x4_MUL_UN8 (s, m); 2222 ia = ALPHA_8 (~s); 2223 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); 2224 *dst = d; 2225 } 2226 2227 w--; 2228 dst++; 2229 } 2230 } 2231 } 2232 2233 static pixman_bool_t 2234 vmx_fill (pixman_implementation_t *imp, 2235 uint32_t *bits, 2236 int stride, 2237 int bpp, 2238 int x, 2239 int y, 2240 int width, 2241 int height, 2242 uint32_t filler) 2243 { 2244 uint32_t byte_width; 2245 uint8_t *byte_line; 2246 2247 vector unsigned int vfiller; 2248 2249 if (bpp == 8) 2250 { 2251 uint8_t b; 2252 uint16_t w; 2253 2254 stride = stride * (int)sizeof (uint32_t) / 1; 2255 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); 2256 byte_width = width; 2257 stride *= 1; 2258 2259 b = filler & 0xff; 2260 w = (b << 8) | b; 2261 filler = (w << 16) | w; 2262 } 2263 else if (bpp == 16) 2264 { 2265 stride = stride * (int)sizeof (uint32_t) / 2; 2266 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); 2267 byte_width = 2 * width; 2268 stride *= 2; 2269 2270 filler = (filler & 0xffff) * 0x00010001; 2271 } 2272 else if (bpp == 32) 2273 { 2274 stride = stride * (int)sizeof (uint32_t) / 4; 2275 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); 2276 byte_width = 4 * width; 2277 stride *= 4; 2278 } 2279 else 2280 { 2281 return FALSE; 2282 } 2283 2284 vfiller = create_mask_32_128 (filler); 2285 2286 while (height--) 2287 { 2288 int w; 2289 uint8_t *d = byte_line; 2290 byte_line += stride; 2291 w = byte_width; 2292 2293 if (w >= 1 && ((uintptr_t)d & 1)) 2294 { 2295 *(uint8_t *)d = filler; 2296 w -= 1; 2297 d += 1; 2298 } 2299 2300 while (w >= 2 && ((uintptr_t)d & 3)) 2301 { 2302 *(uint16_t *)d = filler; 2303 w -= 2; 2304 d += 2; 2305 } 2306 2307 while (w >= 4 && ((uintptr_t)d & 15)) 2308 { 2309 *(uint32_t *)d = filler; 2310 2311 w -= 4; 2312 d += 4; 2313 } 2314 2315 while (w >= 128) 2316 { 2317 vec_st (vfiller, 0, (uint32_t *)d); 2318 vec_st (vfiller, 0, (uint32_t *)d + 4); 2319 vec_st (vfiller, 0, (uint32_t *)d + 8); 2320 vec_st (vfiller, 0, (uint32_t *)d + 12); 2321 vec_st (vfiller, 0, (uint32_t *)d + 16); 2322 vec_st (vfiller, 0, (uint32_t *)d + 20); 2323 vec_st (vfiller, 0, (uint32_t *)d + 24); 2324 vec_st (vfiller, 0, (uint32_t *)d + 28); 2325 2326 d += 128; 2327 w -= 128; 2328 } 2329 2330 if (w >= 64) 2331 { 2332 vec_st (vfiller, 0, (uint32_t *)d); 2333 vec_st (vfiller, 0, (uint32_t *)d + 4); 2334 vec_st (vfiller, 0, (uint32_t *)d + 8); 2335 vec_st (vfiller, 0, (uint32_t *)d + 12); 2336 2337 d += 64; 2338 w -= 64; 2339 } 2340 2341 if (w >= 32) 2342 { 2343 vec_st (vfiller, 0, (uint32_t *)d); 2344 vec_st (vfiller, 0, (uint32_t *)d + 4); 2345 2346 d += 32; 2347 w -= 32; 2348 } 2349 2350 if (w >= 16) 2351 { 2352 vec_st (vfiller, 0, (uint32_t *)d); 2353 2354 d += 16; 2355 w -= 16; 2356 } 2357 2358 while (w >= 4) 2359 { 2360 *(uint32_t *)d = filler; 2361 2362 w -= 4; 2363 d += 4; 2364 } 2365 2366 if (w >= 2) 2367 { 2368 *(uint16_t *)d = filler; 2369 w -= 2; 2370 d += 2; 2371 } 2372 2373 if (w >= 1) 2374 { 2375 *(uint8_t *)d = filler; 2376 w -= 1; 2377 d += 1; 2378 } 2379 } 2380 2381 return TRUE; 2382 } 2383 2384 static void 2385 vmx_composite_src_x888_8888 (pixman_implementation_t *imp, 2386 pixman_composite_info_t *info) 2387 { 2388 PIXMAN_COMPOSITE_ARGS (info); 2389 uint32_t *dst_line, *dst; 2390 uint32_t *src_line, *src; 2391 int32_t w; 2392 int dst_stride, src_stride; 2393 2394 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2395 dst_line, 1); 2396 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, 2397 src_line, 1); 2398 2399 while (height--) 2400 { 2401 dst = dst_line; 2402 dst_line += dst_stride; 2403 src = src_line; 2404 src_line += src_stride; 2405 w = width; 2406 2407 while (w && (uintptr_t)dst & 15) 2408 { 2409 *dst++ = *src++ | 0xff000000; 2410 w--; 2411 } 2412 2413 while (w >= 16) 2414 { 2415 vector unsigned char vmx_src1, vmx_src2, vmx_src3, vmx_src4; 2416 2417 vmx_src1 = load_128_unaligned (src); 2418 vmx_src2 = load_128_unaligned (src + 4); 2419 vmx_src3 = load_128_unaligned (src + 8); 2420 vmx_src4 = load_128_unaligned (src + 12); 2421 2422 save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000)); 2423 save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000)); 2424 save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000)); 2425 save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000)); 2426 2427 dst += 16; 2428 src += 16; 2429 w -= 16; 2430 } 2431 2432 while (w) 2433 { 2434 *dst++ = *src++ | 0xff000000; 2435 w--; 2436 } 2437 } 2438 } 2439 2440 static void 2441 vmx_composite_over_n_8888 (pixman_implementation_t *imp, 2442 pixman_composite_info_t *info) 2443 { 2444 PIXMAN_COMPOSITE_ARGS (info); 2445 uint32_t *dst_line, *dst; 2446 uint32_t src, ia; 2447 int w, dst_stride; 2448 2449 vector unsigned char vdst, vsrc, via; 2450 2451 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2452 2453 if (src == 0) 2454 return; 2455 2456 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2457 dst_line, 1); 2458 2459 vsrc = (vector unsigned char)create_mask_32_128 (src); 2460 via = negate (splat_alpha (vsrc)); 2461 ia = ALPHA_8 (~src); 2462 2463 while (height--) 2464 { 2465 dst = dst_line; 2466 dst_line += dst_stride; 2467 w = width; 2468 2469 while (w && ((uintptr_t)dst & 15)) 2470 { 2471 uint32_t d = *dst; 2472 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src); 2473 *dst++ = d; 2474 w--; 2475 } 2476 2477 for (int i = w / 4; i > 0; i--) 2478 { 2479 vdst = pix_multiply (load_128_aligned (dst), via); 2480 save_128_aligned (dst, pix_add (vsrc, vdst)); 2481 dst += 4; 2482 } 2483 2484 for (int i = w % 4; --i >= 0;) 2485 { 2486 uint32_t d = dst[i]; 2487 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src); 2488 dst[i] = d; 2489 } 2490 } 2491 } 2492 2493 static void 2494 vmx_composite_over_8888_8888 (pixman_implementation_t *imp, 2495 pixman_composite_info_t *info) 2496 { 2497 PIXMAN_COMPOSITE_ARGS (info); 2498 int dst_stride, src_stride; 2499 uint32_t *dst_line, *dst; 2500 uint32_t *src_line, *src; 2501 2502 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2503 dst_line, 1); 2504 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, 2505 src_line, 1); 2506 2507 dst = dst_line; 2508 src = src_line; 2509 2510 while (height--) 2511 { 2512 vmx_combine_over_u (imp, op, dst, src, NULL, width); 2513 2514 dst += dst_stride; 2515 src += src_stride; 2516 } 2517 } 2518 2519 static void 2520 vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, 2521 pixman_composite_info_t *info) 2522 { 2523 PIXMAN_COMPOSITE_ARGS (info); 2524 uint32_t src, ia; 2525 uint32_t *dst_line, d; 2526 uint32_t *mask_line, m; 2527 uint32_t pack_cmp; 2528 int dst_stride, mask_stride; 2529 2530 vector unsigned char vsrc, valpha, vmask, vdest; 2531 2532 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2533 2534 if (src == 0) 2535 return; 2536 2537 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2538 dst_line, 1); 2539 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, 2540 mask_line, 1); 2541 2542 vsrc = (vector unsigned char)create_mask_32_128 (src); 2543 valpha = splat_alpha (vsrc); 2544 ia = ALPHA_8 (src); 2545 2546 while (height--) 2547 { 2548 int w = width; 2549 const uint32_t *pm = (uint32_t *)mask_line; 2550 uint32_t *pd = (uint32_t *)dst_line; 2551 uint32_t s; 2552 2553 dst_line += dst_stride; 2554 mask_line += mask_stride; 2555 2556 while (w && (uintptr_t)pd & 15) 2557 { 2558 s = src; 2559 m = *pm++; 2560 2561 if (m) 2562 { 2563 d = *pd; 2564 UN8x4_MUL_UN8x4 (s, m); 2565 UN8x4_MUL_UN8 (m, ia); 2566 m = ~m; 2567 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s); 2568 *pd = d; 2569 } 2570 2571 pd++; 2572 w--; 2573 } 2574 2575 while (w >= 4) 2576 { 2577 /* pm is NOT necessarily 16-byte aligned */ 2578 vmask = load_128_unaligned (pm); 2579 2580 pack_cmp = vec_all_eq (vmask, vzero); 2581 2582 /* if all bits in mask are zero, pack_cmp is not 0 */ 2583 if (pack_cmp == 0) 2584 { 2585 /* pd is 16-byte aligned */ 2586 vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd)); 2587 2588 save_128_aligned (pd, vdest); 2589 } 2590 2591 pd += 4; 2592 pm += 4; 2593 w -= 4; 2594 } 2595 2596 while (w) 2597 { 2598 s = src; 2599 m = *pm++; 2600 2601 if (m) 2602 { 2603 d = *pd; 2604 UN8x4_MUL_UN8x4 (s, m); 2605 UN8x4_MUL_UN8 (m, ia); 2606 m = ~m; 2607 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s); 2608 *pd = d; 2609 } 2610 2611 pd++; 2612 w--; 2613 } 2614 } 2615 } 2616 2617 static void 2618 vmx_composite_add_8_8 (pixman_implementation_t *imp, 2619 pixman_composite_info_t *info) 2620 { 2621 PIXMAN_COMPOSITE_ARGS (info); 2622 uint8_t *dst_line, *dst; 2623 uint8_t *src_line, *src; 2624 int dst_stride, src_stride; 2625 int32_t w; 2626 uint16_t t; 2627 2628 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, 2629 src_line, 1); 2630 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, 2631 dst_line, 1); 2632 2633 while (height--) 2634 { 2635 dst = dst_line; 2636 src = src_line; 2637 2638 dst_line += dst_stride; 2639 src_line += src_stride; 2640 w = width; 2641 2642 /* Small head */ 2643 while (w && (uintptr_t)dst & 3) 2644 { 2645 t = (*dst) + (*src++); 2646 *dst++ = t | (0 - (t >> 8)); 2647 w--; 2648 } 2649 2650 vmx_combine_add_u (imp, op, (uint32_t *)dst, (uint32_t *)src, NULL, 2651 w >> 2); 2652 2653 /* Small tail */ 2654 dst += w & 0xfffc; 2655 src += w & 0xfffc; 2656 2657 w &= 3; 2658 2659 while (w) 2660 { 2661 t = (*dst) + (*src++); 2662 *dst++ = t | (0 - (t >> 8)); 2663 w--; 2664 } 2665 } 2666 } 2667 2668 static void 2669 vmx_composite_add_8888_8888 (pixman_implementation_t *imp, 2670 pixman_composite_info_t *info) 2671 { 2672 PIXMAN_COMPOSITE_ARGS (info); 2673 uint32_t *dst_line, *dst; 2674 uint32_t *src_line, *src; 2675 int dst_stride, src_stride; 2676 2677 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, 2678 src_line, 1); 2679 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2680 dst_line, 1); 2681 2682 while (height--) 2683 { 2684 dst = dst_line; 2685 dst_line += dst_stride; 2686 src = src_line; 2687 src_line += src_stride; 2688 2689 vmx_combine_add_u (imp, op, dst, src, NULL, width); 2690 } 2691 } 2692 2693 static force_inline void 2694 scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t *pd, 2695 const uint32_t *ps, 2696 int32_t w, 2697 pixman_fixed_t vx, 2698 pixman_fixed_t unit_x, 2699 pixman_fixed_t src_width_fixed, 2700 pixman_bool_t fully_transparent_src) 2701 { 2702 uint32_t s, d; 2703 const uint32_t *pm = NULL; 2704 2705 vector unsigned char vsrc, vdst; 2706 2707 if (fully_transparent_src) 2708 return; 2709 2710 /* Align dst on a 16-byte boundary */ 2711 while (w && ((uintptr_t)pd & 15)) 2712 { 2713 d = *pd; 2714 s = combine1 (ps + pixman_fixed_to_int (vx), pm); 2715 vx += unit_x; 2716 while (vx >= 0) 2717 vx -= src_width_fixed; 2718 2719 *pd++ = core_combine_over_u_pixel_vmx (s, d); 2720 if (pm) 2721 pm++; 2722 w--; 2723 } 2724 2725 while (w >= 4) 2726 { 2727 uint32_t tmp[4]; 2728 2729 tmp[0] = *(ps + pixman_fixed_to_int (vx)); 2730 vx += unit_x; 2731 while (vx >= 0) 2732 vx -= src_width_fixed; 2733 tmp[1] = *(ps + pixman_fixed_to_int (vx)); 2734 vx += unit_x; 2735 while (vx >= 0) 2736 vx -= src_width_fixed; 2737 tmp[2] = *(ps + pixman_fixed_to_int (vx)); 2738 vx += unit_x; 2739 while (vx >= 0) 2740 vx -= src_width_fixed; 2741 tmp[3] = *(ps + pixman_fixed_to_int (vx)); 2742 vx += unit_x; 2743 while (vx >= 0) 2744 vx -= src_width_fixed; 2745 2746 vsrc = combine4 (tmp, pm); 2747 2748 if (is_opaque (vsrc)) 2749 { 2750 save_128_aligned (pd, vsrc); 2751 } 2752 else if (!is_zero (vsrc)) 2753 { 2754 vdst = over (vsrc, splat_alpha (vsrc), load_128_aligned (pd)); 2755 2756 save_128_aligned (pd, vdst); 2757 } 2758 2759 w -= 4; 2760 pd += 4; 2761 if (pm) 2762 pm += 4; 2763 } 2764 2765 while (w) 2766 { 2767 d = *pd; 2768 s = combine1 (ps + pixman_fixed_to_int (vx), pm); 2769 vx += unit_x; 2770 while (vx >= 0) 2771 vx -= src_width_fixed; 2772 2773 *pd++ = core_combine_over_u_pixel_vmx (s, d); 2774 if (pm) 2775 pm++; 2776 2777 w--; 2778 } 2779 } 2780 2781 /* clang-format off */ 2782 FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER, 2783 scaled_nearest_scanline_vmx_8888_8888_OVER, 2784 uint32_t, uint32_t, COVER) 2785 FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER, 2786 scaled_nearest_scanline_vmx_8888_8888_OVER, 2787 uint32_t, uint32_t, NONE) 2788 FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER, 2789 scaled_nearest_scanline_vmx_8888_8888_OVER, 2790 uint32_t, uint32_t, PAD) 2791 FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER, 2792 scaled_nearest_scanline_vmx_8888_8888_OVER, 2793 uint32_t, uint32_t, NORMAL) 2794 2795 static const pixman_fast_path_t vmx_fast_paths[] = 2796 { 2797 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888), 2798 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888), 2799 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888), 2800 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888), 2801 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888), 2802 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888), 2803 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888), 2804 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888), 2805 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888), 2806 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888), 2807 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca), 2808 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca), 2809 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca), 2810 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca), 2811 2812 /* PIXMAN_OP_ADD */ 2813 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8), 2814 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888), 2815 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888), 2816 2817 /* PIXMAN_OP_SRC */ 2818 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888), 2819 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888), 2820 2821 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888), 2822 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888), 2823 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888), 2824 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888), 2825 2826 { PIXMAN_OP_NONE }, 2827 }; 2828 /* clang-format on */ 2829 2830 static uint32_t * 2831 vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) 2832 { 2833 int w = iter->width; 2834 uint32_t *dst = iter->buffer; 2835 uint32_t *src = (uint32_t *)iter->bits; 2836 2837 iter->bits += iter->stride; 2838 2839 while (w && ((uintptr_t)dst) & 0x0f) 2840 { 2841 *dst++ = (*src++) | 0xff000000; 2842 w--; 2843 } 2844 2845 while (w >= 4) 2846 { 2847 save_128_aligned (dst, 2848 vec_or (load_128_unaligned (src), mask_ff000000)); 2849 2850 dst += 4; 2851 src += 4; 2852 w -= 4; 2853 } 2854 2855 while (w) 2856 { 2857 *dst++ = (*src++) | 0xff000000; 2858 w--; 2859 } 2860 2861 return iter->buffer; 2862 } 2863 2864 static uint32_t * 2865 vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) 2866 { 2867 int w = iter->width; 2868 uint32_t *dst = iter->buffer; 2869 uint8_t *src = iter->bits; 2870 2871 vector unsigned char vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6; 2872 2873 iter->bits += iter->stride; 2874 2875 while (w && (((uintptr_t)dst) & 15)) 2876 { 2877 *dst++ = *(src++) << 24; 2878 w--; 2879 } 2880 2881 while (w >= 16) 2882 { 2883 vmx0 = load_128_unaligned ((uint32_t *)src); 2884 2885 unpack_128_2x128 (vzero, vmx0, &vmx1, &vmx2); 2886 unpack_128_2x128 (vzero, vmx1, &vmx3, &vmx4); 2887 unpack_128_2x128 (vzero, vmx2, &vmx5, &vmx6); 2888 2889 save_128_aligned (dst, vmx6); 2890 save_128_aligned ((dst + 4), vmx5); 2891 save_128_aligned ((dst + 8), vmx4); 2892 save_128_aligned ((dst + 12), vmx3); 2893 2894 dst += 16; 2895 src += 16; 2896 w -= 16; 2897 } 2898 2899 while (w) 2900 { 2901 *dst++ = *(src++) << 24; 2902 w--; 2903 } 2904 2905 return iter->buffer; 2906 } 2907 2908 #define IMAGE_FLAGS \ 2909 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ 2910 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) 2911 2912 /* clang-format off */ 2913 static const pixman_iter_info_t vmx_iters[] = 2914 { 2915 { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, 2916 _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL 2917 }, 2918 { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW, 2919 _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL 2920 }, 2921 { PIXMAN_null }, 2922 }; 2923 /* clang-format on */ 2924 2925 pixman_implementation_t * 2926 _pixman_implementation_create_vmx (pixman_implementation_t *fallback) 2927 { 2928 pixman_implementation_t *imp = _pixman_implementation_create ( 2929 fallback, vmx_fast_paths); 2930 2931 /* VMX constants */ 2932 mask_ff000000 = (vector unsigned char)create_mask_32_128 (0xff000000); 2933 2934 /* Set up function pointers */ 2935 2936 imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u; 2937 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u; 2938 imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u; 2939 imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u; 2940 imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u; 2941 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u; 2942 imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u; 2943 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u; 2944 imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u; 2945 2946 imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u; 2947 2948 imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca; 2949 imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca; 2950 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca; 2951 imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca; 2952 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca; 2953 imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca; 2954 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca; 2955 imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca; 2956 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca; 2957 imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca; 2958 imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca; 2959 2960 imp->fill = vmx_fill; 2961 2962 imp->iter_info = vmx_iters; 2963 2964 return imp; 2965 }