pixman-sse2.c (162877B)
1 /* 2 * Copyright © 2008 Rodrigo Kumpera 3 * Copyright © 2008 André Tupinambá 4 * 5 * Permission to use, copy, modify, distribute, and sell this software and its 6 * documentation for any purpose is hereby granted without fee, provided that 7 * the above copyright notice appear in all copies and that both that 8 * copyright notice and this permission notice appear in supporting 9 * documentation, and that the name of Red Hat not be used in advertising or 10 * publicity pertaining to distribution of the software without specific, 11 * written prior permission. Red Hat makes no representations about the 12 * suitability of this software for any purpose. It is provided "as is" 13 * without express or implied warranty. 14 * 15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS 16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 22 * SOFTWARE. 
23 * 24 * Author: Rodrigo Kumpera (kumpera@gmail.com) 25 * André Tupinambá (andrelrt@gmail.com) 26 * 27 * Based on work by Owen Taylor and Søren Sandmann 28 */ 29 #ifdef HAVE_CONFIG_H 30 #include <pixman-config.h> 31 #endif 32 33 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */ 34 #define PSHUFD_IS_FAST 0 35 36 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ 37 #include <emmintrin.h> /* for SSE2 intrinsics */ 38 #include "pixman-private.h" 39 #include "pixman-combine32.h" 40 #include "pixman-inlines.h" 41 42 static __m128i mask_0080; 43 static __m128i mask_00ff; 44 static __m128i mask_0101; 45 static __m128i mask_ffff; 46 static __m128i mask_ff000000; 47 static __m128i mask_alpha; 48 49 static __m128i mask_565_r; 50 static __m128i mask_565_g1, mask_565_g2; 51 static __m128i mask_565_b; 52 static __m128i mask_red; 53 static __m128i mask_green; 54 static __m128i mask_blue; 55 56 static __m128i mask_565_fix_rb; 57 static __m128i mask_565_fix_g; 58 59 static __m128i mask_565_rb; 60 static __m128i mask_565_pack_multiplier; 61 62 static force_inline __m128i 63 unpack_32_1x128 (uint32_t data) 64 { 65 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); 66 } 67 68 static force_inline void 69 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) 70 { 71 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); 72 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); 73 } 74 75 static force_inline __m128i 76 unpack_565_to_8888 (__m128i lo) 77 { 78 __m128i r, g, b, rb, t; 79 80 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); 81 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); 82 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); 83 84 rb = _mm_or_si128 (r, b); 85 t = _mm_and_si128 (rb, mask_565_fix_rb); 86 t = _mm_srli_epi32 (t, 5); 87 rb = _mm_or_si128 (rb, t); 88 89 t = _mm_and_si128 (g, mask_565_fix_g); 90 t = _mm_srli_epi32 (t, 6); 91 g = _mm_or_si128 (g, t); 92 93 
return _mm_or_si128 (rb, g); 94 } 95 96 static force_inline void 97 unpack_565_128_4x128 (__m128i data, 98 __m128i* data0, 99 __m128i* data1, 100 __m128i* data2, 101 __m128i* data3) 102 { 103 __m128i lo, hi; 104 105 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); 106 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); 107 108 lo = unpack_565_to_8888 (lo); 109 hi = unpack_565_to_8888 (hi); 110 111 unpack_128_2x128 (lo, data0, data1); 112 unpack_128_2x128 (hi, data2, data3); 113 } 114 115 static force_inline uint16_t 116 pack_565_32_16 (uint32_t pixel) 117 { 118 return (uint16_t) (((pixel >> 8) & 0xf800) | 119 ((pixel >> 5) & 0x07e0) | 120 ((pixel >> 3) & 0x001f)); 121 } 122 123 static force_inline __m128i 124 pack_2x128_128 (__m128i lo, __m128i hi) 125 { 126 return _mm_packus_epi16 (lo, hi); 127 } 128 129 static force_inline __m128i 130 pack_565_2packedx128_128 (__m128i lo, __m128i hi) 131 { 132 __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); 133 __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); 134 135 __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); 136 __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); 137 138 __m128i g0 = _mm_and_si128 (lo, mask_green); 139 __m128i g1 = _mm_and_si128 (hi, mask_green); 140 141 t0 = _mm_or_si128 (t0, g0); 142 t1 = _mm_or_si128 (t1, g1); 143 144 /* Simulates _mm_packus_epi32 */ 145 t0 = _mm_slli_epi32 (t0, 16 - 5); 146 t1 = _mm_slli_epi32 (t1, 16 - 5); 147 t0 = _mm_srai_epi32 (t0, 16); 148 t1 = _mm_srai_epi32 (t1, 16); 149 return _mm_packs_epi32 (t0, t1); 150 } 151 152 static force_inline __m128i 153 pack_565_2x128_128 (__m128i lo, __m128i hi) 154 { 155 __m128i data; 156 __m128i r, g1, g2, b; 157 158 data = pack_2x128_128 (lo, hi); 159 160 r = _mm_and_si128 (data, mask_565_r); 161 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); 162 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); 163 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); 164 165 return _mm_or_si128 (_mm_or_si128 
(_mm_or_si128 (r, g1), g2), b); 166 } 167 168 static force_inline __m128i 169 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) 170 { 171 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), 172 pack_565_2x128_128 (*xmm2, *xmm3)); 173 } 174 175 static force_inline int 176 is_opaque (__m128i x) 177 { 178 __m128i ffs = _mm_cmpeq_epi8 (x, x); 179 180 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; 181 } 182 183 static force_inline int 184 is_zero (__m128i x) 185 { 186 return _mm_movemask_epi8 ( 187 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; 188 } 189 190 static force_inline int 191 is_transparent (__m128i x) 192 { 193 return (_mm_movemask_epi8 ( 194 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; 195 } 196 197 static force_inline __m128i 198 expand_pixel_32_1x128 (uint32_t data) 199 { 200 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); 201 } 202 203 static force_inline __m128i 204 expand_alpha_1x128 (__m128i data) 205 { 206 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, 207 _MM_SHUFFLE (3, 3, 3, 3)), 208 _MM_SHUFFLE (3, 3, 3, 3)); 209 } 210 211 static force_inline void 212 expand_alpha_2x128 (__m128i data_lo, 213 __m128i data_hi, 214 __m128i* alpha_lo, 215 __m128i* alpha_hi) 216 { 217 __m128i lo, hi; 218 219 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); 220 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); 221 222 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); 223 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); 224 } 225 226 static force_inline void 227 expand_alpha_rev_2x128 (__m128i data_lo, 228 __m128i data_hi, 229 __m128i* alpha_lo, 230 __m128i* alpha_hi) 231 { 232 __m128i lo, hi; 233 234 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); 235 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); 236 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); 
237 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); 238 } 239 240 static force_inline void 241 pix_multiply_2x128 (__m128i* data_lo, 242 __m128i* data_hi, 243 __m128i* alpha_lo, 244 __m128i* alpha_hi, 245 __m128i* ret_lo, 246 __m128i* ret_hi) 247 { 248 __m128i lo, hi; 249 250 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); 251 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); 252 lo = _mm_adds_epu16 (lo, mask_0080); 253 hi = _mm_adds_epu16 (hi, mask_0080); 254 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); 255 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); 256 } 257 258 static force_inline void 259 pix_add_multiply_2x128 (__m128i* src_lo, 260 __m128i* src_hi, 261 __m128i* alpha_dst_lo, 262 __m128i* alpha_dst_hi, 263 __m128i* dst_lo, 264 __m128i* dst_hi, 265 __m128i* alpha_src_lo, 266 __m128i* alpha_src_hi, 267 __m128i* ret_lo, 268 __m128i* ret_hi) 269 { 270 __m128i t1_lo, t1_hi; 271 __m128i t2_lo, t2_hi; 272 273 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); 274 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); 275 276 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); 277 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); 278 } 279 280 static force_inline void 281 negate_2x128 (__m128i data_lo, 282 __m128i data_hi, 283 __m128i* neg_lo, 284 __m128i* neg_hi) 285 { 286 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); 287 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); 288 } 289 290 static force_inline void 291 invert_colors_2x128 (__m128i data_lo, 292 __m128i data_hi, 293 __m128i* inv_lo, 294 __m128i* inv_hi) 295 { 296 __m128i lo, hi; 297 298 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); 299 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); 300 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); 301 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); 302 } 303 304 static force_inline void 305 over_2x128 (__m128i* src_lo, 306 __m128i* src_hi, 307 __m128i* alpha_lo, 308 __m128i* 
alpha_hi, 309 __m128i* dst_lo, 310 __m128i* dst_hi) 311 { 312 __m128i t1, t2; 313 314 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); 315 316 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); 317 318 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); 319 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); 320 } 321 322 static force_inline void 323 over_rev_non_pre_2x128 (__m128i src_lo, 324 __m128i src_hi, 325 __m128i* dst_lo, 326 __m128i* dst_hi) 327 { 328 __m128i lo, hi; 329 __m128i alpha_lo, alpha_hi; 330 331 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); 332 333 lo = _mm_or_si128 (alpha_lo, mask_alpha); 334 hi = _mm_or_si128 (alpha_hi, mask_alpha); 335 336 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); 337 338 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); 339 340 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); 341 } 342 343 static force_inline void 344 in_over_2x128 (__m128i* src_lo, 345 __m128i* src_hi, 346 __m128i* alpha_lo, 347 __m128i* alpha_hi, 348 __m128i* mask_lo, 349 __m128i* mask_hi, 350 __m128i* dst_lo, 351 __m128i* dst_hi) 352 { 353 __m128i s_lo, s_hi; 354 __m128i a_lo, a_hi; 355 356 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); 357 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); 358 359 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); 360 } 361 362 /* load 4 pixels from a 16-byte boundary aligned address */ 363 static force_inline __m128i 364 load_128_aligned (__m128i* src) 365 { 366 return _mm_load_si128 (src); 367 } 368 369 /* load 4 pixels from a unaligned address */ 370 static force_inline __m128i 371 load_128_unaligned (const __m128i* src) 372 { 373 return _mm_loadu_si128 (src); 374 } 375 376 /* save 4 pixels on a 16-byte boundary aligned address */ 377 static force_inline void 378 save_128_aligned (__m128i* dst, 379 __m128i data) 380 { 381 _mm_store_si128 (dst, data); 382 } 383 384 static force_inline __m128i 385 load_32_1x128 (uint32_t data) 
386 { 387 return _mm_cvtsi32_si128 (data); 388 } 389 390 static force_inline __m128i 391 expand_alpha_rev_1x128 (__m128i data) 392 { 393 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); 394 } 395 396 static force_inline __m128i 397 expand_pixel_8_1x128 (uint8_t data) 398 { 399 return _mm_shufflelo_epi16 ( 400 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); 401 } 402 403 static force_inline __m128i 404 pix_multiply_1x128 (__m128i data, 405 __m128i alpha) 406 { 407 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), 408 mask_0080), 409 mask_0101); 410 } 411 412 static force_inline __m128i 413 pix_add_multiply_1x128 (__m128i* src, 414 __m128i* alpha_dst, 415 __m128i* dst, 416 __m128i* alpha_src) 417 { 418 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); 419 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); 420 421 return _mm_adds_epu8 (t1, t2); 422 } 423 424 static force_inline __m128i 425 negate_1x128 (__m128i data) 426 { 427 return _mm_xor_si128 (data, mask_00ff); 428 } 429 430 static force_inline __m128i 431 invert_colors_1x128 (__m128i data) 432 { 433 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); 434 } 435 436 static force_inline __m128i 437 over_1x128 (__m128i src, __m128i alpha, __m128i dst) 438 { 439 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); 440 } 441 442 static force_inline __m128i 443 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) 444 { 445 return over_1x128 (pix_multiply_1x128 (*src, *mask), 446 pix_multiply_1x128 (*alpha, *mask), 447 *dst); 448 } 449 450 static force_inline __m128i 451 over_rev_non_pre_1x128 (__m128i src, __m128i dst) 452 { 453 __m128i alpha = expand_alpha_1x128 (src); 454 455 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), 456 _mm_or_si128 (alpha, mask_alpha)), 457 alpha, 458 dst); 459 } 460 461 static force_inline uint32_t 462 pack_1x128_32 (__m128i data) 463 { 464 return _mm_cvtsi128_si32 
(_mm_packus_epi16 (data, _mm_setzero_si128 ())); 465 } 466 467 static force_inline __m128i 468 expand565_16_1x128 (uint16_t pixel) 469 { 470 __m128i m = _mm_cvtsi32_si128 (pixel); 471 472 m = unpack_565_to_8888 (m); 473 474 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); 475 } 476 477 static force_inline uint32_t 478 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) 479 { 480 uint8_t a; 481 __m128i xmms; 482 483 a = src >> 24; 484 485 if (a == 0xff) 486 { 487 return src; 488 } 489 else if (src) 490 { 491 xmms = unpack_32_1x128 (src); 492 return pack_1x128_32 ( 493 over_1x128 (xmms, expand_alpha_1x128 (xmms), 494 unpack_32_1x128 (dst))); 495 } 496 497 return dst; 498 } 499 500 static force_inline uint32_t 501 combine1 (const uint32_t *ps, const uint32_t *pm) 502 { 503 uint32_t s; 504 memcpy(&s, ps, sizeof(uint32_t)); 505 506 if (pm) 507 { 508 __m128i ms, mm; 509 510 mm = unpack_32_1x128 (*pm); 511 mm = expand_alpha_1x128 (mm); 512 513 ms = unpack_32_1x128 (s); 514 ms = pix_multiply_1x128 (ms, mm); 515 516 s = pack_1x128_32 (ms); 517 } 518 519 return s; 520 } 521 522 static force_inline __m128i 523 combine4 (const __m128i *ps, const __m128i *pm) 524 { 525 __m128i xmm_src_lo, xmm_src_hi; 526 __m128i xmm_msk_lo, xmm_msk_hi; 527 __m128i s; 528 529 if (pm) 530 { 531 xmm_msk_lo = load_128_unaligned (pm); 532 533 if (is_transparent (xmm_msk_lo)) 534 return _mm_setzero_si128 (); 535 } 536 537 s = load_128_unaligned (ps); 538 539 if (pm) 540 { 541 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); 542 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); 543 544 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); 545 546 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 547 &xmm_msk_lo, &xmm_msk_hi, 548 &xmm_src_lo, &xmm_src_hi); 549 550 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); 551 } 552 553 return s; 554 } 555 556 static force_inline void 557 core_combine_over_u_sse2_mask (uint32_t * pd, 558 const uint32_t* ps, 559 const uint32_t* pm, 560 
int w) 561 { 562 uint32_t s, d; 563 564 /* Align dst on a 16-byte boundary */ 565 while (w && ((uintptr_t)pd & 15)) 566 { 567 d = *pd; 568 s = combine1 (ps, pm); 569 570 if (s) 571 *pd = core_combine_over_u_pixel_sse2 (s, d); 572 pd++; 573 ps++; 574 pm++; 575 w--; 576 } 577 578 while (w >= 4) 579 { 580 __m128i mask = load_128_unaligned ((__m128i *)pm); 581 582 if (!is_zero (mask)) 583 { 584 __m128i src; 585 __m128i src_hi, src_lo; 586 __m128i mask_hi, mask_lo; 587 __m128i alpha_hi, alpha_lo; 588 589 src = load_128_unaligned ((__m128i *)ps); 590 591 if (is_opaque (_mm_and_si128 (src, mask))) 592 { 593 save_128_aligned ((__m128i *)pd, src); 594 } 595 else 596 { 597 __m128i dst = load_128_aligned ((__m128i *)pd); 598 __m128i dst_hi, dst_lo; 599 600 unpack_128_2x128 (mask, &mask_lo, &mask_hi); 601 unpack_128_2x128 (src, &src_lo, &src_hi); 602 603 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); 604 pix_multiply_2x128 (&src_lo, &src_hi, 605 &mask_lo, &mask_hi, 606 &src_lo, &src_hi); 607 608 unpack_128_2x128 (dst, &dst_lo, &dst_hi); 609 610 expand_alpha_2x128 (src_lo, src_hi, 611 &alpha_lo, &alpha_hi); 612 613 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, 614 &dst_lo, &dst_hi); 615 616 save_128_aligned ( 617 (__m128i *)pd, 618 pack_2x128_128 (dst_lo, dst_hi)); 619 } 620 } 621 622 pm += 4; 623 ps += 4; 624 pd += 4; 625 w -= 4; 626 } 627 while (w) 628 { 629 d = *pd; 630 s = combine1 (ps, pm); 631 632 if (s) 633 *pd = core_combine_over_u_pixel_sse2 (s, d); 634 pd++; 635 ps++; 636 pm++; 637 638 w--; 639 } 640 } 641 642 static force_inline void 643 core_combine_over_u_sse2_no_mask (uint32_t * pd, 644 const uint32_t* ps, 645 int w) 646 { 647 uint32_t s, d; 648 649 /* Align dst on a 16-byte boundary */ 650 while (w && ((uintptr_t)pd & 15)) 651 { 652 d = *pd; 653 s = *ps; 654 655 if (s) 656 *pd = core_combine_over_u_pixel_sse2 (s, d); 657 pd++; 658 ps++; 659 w--; 660 } 661 662 while (w >= 4) 663 { 664 __m128i src; 665 __m128i src_hi, src_lo, dst_hi, dst_lo; 666 
__m128i alpha_hi, alpha_lo; 667 668 src = load_128_unaligned ((__m128i *)ps); 669 670 if (!is_zero (src)) 671 { 672 if (is_opaque (src)) 673 { 674 save_128_aligned ((__m128i *)pd, src); 675 } 676 else 677 { 678 __m128i dst = load_128_aligned ((__m128i *)pd); 679 680 unpack_128_2x128 (src, &src_lo, &src_hi); 681 unpack_128_2x128 (dst, &dst_lo, &dst_hi); 682 683 expand_alpha_2x128 (src_lo, src_hi, 684 &alpha_lo, &alpha_hi); 685 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, 686 &dst_lo, &dst_hi); 687 688 save_128_aligned ( 689 (__m128i *)pd, 690 pack_2x128_128 (dst_lo, dst_hi)); 691 } 692 } 693 694 ps += 4; 695 pd += 4; 696 w -= 4; 697 } 698 while (w) 699 { 700 d = *pd; 701 s = *ps; 702 703 if (s) 704 *pd = core_combine_over_u_pixel_sse2 (s, d); 705 pd++; 706 ps++; 707 708 w--; 709 } 710 } 711 712 static force_inline void 713 sse2_combine_over_u (pixman_implementation_t *imp, 714 pixman_op_t op, 715 uint32_t * pd, 716 const uint32_t * ps, 717 const uint32_t * pm, 718 int w) 719 { 720 if (pm) 721 core_combine_over_u_sse2_mask (pd, ps, pm, w); 722 else 723 core_combine_over_u_sse2_no_mask (pd, ps, w); 724 } 725 726 static void 727 sse2_combine_over_reverse_u (pixman_implementation_t *imp, 728 pixman_op_t op, 729 uint32_t * pd, 730 const uint32_t * ps, 731 const uint32_t * pm, 732 int w) 733 { 734 uint32_t s, d; 735 736 __m128i xmm_dst_lo, xmm_dst_hi; 737 __m128i xmm_src_lo, xmm_src_hi; 738 __m128i xmm_alpha_lo, xmm_alpha_hi; 739 740 /* Align dst on a 16-byte boundary */ 741 while (w && 742 ((uintptr_t)pd & 15)) 743 { 744 d = *pd; 745 s = combine1 (ps, pm); 746 747 *pd++ = core_combine_over_u_pixel_sse2 (d, s); 748 w--; 749 ps++; 750 if (pm) 751 pm++; 752 } 753 754 while (w >= 4) 755 { 756 /* I'm loading unaligned because I'm not sure 757 * about the address alignment. 
758 */ 759 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 760 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 761 762 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 763 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 764 765 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 766 &xmm_alpha_lo, &xmm_alpha_hi); 767 768 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 769 &xmm_alpha_lo, &xmm_alpha_hi, 770 &xmm_src_lo, &xmm_src_hi); 771 772 /* rebuid the 4 pixel data and save*/ 773 save_128_aligned ((__m128i*)pd, 774 pack_2x128_128 (xmm_src_lo, xmm_src_hi)); 775 776 w -= 4; 777 ps += 4; 778 pd += 4; 779 780 if (pm) 781 pm += 4; 782 } 783 784 while (w) 785 { 786 d = *pd; 787 s = combine1 (ps, pm); 788 789 *pd++ = core_combine_over_u_pixel_sse2 (d, s); 790 ps++; 791 w--; 792 if (pm) 793 pm++; 794 } 795 } 796 797 static force_inline uint32_t 798 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) 799 { 800 uint32_t maska = src >> 24; 801 802 if (maska == 0) 803 { 804 return 0; 805 } 806 else if (maska != 0xff) 807 { 808 return pack_1x128_32 ( 809 pix_multiply_1x128 (unpack_32_1x128 (dst), 810 expand_alpha_1x128 (unpack_32_1x128 (src)))); 811 } 812 813 return dst; 814 } 815 816 static void 817 sse2_combine_in_u (pixman_implementation_t *imp, 818 pixman_op_t op, 819 uint32_t * pd, 820 const uint32_t * ps, 821 const uint32_t * pm, 822 int w) 823 { 824 uint32_t s, d; 825 826 __m128i xmm_src_lo, xmm_src_hi; 827 __m128i xmm_dst_lo, xmm_dst_hi; 828 829 while (w && ((uintptr_t)pd & 15)) 830 { 831 s = combine1 (ps, pm); 832 d = *pd; 833 834 *pd++ = core_combine_in_u_pixel_sse2 (d, s); 835 w--; 836 ps++; 837 if (pm) 838 pm++; 839 } 840 841 while (w >= 4) 842 { 843 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 844 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); 845 846 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 847 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 848 849 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 
850 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 851 &xmm_dst_lo, &xmm_dst_hi, 852 &xmm_dst_lo, &xmm_dst_hi); 853 854 save_128_aligned ((__m128i*)pd, 855 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 856 857 ps += 4; 858 pd += 4; 859 w -= 4; 860 if (pm) 861 pm += 4; 862 } 863 864 while (w) 865 { 866 s = combine1 (ps, pm); 867 d = *pd; 868 869 *pd++ = core_combine_in_u_pixel_sse2 (d, s); 870 w--; 871 ps++; 872 if (pm) 873 pm++; 874 } 875 } 876 877 static void 878 sse2_combine_in_reverse_u (pixman_implementation_t *imp, 879 pixman_op_t op, 880 uint32_t * pd, 881 const uint32_t * ps, 882 const uint32_t * pm, 883 int w) 884 { 885 uint32_t s, d; 886 887 __m128i xmm_src_lo, xmm_src_hi; 888 __m128i xmm_dst_lo, xmm_dst_hi; 889 890 while (w && ((uintptr_t)pd & 15)) 891 { 892 s = combine1 (ps, pm); 893 d = *pd; 894 895 *pd++ = core_combine_in_u_pixel_sse2 (s, d); 896 ps++; 897 w--; 898 if (pm) 899 pm++; 900 } 901 902 while (w >= 4) 903 { 904 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 905 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); 906 907 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 908 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 909 910 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 911 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 912 &xmm_src_lo, &xmm_src_hi, 913 &xmm_dst_lo, &xmm_dst_hi); 914 915 save_128_aligned ( 916 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 917 918 ps += 4; 919 pd += 4; 920 w -= 4; 921 if (pm) 922 pm += 4; 923 } 924 925 while (w) 926 { 927 s = combine1 (ps, pm); 928 d = *pd; 929 930 *pd++ = core_combine_in_u_pixel_sse2 (s, d); 931 w--; 932 ps++; 933 if (pm) 934 pm++; 935 } 936 } 937 938 static void 939 sse2_combine_out_reverse_u (pixman_implementation_t *imp, 940 pixman_op_t op, 941 uint32_t * pd, 942 const uint32_t * ps, 943 const uint32_t * pm, 944 int w) 945 { 946 while (w && ((uintptr_t)pd & 15)) 947 { 948 uint32_t s = combine1 (ps, pm); 949 uint32_t d = *pd; 950 951 *pd++ = 
pack_1x128_32 ( 952 pix_multiply_1x128 ( 953 unpack_32_1x128 (d), negate_1x128 ( 954 expand_alpha_1x128 (unpack_32_1x128 (s))))); 955 956 if (pm) 957 pm++; 958 ps++; 959 w--; 960 } 961 962 while (w >= 4) 963 { 964 __m128i xmm_src_lo, xmm_src_hi; 965 __m128i xmm_dst_lo, xmm_dst_hi; 966 967 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 968 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 969 970 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 971 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 972 973 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 974 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 975 976 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 977 &xmm_src_lo, &xmm_src_hi, 978 &xmm_dst_lo, &xmm_dst_hi); 979 980 save_128_aligned ( 981 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 982 983 ps += 4; 984 pd += 4; 985 if (pm) 986 pm += 4; 987 988 w -= 4; 989 } 990 991 while (w) 992 { 993 uint32_t s = combine1 (ps, pm); 994 uint32_t d = *pd; 995 996 *pd++ = pack_1x128_32 ( 997 pix_multiply_1x128 ( 998 unpack_32_1x128 (d), negate_1x128 ( 999 expand_alpha_1x128 (unpack_32_1x128 (s))))); 1000 ps++; 1001 if (pm) 1002 pm++; 1003 w--; 1004 } 1005 } 1006 1007 static void 1008 sse2_combine_out_u (pixman_implementation_t *imp, 1009 pixman_op_t op, 1010 uint32_t * pd, 1011 const uint32_t * ps, 1012 const uint32_t * pm, 1013 int w) 1014 { 1015 while (w && ((uintptr_t)pd & 15)) 1016 { 1017 uint32_t s = combine1 (ps, pm); 1018 uint32_t d = *pd; 1019 1020 *pd++ = pack_1x128_32 ( 1021 pix_multiply_1x128 ( 1022 unpack_32_1x128 (s), negate_1x128 ( 1023 expand_alpha_1x128 (unpack_32_1x128 (d))))); 1024 w--; 1025 ps++; 1026 if (pm) 1027 pm++; 1028 } 1029 1030 while (w >= 4) 1031 { 1032 __m128i xmm_src_lo, xmm_src_hi; 1033 __m128i xmm_dst_lo, xmm_dst_hi; 1034 1035 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); 1036 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1037 1038 unpack_128_2x128 (xmm_src_hi, 
&xmm_src_lo, &xmm_src_hi); 1039 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1040 1041 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1042 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1043 1044 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1045 &xmm_dst_lo, &xmm_dst_hi, 1046 &xmm_dst_lo, &xmm_dst_hi); 1047 1048 save_128_aligned ( 1049 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1050 1051 ps += 4; 1052 pd += 4; 1053 w -= 4; 1054 if (pm) 1055 pm += 4; 1056 } 1057 1058 while (w) 1059 { 1060 uint32_t s = combine1 (ps, pm); 1061 uint32_t d = *pd; 1062 1063 *pd++ = pack_1x128_32 ( 1064 pix_multiply_1x128 ( 1065 unpack_32_1x128 (s), negate_1x128 ( 1066 expand_alpha_1x128 (unpack_32_1x128 (d))))); 1067 w--; 1068 ps++; 1069 if (pm) 1070 pm++; 1071 } 1072 } 1073 1074 static force_inline uint32_t 1075 core_combine_atop_u_pixel_sse2 (uint32_t src, 1076 uint32_t dst) 1077 { 1078 __m128i s = unpack_32_1x128 (src); 1079 __m128i d = unpack_32_1x128 (dst); 1080 1081 __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); 1082 __m128i da = expand_alpha_1x128 (d); 1083 1084 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); 1085 } 1086 1087 static void 1088 sse2_combine_atop_u (pixman_implementation_t *imp, 1089 pixman_op_t op, 1090 uint32_t * pd, 1091 const uint32_t * ps, 1092 const uint32_t * pm, 1093 int w) 1094 { 1095 uint32_t s, d; 1096 1097 __m128i xmm_src_lo, xmm_src_hi; 1098 __m128i xmm_dst_lo, xmm_dst_hi; 1099 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1100 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1101 1102 while (w && ((uintptr_t)pd & 15)) 1103 { 1104 s = combine1 (ps, pm); 1105 d = *pd; 1106 1107 *pd++ = core_combine_atop_u_pixel_sse2 (s, d); 1108 w--; 1109 ps++; 1110 if (pm) 1111 pm++; 1112 } 1113 1114 while (w >= 4) 1115 { 1116 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1117 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1118 1119 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, 
&xmm_src_hi); 1120 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1121 1122 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1123 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1124 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1125 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1126 1127 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, 1128 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1129 1130 pix_add_multiply_2x128 ( 1131 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1132 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1133 &xmm_dst_lo, &xmm_dst_hi); 1134 1135 save_128_aligned ( 1136 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1137 1138 ps += 4; 1139 pd += 4; 1140 w -= 4; 1141 if (pm) 1142 pm += 4; 1143 } 1144 1145 while (w) 1146 { 1147 s = combine1 (ps, pm); 1148 d = *pd; 1149 1150 *pd++ = core_combine_atop_u_pixel_sse2 (s, d); 1151 w--; 1152 ps++; 1153 if (pm) 1154 pm++; 1155 } 1156 } 1157 1158 static force_inline uint32_t 1159 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, 1160 uint32_t dst) 1161 { 1162 __m128i s = unpack_32_1x128 (src); 1163 __m128i d = unpack_32_1x128 (dst); 1164 1165 __m128i sa = expand_alpha_1x128 (s); 1166 __m128i da = negate_1x128 (expand_alpha_1x128 (d)); 1167 1168 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); 1169 } 1170 1171 static void 1172 sse2_combine_atop_reverse_u (pixman_implementation_t *imp, 1173 pixman_op_t op, 1174 uint32_t * pd, 1175 const uint32_t * ps, 1176 const uint32_t * pm, 1177 int w) 1178 { 1179 uint32_t s, d; 1180 1181 __m128i xmm_src_lo, xmm_src_hi; 1182 __m128i xmm_dst_lo, xmm_dst_hi; 1183 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1184 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1185 1186 while (w && ((uintptr_t)pd & 15)) 1187 { 1188 s = combine1 (ps, pm); 1189 d = *pd; 1190 1191 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); 1192 ps++; 1193 w--; 1194 if (pm) 1195 pm++; 1196 } 1197 1198 while (w >= 4) 1199 { 1200 xmm_src_hi = combine4 
((__m128i*)ps, (__m128i*)pm); 1201 xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1202 1203 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1204 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1205 1206 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1207 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1208 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1209 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1210 1211 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1212 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1213 1214 pix_add_multiply_2x128 ( 1215 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1216 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1217 &xmm_dst_lo, &xmm_dst_hi); 1218 1219 save_128_aligned ( 1220 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1221 1222 ps += 4; 1223 pd += 4; 1224 w -= 4; 1225 if (pm) 1226 pm += 4; 1227 } 1228 1229 while (w) 1230 { 1231 s = combine1 (ps, pm); 1232 d = *pd; 1233 1234 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); 1235 ps++; 1236 w--; 1237 if (pm) 1238 pm++; 1239 } 1240 } 1241 1242 static force_inline uint32_t 1243 core_combine_xor_u_pixel_sse2 (uint32_t src, 1244 uint32_t dst) 1245 { 1246 __m128i s = unpack_32_1x128 (src); 1247 __m128i d = unpack_32_1x128 (dst); 1248 1249 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); 1250 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); 1251 1252 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); 1253 } 1254 1255 static void 1256 sse2_combine_xor_u (pixman_implementation_t *imp, 1257 pixman_op_t op, 1258 uint32_t * dst, 1259 const uint32_t * src, 1260 const uint32_t * mask, 1261 int width) 1262 { 1263 int w = width; 1264 uint32_t s, d; 1265 uint32_t* pd = dst; 1266 const uint32_t* ps = src; 1267 const uint32_t* pm = mask; 1268 1269 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1270 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1271 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1272 __m128i xmm_alpha_dst_lo, 
xmm_alpha_dst_hi; 1273 1274 while (w && ((uintptr_t)pd & 15)) 1275 { 1276 s = combine1 (ps, pm); 1277 d = *pd; 1278 1279 *pd++ = core_combine_xor_u_pixel_sse2 (s, d); 1280 w--; 1281 ps++; 1282 if (pm) 1283 pm++; 1284 } 1285 1286 while (w >= 4) 1287 { 1288 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); 1289 xmm_dst = load_128_aligned ((__m128i*) pd); 1290 1291 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1292 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1293 1294 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1295 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1296 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1297 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1298 1299 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, 1300 &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1301 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1302 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1303 1304 pix_add_multiply_2x128 ( 1305 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1306 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1307 &xmm_dst_lo, &xmm_dst_hi); 1308 1309 save_128_aligned ( 1310 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1311 1312 ps += 4; 1313 pd += 4; 1314 w -= 4; 1315 if (pm) 1316 pm += 4; 1317 } 1318 1319 while (w) 1320 { 1321 s = combine1 (ps, pm); 1322 d = *pd; 1323 1324 *pd++ = core_combine_xor_u_pixel_sse2 (s, d); 1325 w--; 1326 ps++; 1327 if (pm) 1328 pm++; 1329 } 1330 } 1331 1332 static force_inline void 1333 sse2_combine_add_u (pixman_implementation_t *imp, 1334 pixman_op_t op, 1335 uint32_t * dst, 1336 const uint32_t * src, 1337 const uint32_t * mask, 1338 int width) 1339 { 1340 int w = width; 1341 uint32_t s, d; 1342 uint32_t* pd = dst; 1343 const uint32_t* ps = src; 1344 const uint32_t* pm = mask; 1345 1346 while (w && (uintptr_t)pd & 15) 1347 { 1348 s = combine1 (ps, pm); 1349 d = *pd; 1350 1351 ps++; 1352 if (pm) 1353 pm++; 1354 *pd++ = _mm_cvtsi128_si32 ( 1355 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 
(d))); 1356 w--; 1357 } 1358 1359 while (w >= 4) 1360 { 1361 __m128i s; 1362 1363 s = combine4 ((__m128i*)ps, (__m128i*)pm); 1364 1365 save_128_aligned ( 1366 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); 1367 1368 pd += 4; 1369 ps += 4; 1370 if (pm) 1371 pm += 4; 1372 w -= 4; 1373 } 1374 1375 while (w--) 1376 { 1377 s = combine1 (ps, pm); 1378 d = *pd; 1379 1380 ps++; 1381 *pd++ = _mm_cvtsi128_si32 ( 1382 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); 1383 if (pm) 1384 pm++; 1385 } 1386 } 1387 1388 static force_inline uint32_t 1389 core_combine_saturate_u_pixel_sse2 (uint32_t src, 1390 uint32_t dst) 1391 { 1392 __m128i ms = unpack_32_1x128 (src); 1393 __m128i md = unpack_32_1x128 (dst); 1394 uint32_t sa = src >> 24; 1395 uint32_t da = ~dst >> 24; 1396 1397 if (sa > da) 1398 { 1399 ms = pix_multiply_1x128 ( 1400 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); 1401 } 1402 1403 return pack_1x128_32 (_mm_adds_epu16 (md, ms)); 1404 } 1405 1406 static void 1407 sse2_combine_saturate_u (pixman_implementation_t *imp, 1408 pixman_op_t op, 1409 uint32_t * pd, 1410 const uint32_t * ps, 1411 const uint32_t * pm, 1412 int w) 1413 { 1414 uint32_t s, d; 1415 1416 uint32_t pack_cmp; 1417 __m128i xmm_src, xmm_dst; 1418 1419 while (w && (uintptr_t)pd & 15) 1420 { 1421 s = combine1 (ps, pm); 1422 d = *pd; 1423 1424 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1425 w--; 1426 ps++; 1427 if (pm) 1428 pm++; 1429 } 1430 1431 while (w >= 4) 1432 { 1433 xmm_dst = load_128_aligned ((__m128i*)pd); 1434 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); 1435 1436 pack_cmp = _mm_movemask_epi8 ( 1437 _mm_cmpgt_epi32 ( 1438 _mm_srli_epi32 (xmm_src, 24), 1439 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); 1440 1441 /* if some alpha src is grater than respective ~alpha dst */ 1442 if (pack_cmp) 1443 { 1444 s = combine1 (ps++, pm); 1445 d = *pd; 1446 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1447 if (pm) 1448 
pm++; 1449 1450 s = combine1 (ps++, pm); 1451 d = *pd; 1452 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1453 if (pm) 1454 pm++; 1455 1456 s = combine1 (ps++, pm); 1457 d = *pd; 1458 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1459 if (pm) 1460 pm++; 1461 1462 s = combine1 (ps++, pm); 1463 d = *pd; 1464 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1465 if (pm) 1466 pm++; 1467 } 1468 else 1469 { 1470 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); 1471 1472 pd += 4; 1473 ps += 4; 1474 if (pm) 1475 pm += 4; 1476 } 1477 1478 w -= 4; 1479 } 1480 1481 while (w--) 1482 { 1483 s = combine1 (ps, pm); 1484 d = *pd; 1485 1486 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1487 ps++; 1488 if (pm) 1489 pm++; 1490 } 1491 } 1492 1493 static void 1494 sse2_combine_src_ca (pixman_implementation_t *imp, 1495 pixman_op_t op, 1496 uint32_t * pd, 1497 const uint32_t * ps, 1498 const uint32_t * pm, 1499 int w) 1500 { 1501 uint32_t s, m; 1502 1503 __m128i xmm_src_lo, xmm_src_hi; 1504 __m128i xmm_mask_lo, xmm_mask_hi; 1505 __m128i xmm_dst_lo, xmm_dst_hi; 1506 1507 while (w && (uintptr_t)pd & 15) 1508 { 1509 s = *ps++; 1510 m = *pm++; 1511 *pd++ = pack_1x128_32 ( 1512 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); 1513 w--; 1514 } 1515 1516 while (w >= 4) 1517 { 1518 xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1519 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1520 1521 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1522 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1523 1524 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1525 &xmm_mask_lo, &xmm_mask_hi, 1526 &xmm_dst_lo, &xmm_dst_hi); 1527 1528 save_128_aligned ( 1529 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1530 1531 ps += 4; 1532 pd += 4; 1533 pm += 4; 1534 w -= 4; 1535 } 1536 1537 while (w) 1538 { 1539 s = *ps++; 1540 m = *pm++; 1541 *pd++ = pack_1x128_32 ( 1542 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); 1543 
w--;
    }
}

/* OVER operator, component alpha, for one pixel:
 * dest = in_over (src, src.a, mask, dst).
 */
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

/* OVER combiner, component alpha: four pixels per SSE2 iteration
 * once the destination is 16-byte aligned.
 */
static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

/* OVER_REVERSE operator, component alpha, for one pixel:
 * dest = dst OVER (src * mask).
 */
static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

/* OVER_REVERSE combiner, component alpha. */
static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        /* src * mask is computed in place into the mask registers. */
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

/* IN combiner, component alpha: dest = src * mask * dst.a. */
static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

/* IN_REVERSE combiner, component alpha: dest = dst * (mask * src.a). */
static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

/* OUT combiner, component alpha: dest = src * mask * ~dst.a. */
static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

        w--;
    }
}

/* OUT_REVERSE combiner, component alpha:
 * dest = dst * ~(mask * src.a).
 */
static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ =
            pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}

/* ATOP operator, component alpha, for one pixel:
 * dest = (src * mask) * dst.a + dst * ~(mask * src.a).
 */
static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i m = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);
    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = expand_alpha_1x128 (d);

    s = pix_multiply_1x128 (s, m);
    m = negate_1x128 (pix_multiply_1x128 (m, sa));

    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}

/* ATOP combiner, component alpha. */
static
void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        /* src <- src * mask;  mask <- ~(mask * src.a) */
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

/* ATOP_REVERSE operator, component alpha, for one pixel:
 * dest = (src * mask) * ~dst.a + dst * (mask * src.a).
 */
static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i m = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    __m128i sa = expand_alpha_1x128 (s);

    s = pix_multiply_1x128 (s, m);
    m = pix_multiply_1x128 (m, sa);

    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}

/* ATOP_REVERSE combiner, component alpha. */
static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128
(xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

/* XOR operator, component alpha, for one pixel:
 * dest = dst * ~(mask * src.a) + (src * mask) * ~dst.a.
 */
static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m128i a = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
                                          a, expand_alpha_1x128 (s)));
    __m128i dest = pix_multiply_1x128 (s, a);
    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                  &alpha_dst,
                                                  &dest,
                                                  &alpha_src));
}

/* XOR combiner, component alpha. */
static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

/* ADD combiner, component alpha:
 * dest = saturating (src * mask + dst), per channel.
 */
static void
sse2_combine_add_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
                                               unpack_32_1x128 (m)),
                           unpack_32_1x128 (d)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi =
load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (
                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
                                               unpack_32_1x128 (m)),
                           unpack_32_1x128 (d)));
        w--;
    }
}

/* Broadcast a 16-bit value into all eight lanes of an XMM register. */
static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

/* Work around a code generation bug in Sun Studio 12.
 */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)                             \
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
/* Build {mask0, mask1, mask0, mask1} as four 32-bit lanes. */
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif

/* Composite OVER with a solid source onto an a8r8g8b8 destination.
 * The source pixel is expanded once outside the loops; fully
 * transparent sources are a no-op.
 */
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
                                                xmm_alpha,
                                                unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 4)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
                                                xmm_alpha,
                                                unpack_32_1x128 (d)));
            w--;
        }

    }
}

/* Composite OVER with a solid source onto an r5g6b5 destination. */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_composite_info_t
                            *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        /* Align destination, converting 565 pixels up to 8888 per pixel. */
        while (w && (uintptr_t)dst & 15)
        {
            d = *dst;

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (over_1x128 (xmm_src,
                                           xmm_alpha,
                                           expand565_16_1x128 (d))));
            w--;
        }

        /* Eight 16-bit pixels fit one XMM register. */
        while (w >= 8)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst0, &xmm_dst1);
            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst2, &xmm_dst3);

            xmm_dst = pack_565_4x128_128 (
                &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            save_128_aligned ((__m128i*)dst, xmm_dst);

            dst += 8;
            w -= 8;
        }

        while (w--)
        {
            d = *dst;
            *dst++ = pack_565_32_16 (
                pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
                                           expand565_16_1x128 (d))));
        }
    }

}

/* Composite ADD of a solid source through an a8r8g8b8 component-alpha
 * mask onto an a8r8g8b8 destination.  Groups of four mask pixels that
 * are entirely zero are skipped.
 */
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src;
    __m128i xmm_dst;
    __m128i xmm_mask, xmm_mask_lo,
            xmm_mask_hi;

    __m128i mmx_src, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    mmx_src = xmm_src;

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
                                   mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);
                xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
                                   mmx_dest));
            }

            pd++;
            w--;
        }
    }

}

/* Composite OVER of a solid source through an a8r8g8b8 component-alpha
 * mask onto an a8r8g8b8 destination.  All-zero groups of four mask
 * pixels are skipped.
 */
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                    &mmx_alpha,
                                                    &mmx_mask,
                                                    &mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned
((__m128i*)pd); 2703 2704 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 2705 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 2706 2707 in_over_2x128 (&xmm_src, &xmm_src, 2708 &xmm_alpha, &xmm_alpha, 2709 &xmm_mask_lo, &xmm_mask_hi, 2710 &xmm_dst_lo, &xmm_dst_hi); 2711 2712 save_128_aligned ( 2713 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2714 } 2715 2716 pd += 4; 2717 pm += 4; 2718 w -= 4; 2719 } 2720 2721 while (w) 2722 { 2723 m = *pm++; 2724 2725 if (m) 2726 { 2727 d = *pd; 2728 mmx_mask = unpack_32_1x128 (m); 2729 mmx_dest = unpack_32_1x128 (d); 2730 2731 *pd = pack_1x128_32 ( 2732 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); 2733 } 2734 2735 pd++; 2736 w--; 2737 } 2738 } 2739 2740 } 2741 2742 static void 2743 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, 2744 pixman_composite_info_t *info) 2745 { 2746 PIXMAN_COMPOSITE_ARGS (info); 2747 uint32_t *dst_line, *dst; 2748 uint32_t *src_line, *src; 2749 uint32_t mask; 2750 int32_t w; 2751 int dst_stride, src_stride; 2752 2753 __m128i xmm_mask; 2754 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 2755 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 2756 __m128i xmm_alpha_lo, xmm_alpha_hi; 2757 2758 PIXMAN_IMAGE_GET_LINE ( 2759 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2760 PIXMAN_IMAGE_GET_LINE ( 2761 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2762 2763 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); 2764 2765 xmm_mask = create_mask_16_128 (mask >> 24); 2766 2767 while (height--) 2768 { 2769 dst = dst_line; 2770 dst_line += dst_stride; 2771 src = src_line; 2772 src_line += src_stride; 2773 w = width; 2774 2775 while (w && (uintptr_t)dst & 15) 2776 { 2777 uint32_t s = *src++; 2778 2779 if (s) 2780 { 2781 uint32_t d = *dst; 2782 2783 __m128i ms = unpack_32_1x128 (s); 2784 __m128i alpha = expand_alpha_1x128 (ms); 2785 __m128i dest = xmm_mask; 2786 __m128i alpha_dst = unpack_32_1x128 (d); 2787 2788 *dst = 
pack_1x128_32 ( 2789 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 2790 } 2791 dst++; 2792 w--; 2793 } 2794 2795 while (w >= 4) 2796 { 2797 xmm_src = load_128_unaligned ((__m128i*)src); 2798 2799 if (!is_zero (xmm_src)) 2800 { 2801 xmm_dst = load_128_aligned ((__m128i*)dst); 2802 2803 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 2804 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 2805 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 2806 &xmm_alpha_lo, &xmm_alpha_hi); 2807 2808 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 2809 &xmm_alpha_lo, &xmm_alpha_hi, 2810 &xmm_mask, &xmm_mask, 2811 &xmm_dst_lo, &xmm_dst_hi); 2812 2813 save_128_aligned ( 2814 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 2815 } 2816 2817 dst += 4; 2818 src += 4; 2819 w -= 4; 2820 } 2821 2822 while (w) 2823 { 2824 uint32_t s = *src++; 2825 2826 if (s) 2827 { 2828 uint32_t d = *dst; 2829 2830 __m128i ms = unpack_32_1x128 (s); 2831 __m128i alpha = expand_alpha_1x128 (ms); 2832 __m128i mask = xmm_mask; 2833 __m128i dest = unpack_32_1x128 (d); 2834 2835 *dst = pack_1x128_32 ( 2836 in_over_1x128 (&ms, &alpha, &mask, &dest)); 2837 } 2838 2839 dst++; 2840 w--; 2841 } 2842 } 2843 2844 } 2845 2846 static void 2847 sse2_composite_src_x888_0565 (pixman_implementation_t *imp, 2848 pixman_composite_info_t *info) 2849 { 2850 PIXMAN_COMPOSITE_ARGS (info); 2851 uint16_t *dst_line, *dst; 2852 uint32_t *src_line, *src, s; 2853 int dst_stride, src_stride; 2854 int32_t w; 2855 2856 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2857 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 2858 2859 while (height--) 2860 { 2861 dst = dst_line; 2862 dst_line += dst_stride; 2863 src = src_line; 2864 src_line += src_stride; 2865 w = width; 2866 2867 while (w && (uintptr_t)dst & 15) 2868 { 2869 s = *src++; 2870 *dst = convert_8888_to_0565 (s); 2871 dst++; 2872 w--; 2873 } 2874 2875 while (w >= 8) 2876 { 2877 __m128i xmm_src0 = 
load_128_unaligned ((__m128i *)src + 0); 2878 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); 2879 2880 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); 2881 2882 w -= 8; 2883 src += 8; 2884 dst += 8; 2885 } 2886 2887 while (w) 2888 { 2889 s = *src++; 2890 *dst = convert_8888_to_0565 (s); 2891 dst++; 2892 w--; 2893 } 2894 } 2895 } 2896 2897 static void 2898 sse2_composite_src_x888_8888 (pixman_implementation_t *imp, 2899 pixman_composite_info_t *info) 2900 { 2901 PIXMAN_COMPOSITE_ARGS (info); 2902 uint32_t *dst_line, *dst; 2903 uint32_t *src_line, *src; 2904 int32_t w; 2905 int dst_stride, src_stride; 2906 2907 2908 PIXMAN_IMAGE_GET_LINE ( 2909 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2910 PIXMAN_IMAGE_GET_LINE ( 2911 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2912 2913 while (height--) 2914 { 2915 dst = dst_line; 2916 dst_line += dst_stride; 2917 src = src_line; 2918 src_line += src_stride; 2919 w = width; 2920 2921 while (w && (uintptr_t)dst & 15) 2922 { 2923 *dst++ = *src++ | 0xff000000; 2924 w--; 2925 } 2926 2927 while (w >= 16) 2928 { 2929 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; 2930 2931 xmm_src1 = load_128_unaligned ((__m128i*)src + 0); 2932 xmm_src2 = load_128_unaligned ((__m128i*)src + 1); 2933 xmm_src3 = load_128_unaligned ((__m128i*)src + 2); 2934 xmm_src4 = load_128_unaligned ((__m128i*)src + 3); 2935 2936 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); 2937 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); 2938 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); 2939 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); 2940 2941 dst += 16; 2942 src += 16; 2943 w -= 16; 2944 } 2945 2946 while (w) 2947 { 2948 *dst++ = *src++ | 0xff000000; 2949 w--; 2950 } 2951 } 2952 2953 } 2954 2955 static void 2956 sse2_composite_over_x888_n_8888 (pixman_implementation_t 
*imp, 2957 pixman_composite_info_t *info) 2958 { 2959 PIXMAN_COMPOSITE_ARGS (info); 2960 uint32_t *dst_line, *dst; 2961 uint32_t *src_line, *src; 2962 uint32_t mask; 2963 int dst_stride, src_stride; 2964 int32_t w; 2965 2966 __m128i xmm_mask, xmm_alpha; 2967 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 2968 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 2969 2970 PIXMAN_IMAGE_GET_LINE ( 2971 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 2972 PIXMAN_IMAGE_GET_LINE ( 2973 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 2974 2975 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); 2976 2977 xmm_mask = create_mask_16_128 (mask >> 24); 2978 xmm_alpha = mask_00ff; 2979 2980 while (height--) 2981 { 2982 dst = dst_line; 2983 dst_line += dst_stride; 2984 src = src_line; 2985 src_line += src_stride; 2986 w = width; 2987 2988 while (w && (uintptr_t)dst & 15) 2989 { 2990 uint32_t s = (*src++) | 0xff000000; 2991 uint32_t d = *dst; 2992 2993 __m128i src = unpack_32_1x128 (s); 2994 __m128i alpha = xmm_alpha; 2995 __m128i mask = xmm_mask; 2996 __m128i dest = unpack_32_1x128 (d); 2997 2998 *dst++ = pack_1x128_32 ( 2999 in_over_1x128 (&src, &alpha, &mask, &dest)); 3000 3001 w--; 3002 } 3003 3004 while (w >= 4) 3005 { 3006 xmm_src = _mm_or_si128 ( 3007 load_128_unaligned ((__m128i*)src), mask_ff000000); 3008 xmm_dst = load_128_aligned ((__m128i*)dst); 3009 3010 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3011 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 3012 3013 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 3014 &xmm_alpha, &xmm_alpha, 3015 &xmm_mask, &xmm_mask, 3016 &xmm_dst_lo, &xmm_dst_hi); 3017 3018 save_128_aligned ( 3019 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3020 3021 dst += 4; 3022 src += 4; 3023 w -= 4; 3024 3025 } 3026 3027 while (w) 3028 { 3029 uint32_t s = (*src++) | 0xff000000; 3030 uint32_t d = *dst; 3031 3032 __m128i src = unpack_32_1x128 (s); 3033 __m128i alpha = xmm_alpha; 3034 __m128i mask = 
xmm_mask; 3035 __m128i dest = unpack_32_1x128 (d); 3036 3037 *dst++ = pack_1x128_32 ( 3038 in_over_1x128 (&src, &alpha, &mask, &dest)); 3039 3040 w--; 3041 } 3042 } 3043 3044 } 3045 3046 static void 3047 sse2_composite_over_8888_8888 (pixman_implementation_t *imp, 3048 pixman_composite_info_t *info) 3049 { 3050 PIXMAN_COMPOSITE_ARGS (info); 3051 int dst_stride, src_stride; 3052 uint32_t *dst_line, *dst; 3053 uint32_t *src_line, *src; 3054 3055 PIXMAN_IMAGE_GET_LINE ( 3056 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3057 PIXMAN_IMAGE_GET_LINE ( 3058 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3059 3060 dst = dst_line; 3061 src = src_line; 3062 3063 while (height--) 3064 { 3065 sse2_combine_over_u (imp, op, dst, src, NULL, width); 3066 3067 dst += dst_stride; 3068 src += src_stride; 3069 } 3070 } 3071 3072 static force_inline uint16_t 3073 composite_over_8888_0565pixel (uint32_t src, uint16_t dst) 3074 { 3075 __m128i ms; 3076 3077 ms = unpack_32_1x128 (src); 3078 return pack_565_32_16 ( 3079 pack_1x128_32 ( 3080 over_1x128 ( 3081 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); 3082 } 3083 3084 static void 3085 sse2_composite_over_8888_0565 (pixman_implementation_t *imp, 3086 pixman_composite_info_t *info) 3087 { 3088 PIXMAN_COMPOSITE_ARGS (info); 3089 uint16_t *dst_line, *dst, d; 3090 uint32_t *src_line, *src, s; 3091 int dst_stride, src_stride; 3092 int32_t w; 3093 3094 __m128i xmm_alpha_lo, xmm_alpha_hi; 3095 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 3096 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3097 3098 PIXMAN_IMAGE_GET_LINE ( 3099 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3100 PIXMAN_IMAGE_GET_LINE ( 3101 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3102 3103 while (height--) 3104 { 3105 dst = dst_line; 3106 src = src_line; 3107 3108 dst_line += dst_stride; 3109 src_line += src_stride; 3110 w = width; 3111 3112 /* Align dst on a 16-byte boundary */ 3113 while (w 
&& 3114 ((uintptr_t)dst & 15)) 3115 { 3116 s = *src++; 3117 d = *dst; 3118 3119 *dst++ = composite_over_8888_0565pixel (s, d); 3120 w--; 3121 } 3122 3123 /* It's a 8 pixel loop */ 3124 while (w >= 8) 3125 { 3126 /* I'm loading unaligned because I'm not sure 3127 * about the address alignment. 3128 */ 3129 xmm_src = load_128_unaligned ((__m128i*) src); 3130 xmm_dst = load_128_aligned ((__m128i*) dst); 3131 3132 /* Unpacking */ 3133 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3134 unpack_565_128_4x128 (xmm_dst, 3135 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3136 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 3137 &xmm_alpha_lo, &xmm_alpha_hi); 3138 3139 /* I'm loading next 4 pixels from memory 3140 * before to optimze the memory read. 3141 */ 3142 xmm_src = load_128_unaligned ((__m128i*) (src + 4)); 3143 3144 over_2x128 (&xmm_src_lo, &xmm_src_hi, 3145 &xmm_alpha_lo, &xmm_alpha_hi, 3146 &xmm_dst0, &xmm_dst1); 3147 3148 /* Unpacking */ 3149 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3150 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 3151 &xmm_alpha_lo, &xmm_alpha_hi); 3152 3153 over_2x128 (&xmm_src_lo, &xmm_src_hi, 3154 &xmm_alpha_lo, &xmm_alpha_hi, 3155 &xmm_dst2, &xmm_dst3); 3156 3157 save_128_aligned ( 3158 (__m128i*)dst, pack_565_4x128_128 ( 3159 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 3160 3161 w -= 8; 3162 dst += 8; 3163 src += 8; 3164 } 3165 3166 while (w--) 3167 { 3168 s = *src++; 3169 d = *dst; 3170 3171 *dst++ = composite_over_8888_0565pixel (s, d); 3172 } 3173 } 3174 3175 } 3176 3177 static void 3178 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, 3179 pixman_composite_info_t *info) 3180 { 3181 PIXMAN_COMPOSITE_ARGS (info); 3182 uint32_t src, srca; 3183 uint32_t *dst_line, *dst; 3184 uint8_t *mask_line, *mask; 3185 int dst_stride, mask_stride; 3186 int32_t w; 3187 uint32_t d; 3188 3189 __m128i xmm_src, xmm_alpha, xmm_def; 3190 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 3191 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3192 
3193 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 3194 3195 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3196 3197 srca = src >> 24; 3198 if (src == 0) 3199 return; 3200 3201 PIXMAN_IMAGE_GET_LINE ( 3202 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3203 PIXMAN_IMAGE_GET_LINE ( 3204 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3205 3206 xmm_def = create_mask_2x32_128 (src, src); 3207 xmm_src = expand_pixel_32_1x128 (src); 3208 xmm_alpha = expand_alpha_1x128 (xmm_src); 3209 mmx_src = xmm_src; 3210 mmx_alpha = xmm_alpha; 3211 3212 while (height--) 3213 { 3214 dst = dst_line; 3215 dst_line += dst_stride; 3216 mask = mask_line; 3217 mask_line += mask_stride; 3218 w = width; 3219 3220 while (w && (uintptr_t)dst & 15) 3221 { 3222 uint8_t m = *mask++; 3223 3224 if (m) 3225 { 3226 d = *dst; 3227 mmx_mask = expand_pixel_8_1x128 (m); 3228 mmx_dest = unpack_32_1x128 (d); 3229 3230 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, 3231 &mmx_alpha, 3232 &mmx_mask, 3233 &mmx_dest)); 3234 } 3235 3236 w--; 3237 dst++; 3238 } 3239 3240 while (w >= 4) 3241 { 3242 uint32_t m; 3243 memcpy(&m, mask, sizeof(uint32_t)); 3244 3245 if (srca == 0xff && m == 0xffffffff) 3246 { 3247 save_128_aligned ((__m128i*)dst, xmm_def); 3248 } 3249 else if (m) 3250 { 3251 xmm_dst = load_128_aligned ((__m128i*) dst); 3252 xmm_mask = unpack_32_1x128 (m); 3253 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3254 3255 /* Unpacking */ 3256 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 3257 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3258 3259 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3260 &xmm_mask_lo, &xmm_mask_hi); 3261 3262 in_over_2x128 (&xmm_src, &xmm_src, 3263 &xmm_alpha, &xmm_alpha, 3264 &xmm_mask_lo, &xmm_mask_hi, 3265 &xmm_dst_lo, &xmm_dst_hi); 3266 3267 save_128_aligned ( 3268 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3269 } 3270 3271 w -= 4; 3272 dst += 4; 3273 mask += 4; 
3274 } 3275 3276 while (w) 3277 { 3278 uint8_t m = *mask++; 3279 3280 if (m) 3281 { 3282 d = *dst; 3283 mmx_mask = expand_pixel_8_1x128 (m); 3284 mmx_dest = unpack_32_1x128 (d); 3285 3286 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, 3287 &mmx_alpha, 3288 &mmx_mask, 3289 &mmx_dest)); 3290 } 3291 3292 w--; 3293 dst++; 3294 } 3295 } 3296 3297 } 3298 3299 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) 3300 __attribute__((__force_align_arg_pointer__)) 3301 #endif 3302 static pixman_bool_t 3303 sse2_fill (pixman_implementation_t *imp, 3304 uint32_t * bits, 3305 int stride, 3306 int bpp, 3307 int x, 3308 int y, 3309 int width, 3310 int height, 3311 uint32_t filler) 3312 { 3313 uint32_t byte_width; 3314 uint8_t *byte_line; 3315 3316 __m128i xmm_def; 3317 3318 if (bpp == 8) 3319 { 3320 uint32_t b; 3321 uint32_t w; 3322 3323 stride = stride * (int) sizeof (uint32_t) / 1; 3324 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); 3325 byte_width = width; 3326 stride *= 1; 3327 3328 b = filler & 0xff; 3329 w = (b << 8) | b; 3330 filler = (w << 16) | w; 3331 } 3332 else if (bpp == 16) 3333 { 3334 stride = stride * (int) sizeof (uint32_t) / 2; 3335 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); 3336 byte_width = 2 * width; 3337 stride *= 2; 3338 3339 filler = (filler & 0xffff) * 0x00010001; 3340 } 3341 else if (bpp == 32) 3342 { 3343 stride = stride * (int) sizeof (uint32_t) / 4; 3344 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); 3345 byte_width = 4 * width; 3346 stride *= 4; 3347 } 3348 else 3349 { 3350 return FALSE; 3351 } 3352 3353 xmm_def = create_mask_2x32_128 (filler, filler); 3354 3355 while (height--) 3356 { 3357 int w; 3358 uint8_t *d = byte_line; 3359 byte_line += stride; 3360 w = byte_width; 3361 3362 if (w >= 1 && ((uintptr_t)d & 1)) 3363 { 3364 *(uint8_t *)d = filler; 3365 w -= 1; 3366 d += 1; 3367 } 3368 3369 while (w >= 2 && ((uintptr_t)d & 3)) 3370 { 3371 *(uint16_t *)d = filler; 3372 w -= 2; 3373 d += 
2; 3374 } 3375 3376 while (w >= 4 && ((uintptr_t)d & 15)) 3377 { 3378 *(uint32_t *)d = filler; 3379 3380 w -= 4; 3381 d += 4; 3382 } 3383 3384 while (w >= 128) 3385 { 3386 save_128_aligned ((__m128i*)(d), xmm_def); 3387 save_128_aligned ((__m128i*)(d + 16), xmm_def); 3388 save_128_aligned ((__m128i*)(d + 32), xmm_def); 3389 save_128_aligned ((__m128i*)(d + 48), xmm_def); 3390 save_128_aligned ((__m128i*)(d + 64), xmm_def); 3391 save_128_aligned ((__m128i*)(d + 80), xmm_def); 3392 save_128_aligned ((__m128i*)(d + 96), xmm_def); 3393 save_128_aligned ((__m128i*)(d + 112), xmm_def); 3394 3395 d += 128; 3396 w -= 128; 3397 } 3398 3399 if (w >= 64) 3400 { 3401 save_128_aligned ((__m128i*)(d), xmm_def); 3402 save_128_aligned ((__m128i*)(d + 16), xmm_def); 3403 save_128_aligned ((__m128i*)(d + 32), xmm_def); 3404 save_128_aligned ((__m128i*)(d + 48), xmm_def); 3405 3406 d += 64; 3407 w -= 64; 3408 } 3409 3410 if (w >= 32) 3411 { 3412 save_128_aligned ((__m128i*)(d), xmm_def); 3413 save_128_aligned ((__m128i*)(d + 16), xmm_def); 3414 3415 d += 32; 3416 w -= 32; 3417 } 3418 3419 if (w >= 16) 3420 { 3421 save_128_aligned ((__m128i*)(d), xmm_def); 3422 3423 d += 16; 3424 w -= 16; 3425 } 3426 3427 while (w >= 4) 3428 { 3429 *(uint32_t *)d = filler; 3430 3431 w -= 4; 3432 d += 4; 3433 } 3434 3435 if (w >= 2) 3436 { 3437 *(uint16_t *)d = filler; 3438 w -= 2; 3439 d += 2; 3440 } 3441 3442 if (w >= 1) 3443 { 3444 *(uint8_t *)d = filler; 3445 w -= 1; 3446 d += 1; 3447 } 3448 } 3449 3450 return TRUE; 3451 } 3452 3453 static void 3454 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, 3455 pixman_composite_info_t *info) 3456 { 3457 PIXMAN_COMPOSITE_ARGS (info); 3458 uint32_t src, srca; 3459 uint32_t *dst_line, *dst; 3460 uint8_t *mask_line, *mask; 3461 int dst_stride, mask_stride; 3462 int32_t w; 3463 3464 __m128i xmm_src, xmm_def; 3465 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3466 3467 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3468 3469 srca 
= src >> 24; 3470 if (src == 0) 3471 { 3472 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, 3473 PIXMAN_FORMAT_BPP (dest_image->bits.format), 3474 dest_x, dest_y, width, height, 0); 3475 return; 3476 } 3477 3478 PIXMAN_IMAGE_GET_LINE ( 3479 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3480 PIXMAN_IMAGE_GET_LINE ( 3481 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3482 3483 xmm_def = create_mask_2x32_128 (src, src); 3484 xmm_src = expand_pixel_32_1x128 (src); 3485 3486 while (height--) 3487 { 3488 dst = dst_line; 3489 dst_line += dst_stride; 3490 mask = mask_line; 3491 mask_line += mask_stride; 3492 w = width; 3493 3494 while (w && (uintptr_t)dst & 15) 3495 { 3496 uint8_t m = *mask++; 3497 3498 if (m) 3499 { 3500 *dst = pack_1x128_32 ( 3501 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); 3502 } 3503 else 3504 { 3505 *dst = 0; 3506 } 3507 3508 w--; 3509 dst++; 3510 } 3511 3512 while (w >= 4) 3513 { 3514 uint32_t m; 3515 memcpy(&m, mask, sizeof(uint32_t)); 3516 3517 if (srca == 0xff && m == 0xffffffff) 3518 { 3519 save_128_aligned ((__m128i*)dst, xmm_def); 3520 } 3521 else if (m) 3522 { 3523 xmm_mask = unpack_32_1x128 (m); 3524 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3525 3526 /* Unpacking */ 3527 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3528 3529 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3530 &xmm_mask_lo, &xmm_mask_hi); 3531 3532 pix_multiply_2x128 (&xmm_src, &xmm_src, 3533 &xmm_mask_lo, &xmm_mask_hi, 3534 &xmm_mask_lo, &xmm_mask_hi); 3535 3536 save_128_aligned ( 3537 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); 3538 } 3539 else 3540 { 3541 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); 3542 } 3543 3544 w -= 4; 3545 dst += 4; 3546 mask += 4; 3547 } 3548 3549 while (w) 3550 { 3551 uint8_t m = *mask++; 3552 3553 if (m) 3554 { 3555 *dst = pack_1x128_32 ( 3556 pix_multiply_1x128 ( 3557 xmm_src, expand_pixel_8_1x128 (m))); 3558 } 3559 
else 3560 { 3561 *dst = 0; 3562 } 3563 3564 w--; 3565 dst++; 3566 } 3567 } 3568 3569 } 3570 3571 static void 3572 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, 3573 pixman_composite_info_t *info) 3574 { 3575 PIXMAN_COMPOSITE_ARGS (info); 3576 uint32_t src; 3577 uint16_t *dst_line, *dst, d; 3578 uint8_t *mask_line, *mask; 3579 int dst_stride, mask_stride; 3580 int32_t w; 3581 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 3582 3583 __m128i xmm_src, xmm_alpha; 3584 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3585 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3586 3587 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3588 3589 if (src == 0) 3590 return; 3591 3592 PIXMAN_IMAGE_GET_LINE ( 3593 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3594 PIXMAN_IMAGE_GET_LINE ( 3595 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 3596 3597 xmm_src = expand_pixel_32_1x128 (src); 3598 xmm_alpha = expand_alpha_1x128 (xmm_src); 3599 mmx_src = xmm_src; 3600 mmx_alpha = xmm_alpha; 3601 3602 while (height--) 3603 { 3604 dst = dst_line; 3605 dst_line += dst_stride; 3606 mask = mask_line; 3607 mask_line += mask_stride; 3608 w = width; 3609 3610 while (w && (uintptr_t)dst & 15) 3611 { 3612 uint8_t m = *mask++; 3613 3614 if (m) 3615 { 3616 d = *dst; 3617 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 3618 mmx_dest = expand565_16_1x128 (d); 3619 3620 *dst = pack_565_32_16 ( 3621 pack_1x128_32 ( 3622 in_over_1x128 ( 3623 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 3624 } 3625 3626 w--; 3627 dst++; 3628 } 3629 3630 while (w >= 8) 3631 { 3632 uint32_t m; 3633 3634 xmm_dst = load_128_aligned ((__m128i*) dst); 3635 unpack_565_128_4x128 (xmm_dst, 3636 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3637 3638 memcpy(&m, mask, sizeof(uint32_t)); 3639 mask += 4; 3640 3641 if (m) 3642 { 3643 xmm_mask = unpack_32_1x128 (m); 3644 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3645 3646 /* 
Unpacking */ 3647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3648 3649 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3650 &xmm_mask_lo, &xmm_mask_hi); 3651 3652 in_over_2x128 (&xmm_src, &xmm_src, 3653 &xmm_alpha, &xmm_alpha, 3654 &xmm_mask_lo, &xmm_mask_hi, 3655 &xmm_dst0, &xmm_dst1); 3656 } 3657 3658 memcpy(&m, mask, sizeof(uint32_t)); 3659 mask += 4; 3660 3661 if (m) 3662 { 3663 xmm_mask = unpack_32_1x128 (m); 3664 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 3665 3666 /* Unpacking */ 3667 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3668 3669 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 3670 &xmm_mask_lo, &xmm_mask_hi); 3671 in_over_2x128 (&xmm_src, &xmm_src, 3672 &xmm_alpha, &xmm_alpha, 3673 &xmm_mask_lo, &xmm_mask_hi, 3674 &xmm_dst2, &xmm_dst3); 3675 } 3676 3677 save_128_aligned ( 3678 (__m128i*)dst, pack_565_4x128_128 ( 3679 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 3680 3681 w -= 8; 3682 dst += 8; 3683 } 3684 3685 while (w) 3686 { 3687 uint8_t m = *mask++; 3688 3689 if (m) 3690 { 3691 d = *dst; 3692 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 3693 mmx_dest = expand565_16_1x128 (d); 3694 3695 *dst = pack_565_32_16 ( 3696 pack_1x128_32 ( 3697 in_over_1x128 ( 3698 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 3699 } 3700 3701 w--; 3702 dst++; 3703 } 3704 } 3705 3706 } 3707 3708 static void 3709 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, 3710 pixman_composite_info_t *info) 3711 { 3712 PIXMAN_COMPOSITE_ARGS (info); 3713 uint16_t *dst_line, *dst, d; 3714 uint32_t *src_line, *src, s; 3715 int dst_stride, src_stride; 3716 int32_t w; 3717 uint32_t opaque, zero; 3718 3719 __m128i ms; 3720 __m128i xmm_src, xmm_src_lo, xmm_src_hi; 3721 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3722 3723 PIXMAN_IMAGE_GET_LINE ( 3724 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3725 PIXMAN_IMAGE_GET_LINE ( 3726 src_image, src_x, src_y, uint32_t, src_stride, 
src_line, 1); 3727 3728 while (height--) 3729 { 3730 dst = dst_line; 3731 dst_line += dst_stride; 3732 src = src_line; 3733 src_line += src_stride; 3734 w = width; 3735 3736 while (w && (uintptr_t)dst & 15) 3737 { 3738 s = *src++; 3739 d = *dst; 3740 3741 ms = unpack_32_1x128 (s); 3742 3743 *dst++ = pack_565_32_16 ( 3744 pack_1x128_32 ( 3745 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); 3746 w--; 3747 } 3748 3749 while (w >= 8) 3750 { 3751 /* First round */ 3752 xmm_src = load_128_unaligned ((__m128i*)src); 3753 xmm_dst = load_128_aligned ((__m128i*)dst); 3754 3755 opaque = is_opaque (xmm_src); 3756 zero = is_zero (xmm_src); 3757 3758 unpack_565_128_4x128 (xmm_dst, 3759 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3760 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3761 3762 /* preload next round*/ 3763 xmm_src = load_128_unaligned ((__m128i*)(src + 4)); 3764 3765 if (opaque) 3766 { 3767 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 3768 &xmm_dst0, &xmm_dst1); 3769 } 3770 else if (!zero) 3771 { 3772 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 3773 &xmm_dst0, &xmm_dst1); 3774 } 3775 3776 /* Second round */ 3777 opaque = is_opaque (xmm_src); 3778 zero = is_zero (xmm_src); 3779 3780 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 3781 3782 if (opaque) 3783 { 3784 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 3785 &xmm_dst2, &xmm_dst3); 3786 } 3787 else if (!zero) 3788 { 3789 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 3790 &xmm_dst2, &xmm_dst3); 3791 } 3792 3793 save_128_aligned ( 3794 (__m128i*)dst, pack_565_4x128_128 ( 3795 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 3796 3797 w -= 8; 3798 src += 8; 3799 dst += 8; 3800 } 3801 3802 while (w) 3803 { 3804 s = *src++; 3805 d = *dst; 3806 3807 ms = unpack_32_1x128 (s); 3808 3809 *dst++ = pack_565_32_16 ( 3810 pack_1x128_32 ( 3811 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); 3812 w--; 3813 } 3814 } 3815 3816 } 3817 3818 static void 3819 sse2_composite_over_pixbuf_8888 
(pixman_implementation_t *imp, 3820 pixman_composite_info_t *info) 3821 { 3822 PIXMAN_COMPOSITE_ARGS (info); 3823 uint32_t *dst_line, *dst, d; 3824 uint32_t *src_line, *src, s; 3825 int dst_stride, src_stride; 3826 int32_t w; 3827 uint32_t opaque, zero; 3828 3829 __m128i xmm_src_lo, xmm_src_hi; 3830 __m128i xmm_dst_lo, xmm_dst_hi; 3831 3832 PIXMAN_IMAGE_GET_LINE ( 3833 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 3834 PIXMAN_IMAGE_GET_LINE ( 3835 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 3836 3837 while (height--) 3838 { 3839 dst = dst_line; 3840 dst_line += dst_stride; 3841 src = src_line; 3842 src_line += src_stride; 3843 w = width; 3844 3845 while (w && (uintptr_t)dst & 15) 3846 { 3847 s = *src++; 3848 d = *dst; 3849 3850 *dst++ = pack_1x128_32 ( 3851 over_rev_non_pre_1x128 ( 3852 unpack_32_1x128 (s), unpack_32_1x128 (d))); 3853 3854 w--; 3855 } 3856 3857 while (w >= 4) 3858 { 3859 xmm_src_hi = load_128_unaligned ((__m128i*)src); 3860 3861 opaque = is_opaque (xmm_src_hi); 3862 zero = is_zero (xmm_src_hi); 3863 3864 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 3865 3866 if (opaque) 3867 { 3868 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 3869 &xmm_dst_lo, &xmm_dst_hi); 3870 3871 save_128_aligned ( 3872 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3873 } 3874 else if (!zero) 3875 { 3876 xmm_dst_hi = load_128_aligned ((__m128i*)dst); 3877 3878 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 3879 3880 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 3881 &xmm_dst_lo, &xmm_dst_hi); 3882 3883 save_128_aligned ( 3884 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 3885 } 3886 3887 w -= 4; 3888 dst += 4; 3889 src += 4; 3890 } 3891 3892 while (w) 3893 { 3894 s = *src++; 3895 d = *dst; 3896 3897 *dst++ = pack_1x128_32 ( 3898 over_rev_non_pre_1x128 ( 3899 unpack_32_1x128 (s), unpack_32_1x128 (d))); 3900 3901 w--; 3902 } 3903 } 3904 3905 } 3906 3907 static void 3908 
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, 3909 pixman_composite_info_t *info) 3910 { 3911 PIXMAN_COMPOSITE_ARGS (info); 3912 uint32_t src; 3913 uint16_t *dst_line, *dst, d; 3914 uint32_t *mask_line, *mask, m; 3915 int dst_stride, mask_stride; 3916 int w; 3917 uint32_t pack_cmp; 3918 3919 __m128i xmm_src, xmm_alpha; 3920 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 3921 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 3922 3923 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 3924 3925 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 3926 3927 if (src == 0) 3928 return; 3929 3930 PIXMAN_IMAGE_GET_LINE ( 3931 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 3932 PIXMAN_IMAGE_GET_LINE ( 3933 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 3934 3935 xmm_src = expand_pixel_32_1x128 (src); 3936 xmm_alpha = expand_alpha_1x128 (xmm_src); 3937 mmx_src = xmm_src; 3938 mmx_alpha = xmm_alpha; 3939 3940 while (height--) 3941 { 3942 w = width; 3943 mask = mask_line; 3944 dst = dst_line; 3945 mask_line += mask_stride; 3946 dst_line += dst_stride; 3947 3948 while (w && ((uintptr_t)dst & 15)) 3949 { 3950 m = *(uint32_t *) mask; 3951 3952 if (m) 3953 { 3954 d = *dst; 3955 mmx_mask = unpack_32_1x128 (m); 3956 mmx_dest = expand565_16_1x128 (d); 3957 3958 *dst = pack_565_32_16 ( 3959 pack_1x128_32 ( 3960 in_over_1x128 ( 3961 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 3962 } 3963 3964 w--; 3965 dst++; 3966 mask++; 3967 } 3968 3969 while (w >= 8) 3970 { 3971 /* First round */ 3972 xmm_mask = load_128_unaligned ((__m128i*)mask); 3973 xmm_dst = load_128_aligned ((__m128i*)dst); 3974 3975 pack_cmp = _mm_movemask_epi8 ( 3976 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 3977 3978 unpack_565_128_4x128 (xmm_dst, 3979 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 3980 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3981 3982 /* preload next round */ 3983 xmm_mask = load_128_unaligned 
((__m128i*)(mask + 4)); 3984 3985 /* preload next round */ 3986 if (pack_cmp != 0xffff) 3987 { 3988 in_over_2x128 (&xmm_src, &xmm_src, 3989 &xmm_alpha, &xmm_alpha, 3990 &xmm_mask_lo, &xmm_mask_hi, 3991 &xmm_dst0, &xmm_dst1); 3992 } 3993 3994 /* Second round */ 3995 pack_cmp = _mm_movemask_epi8 ( 3996 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 3997 3998 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 3999 4000 if (pack_cmp != 0xffff) 4001 { 4002 in_over_2x128 (&xmm_src, &xmm_src, 4003 &xmm_alpha, &xmm_alpha, 4004 &xmm_mask_lo, &xmm_mask_hi, 4005 &xmm_dst2, &xmm_dst3); 4006 } 4007 4008 save_128_aligned ( 4009 (__m128i*)dst, pack_565_4x128_128 ( 4010 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 4011 4012 w -= 8; 4013 dst += 8; 4014 mask += 8; 4015 } 4016 4017 while (w) 4018 { 4019 m = *(uint32_t *) mask; 4020 4021 if (m) 4022 { 4023 d = *dst; 4024 mmx_mask = unpack_32_1x128 (m); 4025 mmx_dest = expand565_16_1x128 (d); 4026 4027 *dst = pack_565_32_16 ( 4028 pack_1x128_32 ( 4029 in_over_1x128 ( 4030 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 4031 } 4032 4033 w--; 4034 dst++; 4035 mask++; 4036 } 4037 } 4038 4039 } 4040 4041 static void 4042 sse2_composite_in_n_8_8 (pixman_implementation_t *imp, 4043 pixman_composite_info_t *info) 4044 { 4045 PIXMAN_COMPOSITE_ARGS (info); 4046 uint8_t *dst_line, *dst; 4047 uint8_t *mask_line, *mask; 4048 int dst_stride, mask_stride; 4049 uint32_t d; 4050 uint32_t src; 4051 int32_t w; 4052 4053 __m128i xmm_alpha; 4054 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 4055 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 4056 4057 PIXMAN_IMAGE_GET_LINE ( 4058 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 4059 PIXMAN_IMAGE_GET_LINE ( 4060 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 4061 4062 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 4063 4064 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 4065 4066 while (height--) 4067 { 4068 dst = dst_line; 4069 
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;
	    w = width;

	    /* Scalar head: advance dst to a 16-byte boundary one pixel at
	     * a time so the vector loop below can use aligned stores.
	     */
	    while (w && ((uintptr_t)dst & 15))
	    {
		uint8_t m = *mask++;
		d = (uint32_t) *dst;

		*dst++ = (uint8_t) pack_1x128_32 (
		    pix_multiply_1x128 (
			pix_multiply_1x128 (xmm_alpha,
					    unpack_32_1x128 (m)),
			unpack_32_1x128 (d)));
		w--;
	    }

	    /* Vector body: 16 a8 pixels per iteration; dst is aligned,
	     * mask may not be.
	     */
	    while (w >= 16)
	    {
		xmm_mask = load_128_unaligned ((__m128i*)mask);
		xmm_dst = load_128_aligned ((__m128i*)dst);

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		/* mask' = alpha * mask, then dst = mask' * dst */
		pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
				    &xmm_dst_lo, &xmm_dst_hi,
				    &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

		mask += 16;
		dst += 16;
		w -= 16;
	    }

	    /* Scalar tail: remaining (< 16) pixels. */
	    while (w)
	    {
		uint8_t m = *mask++;
		d = (uint32_t) *dst;

		*dst++ = (uint8_t) pack_1x128_32 (
		    pix_multiply_1x128 (
			pix_multiply_1x128 (
			    xmm_alpha, unpack_32_1x128 (m)),
			unpack_32_1x128 (d)));
		w--;
	    }
	}

}

/* IN operator, solid source, a8 destination, no mask:
 * dst = src.alpha * dst (per 8-bit pixel).
 */
static void
sse2_composite_in_n_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    int dst_stride;
    uint32_t d;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Broadcast the source alpha into all eight 16-bit lanes. */
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    src = src >> 24;

    /* alpha == 0xff: IN leaves the destination unchanged. */
    if (src == 0xff)
	return;

    /* alpha == 0x00: IN clears the destination; use pixman_fill. */
    if (src == 0x00)
    {
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
	             8, dest_x, dest_y, width, height, src);

	return;
    }

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && ((uintptr_t)dst & 15))
	{
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    xmm_alpha,
		    unpack_32_1x128 (d)));
	    w--;
	}

	/* Vector body: 16 pixels per iteration, aligned load/store. */
	while (w >= 16)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail. */
	while (w)
	{
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    xmm_alpha,
		    unpack_32_1x128 (d)));
	    w--;
	}
    }

}

/* IN operator, a8 source, a8 destination, no mask:
 * dst = src * dst (per 8-bit pixel, rounded multiply via pix_multiply).
 */
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;
    uint32_t s, d;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && ((uintptr_t)dst & 15))
	{
	    s = (uint32_t) *src++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128
		    (d)));
	    w--;
	}

	/* Vector body: 16 a8 pixels per iteration; dst aligned, src may
	 * not be.
	 */
	while (w >= 16)
	{
	    xmm_src = load_128_unaligned ((__m128i*)src);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    src += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail. */
	while (w)
	{
	    s = (uint32_t) *src++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
	    w--;
	}
    }

}

/* ADD operator, solid source, a8 mask, a8 destination:
 * dst = saturate (src.alpha * mask + dst), per 8-bit pixel.
 */
static void
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
                          pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint32_t d;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Broadcast source alpha into all eight 16-bit lanes. */
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && ((uintptr_t)dst & 15))
	{
	    uint8_t m = *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		_mm_adds_epu16 (
		    pix_multiply_1x128 (
			xmm_alpha, unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));
	    w--;
	}

	/* Vector body: 16 pixels per iteration; saturating 16-bit add
	 * keeps results within 0..255 after packing.
	 */
	while (w >= 16)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_mask_lo, &xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    mask += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail. */
	while (w)
	{
	    uint8_t m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		_mm_adds_epu16 (
		    pix_multiply_1x128 (
			xmm_alpha, unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));

	    w--;
	}
    }

}

/* ADD operator, solid source, a8 destination, no mask:
 * dst = saturate (src.alpha + dst), per 8-bit pixel.
 */
static void
sse2_composite_add_n_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    int dst_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    src >>= 24;

    /* Adding zero is a no-op. */
    if (src == 0x00)
	return;

    /* Adding 0xff saturates every pixel to 0xff; use pixman_fill. */
    if (src == 0xff)
    {
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
	             8, dest_x, dest_y, width, height, 0xff);

	return;
    }

    /* Replicate the alpha byte into all 16 byte lanes. */
    src = (src << 24) | (src << 16) | (src << 8) | src;
    xmm_src = _mm_set_epi32 (src, src, src, src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && ((uintptr_t)dst & 15))
	{
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
		_mm_adds_epu8 (
		    xmm_src,
		    _mm_cvtsi32_si128 (*dst)));

	    w--;
	    dst++;
	}

	/* Vector body: 16 bytes per iteration with a saturating
	 * unsigned byte add.
	 */
	while (w >= 16)
	{
	    save_128_aligned (
		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail. */
	while (w)
	{
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
		_mm_adds_epu8 (
		    xmm_src,
		    _mm_cvtsi32_si128 (*dst)));

	    w--;
	    dst++;
	}
    }

}

/* ADD operator, a8 source, a8 destination, no mask:
 * dst = saturate (src + dst), per 8-bit pixel.  Aligns dst to 4 bytes
 * and then reuses the 32-bit ADD combiner on groups of 4 pixels.
 */
static void
sse2_composite_add_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	src = src_line;

	dst_line += dst_stride;
	src_line += src_stride;
	w = width;

	/* Small head */
	while (w && (uintptr_t)dst & 3)
	{
	    /* t | (0 - (t >> 8)) saturates: high bit set -> 0xff mask. */
	    t = (*dst) + (*src++);
	    *dst++ = t | (0 - (t >> 8));
	    w--;
	}

	/* Middle: w >> 2 32-bit words, 4 a8 pixels each. */
	sse2_combine_add_u (imp, op,
	                    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

	/* Small tail */
	dst += w & 0xfffc;
	src += w & 0xfffc;

	w &= 3;

	while (w)
	{
	    t = (*dst) + (*src++);
	    *dst++ = t | (0 - (t >> 8));
	    w--;
	}
    }

}

/* ADD operator, a8r8g8b8 source and destination, no mask:
 * delegates each scanline to the SSE2 ADD combiner.
 */
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	sse2_combine_add_u (imp, op, dst, src, NULL, width);
    }
}

/* ADD operator, solid source, a8r8g8b8 destination, no mask:
 * dst = saturate (src + dst), per byte.
 */
static void
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst, src;
    int dst_stride;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    /* Adding zero is a no-op. */
    if (src == 0)
	return;

    /* Adding 0xffffffff saturates everything; use pixman_fill. */
    if (src == ~0)
    {
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
	             dest_x, dest_y, width, height, ~0);

	return;
    }

    xmm_src = _mm_set_epi32 (src, src, src, src);
    while (height--)
    {
	int w = width;
	uint32_t d;

	dst = dst_line;
	dst_line += dst_stride;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    d = *dst;
	    *dst++ =
		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
	    w--;
	}

	/* Vector body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    save_128_aligned
		((__m128i*)dst,
		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

	    dst += 4;
	    w -= 4;
	}

	/* Scalar tail. */
	while (w--)
	{
	    d = *dst;
	    *dst++ =
		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
						  _mm_cvtsi32_si128 (d)));
	}
    }
}

/* ADD operator, solid source, a8 mask, a8r8g8b8 destination:
 * dst = saturate (src * mask + dst).
 */
static void
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    /* Adding zero is a no-op. */
    if (src == 0)
	return;
    xmm_src = expand_pixel_32_1x128 (src);
    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Scalar head until dst is 16-byte aligned; zero mask bytes are
	 * skipped entirely (ADD with zero contribution).
	 */
	while (w && ((uintptr_t)dst & 15))
	{
	    uint8_t m = *mask++;
	    if (m)
	    {
		*dst = pack_1x128_32
		    (_mm_adds_epu16
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
		      unpack_32_1x128 (*dst)));
	    }
	    dst++;
	    w--;
	}

	/* Vector body: 4 pixels / 4 mask bytes per iteration.  memcpy
	 * performs a safe unaligned 32-bit mask load.
	 */
	while (w >= 4)
	{
	    uint32_t m;
	    memcpy(&m, mask, sizeof(uint32_t));

	    if (m)
	    {
		__m128i xmm_mask_lo, xmm_mask_hi;
		__m128i xmm_dst_lo, xmm_dst_hi;

		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
		__m128i xmm_mask =
		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
				       _mm_setzero_si128 ());

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);

		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Scalar tail. */
	while (w)
	{
	    uint8_t m = *mask++;
	    if (m)
	    {
		*dst = pack_1x128_32
		    (_mm_adds_epu16
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
		      unpack_32_1x128 (*dst)));
	    }
	    dst++;
	    w--;
	}
    }
}

/* Raw rectangle copy between two bits images of equal bpp (16 or 32
 * only).  Strides are given in uint32_t units and converted to byte
 * strides.  Returns FALSE when the format combination is unsupported so
 * the caller can fall back to a generic path.
 */
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dest_x,
          int                      dest_y,
          int                      width,
          int                      height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* stride: uint32_t units -> uint16_t units, then bytes. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	/* stride: uint32_t units -> uint32_t units, then bytes. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* Align d to 4 bytes, then to 16 bytes, 2/4 bytes at a time. */
	while (w >= 2 && ((uintptr_t)d & 3))
	{
	    memmove(d, s, 2);
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 15))
	{
	    memmove(d, s, 4);

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Main copy loop: 64 bytes (4 registers) per iteration. */
	while (w >= 64)
	{
	    __m128i xmm0, xmm1, xmm2, xmm3;

	    xmm0 = load_128_unaligned ((__m128i*)(s));
	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));

	    save_128_aligned ((__m128i*)(d),      xmm0);
	    save_128_aligned ((__m128i*)(d + 16), xmm1);
	    save_128_aligned ((__m128i*)(d + 32), xmm2);
	    save_128_aligned ((__m128i*)(d + 48), xmm3);

	    s += 64;
	    d += 64;
	    w -= 64;
	}

	while (w >= 16)
	{
	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );

	    w -= 16;
	    d += 16;
	    s += 16;
	}

	/* Sub-16-byte tail. */
	while (w >= 4)
	{
	    memmove(d, s, 4);

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	if (w >= 2)
	{
	    memmove(d, s, 2);
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    return TRUE;
}

/* SRC-style copy fast path: forwards the composite rectangle to
 * sse2_blt using the images' formats and strides.
 */
static void
sse2_composite_copy_area (pixman_implementation_t *imp,
                          pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    sse2_blt (imp, src_image->bits.bits,
	      dest_image->bits.bits,
	      src_image->bits.rowstride,
	      dest_image->bits.rowstride,
	      PIXMAN_FORMAT_BPP (src_image->bits.format),
	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
	      src_x, src_y, dest_x, dest_y, width, height);
}

/* OVER operator, x8r8g8b8 source (alpha forced to 0xff), a8 mask,
 * a8r8g8b8 destination.
 */
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;
    __m128i ms;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while
	    (w && (uintptr_t)dst & 15)
	{
	    uint8_t m = *mask++;
	    /* Source alpha is ignored for x888: force it opaque. */
	    s = 0xff000000 | *src++;
	    d = *dst;
	    ms = unpack_32_1x128 (s);

	    if (m != 0xff)
	    {
		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		__m128i md = unpack_32_1x128 (d);

		ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
	    }

	    *dst++ = pack_1x128_32 (ms);
	    w--;
	}

	/* Vector body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    uint32_t m;
	    memcpy(&m, mask, sizeof(uint32_t));
	    xmm_src = _mm_or_si128 (
		load_128_unaligned ((__m128i*)src), mask_ff000000);

	    /* Fully opaque mask: source (made opaque) replaces dest. */
	    if (m == 0xffffffff)
	    {
		save_128_aligned ((__m128i*)dst, xmm_src);
	    }
	    else
	    {
		xmm_dst = load_128_aligned ((__m128i*)dst);

		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_rev_2x128 (
		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    src += 4;
	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Scalar tail; zero mask bytes leave dest untouched. */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		s = 0xff000000 | *src;

		if (m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ma, md, ms;

		    d = *dst;

		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		    md = unpack_32_1x128 (d);
		    ms = unpack_32_1x128 (s);

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
		}

	    }

	    src++;
	    dst++;
	    w--;
	}
    }

}

/* OVER operator, a8r8g8b8 source, a8 mask, a8r8g8b8 destination. */
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    uint32_t sa;
	    uint8_t m = *mask++;

	    s = *src++;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		/* Opaque source under an opaque mask: plain store. */
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}

	/* Vector body: 4 pixels per iteration, skipping fully-zero
	 * mask words.
	 */
	while (w >= 4)
	{
	    uint32_t m;
	    memcpy(&m, mask, sizeof(uint32_t));

	    if (m)
	    {
		xmm_src = load_128_unaligned ((__m128i*)src);

		if (m == 0xffffffff && is_opaque (xmm_src))
		{
		    save_128_aligned ((__m128i *)dst, xmm_src);
		}
		else
		{
		    xmm_dst = load_128_aligned ((__m128i *)dst);

		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
		}
	    }

	    src += 4;
	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Scalar tail. */
	while (w)
	{
	    uint32_t sa;
	    uint8_t m = *mask++;

	    s = *src++;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}
    }

}

/* OVER_REVERSE operator, solid source, a8r8g8b8 destination:
 * dst = dst OVER src (destination composited over the solid color).
 */
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Transparent solid contributes nothing under OVER_REVERSE. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
	dst = dst_line;

	dst_line += dst_stride;
	w = width;

	/* Scalar head until dst is 16-byte aligned. */
	while (w && (uintptr_t)dst & 15)
	{
	    __m128i vd;
	    vd = unpack_32_1x128 (*dst);

	    /* dst OVER solid: dest is the "source" of the over. */
	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
					      xmm_src));
	    w--;
	    dst++;
	}

	/* Vector body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    __m128i tmp_lo, tmp_hi;

	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

	    /* over_2x128 accumulates into its last args; feed it copies
	     * of the solid source.
	     */
	    tmp_lo = xmm_src;
	    tmp_hi = xmm_src;

	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			&xmm_dsta_lo, &xmm_dsta_hi,
			&tmp_lo, &tmp_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

	    w -= 4;
	    dst += 4;
	}

	/* Scalar tail. */
	while (w)
	{
	    __m128i vd;

	    vd = unpack_32_1x128 (*dst);

	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
					      xmm_src));
	    w--;
	    dst++;
	}

    }

}

/* OVER operator, a8r8g8b8 source, a8r8g8b8 mask (only the alpha
 * channel of the mask is used), a8r8g8b8 destination.
 */
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint32_t *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	/* Scalar head until dst is 16-byte aligned; the mask pixel is
	 * reduced to its alpha byte.
	 */
	while (w && (uintptr_t)dst & 15)
	{
	    uint32_t sa;

	    s = *src++;
	    m = (*mask++) >> 24;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}

	/* Vector body: 4 pixels per iteration with transparent /
	 * opaque shortcuts on the whole mask vector.
	 */
	while (w >= 4)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);

	    if (!is_transparent (xmm_mask))
	    {
		xmm_src = load_128_unaligned ((__m128i*)src);

		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
		{
		    save_128_aligned ((__m128i *)dst, xmm_src);
		}
		else
		{
		    xmm_dst = load_128_aligned ((__m128i *)dst);

		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
		}
	    }

	    src += 4;
	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Scalar tail. */
	while (w)
	{
	    uint32_t sa;

	    s = *src++;
	    m = (*mask++) >> 24;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m128i ms, md, ma, msa;

		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		    ms = unpack_32_1x128 (s);
		    md = unpack_32_1x128 (d);

		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}
    }

}

/* A variant of 'sse2_combine_over_u' with minor tweaks: the source is
 * sampled with nearest-neighbor scaling (fixed-point vx stepped by
 * unit_x, wrapped into [−src_width_fixed, 0) for NORMAL repeat).
 */
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  src_width_fixed,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (fully_transparent_src)
	return;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	if (pm)
	    pm++;
	w--;
    }

    /* Vector body: gather 4 nearest-sampled pixels, then OVER with
     * opaque / zero shortcuts.
     */
    while (w >= 4)
    {
	__m128i tmp;
	uint32_t tmp1, tmp2, tmp3, tmp4;

	tmp1 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp2 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp3 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp4 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

	if (is_opaque (xmm_src_hi))
	{
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
	}
	else if (!is_zero (xmm_src_hi))
	{
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned ((__m128i*)pd,
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	w -= 4;
	pd += 4;
	if (pm)
	    pm += 4;
    }

    /* Scalar tail. */
    while (w)
    {
	d = *pd;
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	if (pm)
	    pm++;

	w--;
    }
}

/* Instantiate the nearest-scaled OVER fast paths for each repeat mode. */
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, NORMAL)

/* Nearest-scaled OVER with a solid mask: only the alpha byte of *mask
 * is used and is broadcast once into xmm_mask for the whole scanline.
 */
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               uint32_t *       dst,
                                               const uint32_t * src,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   src_width_fixed,
                                               pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Nothing to draw when the source or the mask alpha is zero. */
    if (zero_src || (*mask >> 24) == 0)
	return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    /* Scalar head until dst is 16-byte aligned. */
    while (w && (uintptr_t)dst & 15)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	if (s)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (s);
	    __m128i alpha = expand_alpha_1x128 (ms);
	    __m128i dest = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32 (
		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
	dst++;
	w--;
    }

    /* Vector body: gather 4 nearest-sampled source pixels per
     * iteration; all-zero gathers are skipped.
     */
    while (w >= 4)
    {
	uint32_t tmp1, tmp2, tmp3, tmp4;

	tmp1 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp2 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp3 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp4 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

	if (!is_zero (xmm_src))
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	dst += 4;
	w -= 4;
    }

    /* Scalar tail. */
    while (w)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	if (s)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (s);
	    __m128i alpha = expand_alpha_1x128 (ms);
	    __m128i mask = xmm_mask;
	    __m128i dest = unpack_32_1x128 (d);

	    *dst = pack_1x128_32 (
		in_over_1x128 (&ms, &alpha, &mask, &dest));
	}

	dst++;
	w--;
    }

}

/* Instantiate the solid-masked nearest-scaled OVER fast paths. */
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)

#if PSHUFD_IS_FAST

/***********************************************************************************/

/* PSHUFD variant: horizontal weights for 4 consecutive pixels are
 * computed once and reused via _mm_shuffle_epi32 on xmm_wh_state.
 */
# define BILINEAR_DECLARE_VARIABLES						\
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
					   unit_x, -unit_x, unit_x, -unit_x);	\
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4);		\
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
    __m128i xmm_wh_state;

#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
do {										\
    int phase = phase_;								\
    __m128i xmm_wh, xmm_a, xmm_b;						\
    /* fetch 2x2 pixel block into sse2 registers */				\
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
    vx += unit_x;								\
    /* vertical interpolation */						\
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
    /* calculate horizontal weights */						\
    if (phase <= 0)								\
    {										\
	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
					16 - BILINEAR_INTERPOLATION_BITS));	\
	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
	phase = 0;								\
    }										\
    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
							   phase, phase));	\
    /* horizontal interpolation */						\
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
	xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);			\
    /* shift the result */							\
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
} while (0)

#else /************************************************************************/

/* Non-PSHUFD variant: horizontal weights are recomputed from xmm_x for
 * every pixel (PSHUFD is slow on older CPUs, see PSHUFD_IS_FAST).
 */
# define BILINEAR_DECLARE_VARIABLES						\
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
					   unit_x, -unit_x, unit_x, -unit_x);	\
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4);		\
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
				   vx, -(vx + 1), vx, -(vx + 1))

#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
do {										\
    __m128i xmm_wh, xmm_a, xmm_b;						\
    /* fetch 2x2 pixel block into sse2 registers */				\
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
    vx += unit_x;								\
    /* vertical interpolation */						\
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
    /* calculate horizontal weights */						\
    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
					16 - BILINEAR_INTERPOLATION_BITS));	\
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
    /* horizontal interpolation */						\
    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
    /* shift the result */							\
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
} while (0)

/***********************************************************************************/

#endif

/* Produce one bilinear-filtered a8r8g8b8 pixel packed into a uint32_t. */
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\
do {										\
	__m128i xmm_pix;							\
	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
	pix = _mm_cvtsi128_si32 (xmm_pix);					\
} while(0)

/* Produce four bilinear-filtered pixels packed into one __m128i. */
#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\
do {										\
	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
} while(0)

#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x; \
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
} while(0)

/* Advance the interpolation state past four pixels without fetching them. */
#define BILINEAR_SKIP_FOUR_PIXELS() \
do { \
    vx += unit_x * 4; \
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
} while(0)

/***********************************************************************************/

/* Bilinear-scaled SRC scanline, a8r8g8b8 -> a8r8g8b8.
 *
 * Interpolates single pixels until dst is 16-byte aligned, then four at a
 * time with aligned stores, then handles the 2- and 1-pixel tail.  mask,
 * max_vx and zero_src are unused here (signature shared with the other
 * scanline functions via FAST_BILINEAR_MAINLOOP_COMMON).
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx_,
					     pixman_fixed_t   unit_x_,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    /* leading pixels, until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	*dst++ = pix1;
	w--;
    }

    while ((w -= 4) >= 0)
    {
	__m128i xmm_src;
	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
	_mm_store_si128 ((__m128i *)dst, xmm_src);
	dst += 4;
    }

    /* w is now in [-4, -1]; the low bits still encode the tail length */
    if (w & 2)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	*dst++ = pix1;
	*dst++ = pix2;
    }

    if (w & 1)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	*dst = pix1;
    }

}

FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

/* Bilinear-scaled SRC scanline, x8r8g8b8 -> a8r8g8b8: same structure as the
 * 8888 variant above but forces the alpha byte of every output pixel to
 * 0xff (scalar tail with "| 0xFF000000", vector path with mask_ff000000).
 */
static force_inline void
scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx_,
					     pixman_fixed_t   unit_x_,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	*dst++ = pix1 | 0xFF000000;
	w--;
    }

    while ((w -= 4) >= 0)
    {
	__m128i xmm_src;
	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
	_mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
	dst += 4;
    }

    if (w & 2)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	*dst++ = pix1 | 0xFF000000;
	*dst++ = pix2 | 0xFF000000;
    }

    if (w & 1)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	*dst = pix1 | 0xFF000000;
    }
}

FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

/* Bilinear-scaled OVER scanline, a8r8g8b8 -> a8r8g8b8 (no mask).
 *
 * Per pixel: skips the blend entirely when the interpolated source is zero,
 * stores it directly when a whole block of four is opaque, otherwise does
 * the full unpack / expand-alpha / over / pack sequence.
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
					      const uint32_t * mask,
					      const uint32_t * src_top,
					      const uint32_t * src_bottom,
					      int32_t          w,
					      int              wt,
					      int              wb,
					      pixman_fixed_t   vx_,
					      pixman_fixed_t   unit_x_,
					      pixman_fixed_t   max_vx,
					      pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    /* leading pixels, until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (pix1)
	{
	    pix2 = *dst;
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
	}

	w--;
	dst++;
    }

    while (w >= 4)
    {
	__m128i xmm_src;
	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
	__m128i xmm_alpha_hi, xmm_alpha_lo;

	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	if (!is_zero (xmm_src))
	{
	    if (is_opaque (xmm_src))
	    {
		/* fully opaque source: plain store, no blending needed */
		save_128_aligned ((__m128i *)dst, xmm_src);
	    }
	    else
	    {
		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	}

	w -= 4;
	dst += 4;
    }

    /* trailing pixels */
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (pix1)
	{
	    pix2 = *dst;
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
	}

	w--;
	dst++;
    }
}

FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

/* Bilinear-scaled OVER scanline with an a8 mask, a8r8g8b8 -> a8r8g8b8.
 *
 * Zero mask bytes skip the interpolation entirely (BILINEAR_SKIP_*); a
 * fully-opaque mask combined with an opaque source block is stored
 * directly; everything else goes through the in_over (mask IN src OVER
 * dst) path.  The four-wide loop reads 4 mask bytes at once via memcpy.
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
						const uint8_t *  mask,
						const uint32_t * src_top,
						const uint32_t * src_bottom,
						int32_t          w,
						int              wt,
						int              wb,
						pixman_fixed_t   vx_,
						pixman_fixed_t   unit_x_,
						pixman_fixed_t   max_vx,
						pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    /* leading pixels, until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
	uint32_t sa;
	uint8_t m = *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    sa = pix1 >> 24;

	    if (sa == 0xff && m == 0xff)
	    {
		/* opaque source, opaque mask: plain copy */
		*dst = pix1;
	    }
	    else
	    {
		__m128i ms, md, ma, msa;

		pix2 = *dst;
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		ms = unpack_32_1x128 (pix1);
		md = unpack_32_1x128 (pix2);

		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    while (w >= 4)
    {
	uint32_t m;

	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

	/* unaligned 4-byte mask load without aliasing issues */
	memcpy(&m, mask, sizeof(uint32_t));

	if (m)
	{
	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	    if (m == 0xffffffff && is_opaque (xmm_src))
	    {
		save_128_aligned ((__m128i *)dst, xmm_src);
	    }
	    else
	    {
		xmm_dst = load_128_aligned ((__m128i *)dst);

		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	}
	else
	{
	    BILINEAR_SKIP_FOUR_PIXELS ();
	}

	w -= 4;
	dst += 4;
	mask += 4;
    }

    /* trailing pixels */
    while (w)
    {
	uint32_t sa;
	uint8_t m = *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    sa = pix1 >> 24;

	    if (sa == 0xff && m == 0xff)
	    {
		*dst = pix1;
	    }
	    else
	    {
		__m128i ms, md, ma, msa;

		pix2 = *dst;
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		ms = unpack_32_1x128 (pix1);
		md = unpack_32_1x128 (pix2);

		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }
}

FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)

/* Bilinear-scaled OVER scanline with a solid mask, a8r8g8b8 -> a8r8g8b8.
 *
 * Only the alpha byte of *mask is used (FLAG_HAVE_SOLID_MASK); it is
 * expanded once into xmm_mask.  A zero source or zero mask alpha makes the
 * whole scanline a no-op.
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
						const uint32_t * mask,
						const uint32_t * src_top,
						const uint32_t * src_bottom,
						int32_t          w,
						int              wt,
						int              wb,
						pixman_fixed_t   vx_,
						pixman_fixed_t   unit_x_,
						pixman_fixed_t   max_vx,
						pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1;
    __m128i xmm_mask;

    if (zero_src || (*mask >> 24) == 0)
	return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    /* leading pixels, until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (pix1);
	    __m128i alpha = expand_alpha_1x128 (ms);
	    __m128i dest = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32
		(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}

	dst++;
	w--;
    }

    while (w >= 4)
    {
	__m128i xmm_src;
	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	if (!is_zero (xmm_src))
	{
	    __m128i xmm_src_lo, xmm_src_hi;
	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	    __m128i xmm_alpha_lo, xmm_alpha_hi;

	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned
		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	dst += 4;
	w -= 4;
    }

    /* trailing pixels */
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (pix1);
	    __m128i alpha = expand_alpha_1x128 (ms);
	    __m128i dest = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32
		(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}

	dst++;
	w--;
    }
}

FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_HAVE_SOLID_MASK)

/* Dispatch table mapping (op, src format, mask format, dst format) to the
 * SSE2 fast-path implementations above; terminated by PIXMAN_OP_NONE.
 */
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

    { PIXMAN_OP_NONE },
};

/* Iterator fetcher: read one x8r8g8b8 scanline into iter->buffer, forcing
 * the alpha byte of every pixel to 0xff.  Scalar until dst is 16-byte
 * aligned, then 4 pixels per iteration (unaligned loads, aligned stores).
 * Returns the filled buffer; the mask argument is unused.
 */
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    /* advance to the next scanline for the following call */
    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    while (w >= 4)
    {
	save_128_aligned (
	    (__m128i *)dst, _mm_or_si128 (
		load_128_unaligned ((__m128i *)src), ff000000));

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    return iter->buffer;
}

/* Iterator fetcher: read one r5g6b5 scanline, expanding each pixel to
 * a8r8g8b8 (alpha forced to 0xff).  8 pixels per vector iteration.
 */
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    /* scalar lead-in until dst is 16-byte aligned */
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    while (w >= 8)
    {
	__m128i lo, hi, s;

	s = _mm_loadu_si128 ((__m128i *)src);

	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    return iter->buffer;
}

/* Iterator fetcher: read one a8 scanline, expanding each byte to an
 * alpha-only a8r8g8b8 pixel (alpha in the top byte, RGB zero).  The
 * vector path widens 16 bytes at a time with interleaved unpacks against
 * zero, which also shifts each alpha byte into the high byte of its word.
 */
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = (uint32_t)(*(src++)) << 24;
	w--;
    }

    while (w >= 16)
    {
	xmm0 = _mm_loadu_si128((__m128i *)src);

	xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
	xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
	xmm6 =
	       _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);

	_mm_store_si128(((__m128i *)(dst + 0)), xmm3);
	_mm_store_si128(((__m128i *)(dst + 4)), xmm4);
	_mm_store_si128(((__m128i *)(dst + 8)), xmm5);
	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);

	dst += 16;
	src += 16;
	w -= 16;
    }

    /* scalar tail */
    while (w)
    {
	*dst++ = (uint32_t)(*(src++)) << 24;
	w--;
    }

    return iter->buffer;
}

/* Image flags the iterators below can handle: untransformed bits images
 * whose samples cover the clip for nearest sampling.
 */
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

/* Narrow (8-bit-per-channel) source iterators; PIXMAN_null terminates. */
static const pixman_iter_info_t sse2_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
    },
    { PIXMAN_null },
};

/* Create the SSE2 implementation: initialize the file-scope XMM constant
 * masks used by the kernels above and install the combiner, blt, fill and
 * iterator entry points on top of the fallback implementation.
 *
 * On 32-bit GCC the attribute realigns the stack so the 16-byte-aligned
 * XMM spills are safe even when called from code with a 4-byte-aligned
 * stack.
 */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->iter_info = sse2_iters;

    return imp;
}