pixman-mmx.c
/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <pixman-config.h>
#endif

#if defined USE_X86_MMX || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#ifdef USE_X86_MMX
# if (defined(__SSE2__) || defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
         : "=r" (ret)
         : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
         : "+y" (__A)
         : "y" (__B)
    );
    return __A;
}

# define _mm_shuffle_pi16(A, N)                                 \
    ({                                                          \
        __m64 ret;                                              \
                                                                \
        asm ("pshufw %2, %1, %0\n\t"                            \
             : "=y" (ret)                                       \
             : "y" (A), "K" ((const int8_t)N)                   \
        );                                                      \
                                                                \
        ret;                                                    \
    })
# endif
#endif

#ifndef _MM_SHUFFLE
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
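/* _MM_SHUFFLE packs four 2-bit word selectors into the immediate byte used
 * by pshufw.  A small worked example (a property of the encoding, not of
 * this file): selecting word 3 into all four positions, as expand_alpha()
 * below does, yields
 *
 *     _MM_SHUFFLE (3, 3, 3, 3) = (3 << 6) | (3 << 4) | (3 << 2) | 3 = 0xff
 *
 * so every 16-bit lane of the result is a copy of lane 3, the alpha lane
 * of an unpacked ARGB pixel.
 */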
/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *   ie. use
 *
 *      _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *      _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# ifdef __clang__
#  define USE_CVT_INTRINSICS
# else
#  define M64_MEMBER m64_u64
# endif
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
# define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
# define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 * and maps its result to the same range.
 *
 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 * Notation, Notation, Notation", the first of which is
 *
 *   prod (a, b) = (a * b + 128) / 255.
 *
 * By approximating the division by 255 as 257/65536 it can be replaced by a
 * multiply and a right shift. This is the implementation that we use in
 * pix_multiply(), but we use _mm_mulhi_pu16() by 257 (part of SSE1 or
 * Extended 3DNow!, and unavailable at the time of the book's publication)
 * to perform the multiplication and right shift in a single operation.
 *
 *   prod (a, b) = ((a * b + 128) * 257) >> 16.
 *
 * A third way (how pix_multiply() was implemented prior to 14208344) also
 * exists that performs the multiplication by 257 with adds and shifts.
 *
 * Where temp = a * b + 128
 *
 *   prod (a, b) = (temp + (temp >> 8)) >> 8.
 */
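/* A scalar reference for the identity above (an illustrative sketch; this
 * helper is not part of the original file and is not called by the MMX
 * paths).  All three formulations agree for every a, b in [0, 255]:
 */
static force_inline uint8_t
mul_un8_reference (uint8_t a, uint8_t b)
{
    uint32_t temp = (uint32_t)a * b + 128;

    /* (temp * 257) >> 16 is exactly temp / 255, rounded, over this range;
     * it also equals (temp + (temp >> 8)) >> 8. */
    return (uint8_t)((temp * 257) >> 16);
}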
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)                                  \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif

/* Elemental unaligned loads */
static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed, but that's no excuse */
    __m64 r;
    memcpy(&r, p, sizeof(__m64));
    return r;
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    uint32_t r;
    memcpy(&r, p, sizeof(uint32_t));
    return r;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
         : "=f" (ret)
         : "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
         : "=m" (*dest)
         : "f" (v)
         : "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
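/* A scalar sketch of what the multiplier trick above computes per channel
 * (illustrative only; this helper is not part of the original file).
 * Replicating the high bits of each channel into the vacated low bits is
 * what maps 0x1f -> 0xff and 0x3f -> 0xff exactly:
 */
static force_inline uint32_t
expand565_scalar_reference (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 bits -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 bits -> 8 bits */
    b = (b << 3) | (b >> 2);    /* 5 bits -> 8 bits */

    return (r << 16) | (g << 8) | b;
}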
/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBRRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());    /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());    /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());    /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);               /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                   /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);            /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);            /* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0 + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift (t0, -5);
    t1 = shift (t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
}
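/* The corresponding scalar truncation to r5g6b5, for reference (a sketch;
 * this helper is not part of the original file, whose loops use the
 * convert_8888_to_0565() helper for single pixels):
 */
static force_inline uint16_t
pack_565_scalar_reference (uint32_t s)
{
    return (uint16_t)(((s >> 8) & 0xf800) |     /* top 5 bits of red   */
                      ((s >> 5) & 0x07e0) |     /* top 6 bits of green */
                      ((s >> 3) & 0x001f));     /* top 5 bits of blue  */
}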
#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3)                                      \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)                                         \
    ( x = pix_multiply (x, a),                                          \
      y = pix_multiply (y, b),                                          \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}
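/* The combiners below all reduce to a handful of per-channel formulas.  As
 * a scalar reference for the central one (a sketch, not part of the
 * original file): with premultiplied 8-bit channels, the OVER operator is
 *
 *     over (s, as, d) = s + mul_un8 (d, 255 - as)
 *
 * which is what over() computes four channels at a time with a saturating
 * add, and what the is_opaque()/is_zero() tests below short-circuit.
 */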
static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
        return vsrc;
    }
    else if (!is_zero (vsrc))
    {
        return over (vsrc, expand_alpha (vsrc),
                     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 vsrc = combine (src, mask);

        if (is_opaque (vsrc))
        {
            store8888 (dest, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 da, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sa;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        s = pix_add (s, d);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}
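/* SATURATE adds the source to the destination, but first scales the source
 * down when it would overflow the destination's remaining alpha headroom.
 * In scalar terms (a sketch of the loop below, not part of the original
 * file): with da' = 255 - dest_alpha and sa = src_alpha, when sa > da' the
 * source is multiplied by DIV_UN8 (da', sa), i.e. scaled by da'/sa, before
 * the saturating add.
 */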
static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s, sa, da;
        uint32_t d = *dest;
        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);

        store8888 (&s, ms);
        da = ~d >> 24;
        sa = s >> 24;

        if (sa > da)
        {
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        store8888 (dest, md);

        ++src;
        ++dest;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);
        store8888 (dest, s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}
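/* In the component-alpha (_ca) combiners here, the mask carries a separate
 * alpha value per color channel, so the mask is multiplied into the source
 * channel-wise, and where an operator needs the source alpha, the expanded
 * source alpha is multiplied into the mask rather than used directly.
 */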
static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

/* ------------- MMX code paths called from fbpict.c -------------------- */
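/* The composite fast paths below share a common loop structure: a head loop
 * handles single pixels until the destination pointer is 8-byte aligned, an
 * unrolled body then processes two 32-bit (or four 16-bit) pixels per
 * 64-bit MMX store, and a tail loop finishes whatever is left.
 */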
static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            v0 = over (vsrc, vsrca, v0);
            v1 = over (vsrc, vsrca, v1);
            v2 = over (vsrc, vsrca, v2);
            v3 = over (vsrc, vsrca, v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            dst += 4;
            w -= 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (uintptr_t)q & 7)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 2)
        {
            uint32_t m0, m1;
            m0 = *p;
            m1 = *(p + 1);

            if (m0 | m1)
            {
                __m64 dest0, dest1;
                __m64 vdest = *(__m64 *)q;

                dest0 = in_over (vsrc, vsrca, load8888 (&m0),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, load8888 (&m1),
                                 expand8888 (vdest, 1));

                *(__m64 *)q = pack8888 (dest0, dest1);
            }

            p += 2;
            q += 2;
            twidth -= 2;
        }

        if (twidth)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        dst_line += dst_stride;
        mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vs = ldq_u ((__m64 *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = ldq_u ((__m64 *)(src + 0));
            __m64 vs1 = ldq_u ((__m64 *)(src + 2));
            __m64 vs2 = ldq_u ((__m64 *)(src + 4));
            __m64 vs3 = ldq_u ((__m64 *)(src + 6));
            __m64 vs4 = ldq_u ((__m64 *)(src + 8));
            __m64 vs5 = ldq_u ((__m64 *)(src + 10));
            __m64 vs6 = ldq_u ((__m64 *)(src + 12));
            __m64 vs7 = ldq_u ((__m64 *)(src + 14));

            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = pack8888 (
                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = pack8888 (
                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = pack8888 (
                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = pack8888 (
                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = pack8888 (
                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = pack8888 (
                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = pack8888 (
                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    uint8_t a;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w--)
        {
            s = *src++;
            a = s >> 24;

            if (a == 0xff)
            {
                *dst = s;
            }
            else if (s)
            {
                __m64 ms, sa;
                ms = load8888 (&s);
                sa = expand_alpha (ms);
                store8888 (dst, over (ms, sa, load8888 (dst)));
            }

            dst++;
        }
    }
    _mm_empty ();
}

static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (
                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;
            __m64 vsrc0, vsrc1, vsrc2, vsrc3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            vsrc0 = load8888 ((src + 0));
            vsrc1 = load8888 ((src + 1));
            vsrc2 = load8888 ((src + 2));
            vsrc3 = load8888 ((src + 3));

            v0 = over (vsrc0, expand_alpha (vsrc0), v0);
            v1 = over (vsrc1, expand_alpha (vsrc1), v1);
            v2 = over (vsrc2, expand_alpha (vsrc2), v2);
            v3 = over (vsrc3, expand_alpha (vsrc3), v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (to_m64 (m)),
                                       load8888 (dst));

                store8888 (dst, vdest);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;

            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (dst);

                vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
                store8888 (dst, vdest);
            }
        }
    }

    _mm_empty ();
}
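/* mmx_fill() below widens the filler to 32 bits up front, so that one
 * replicated 64-bit store pattern serves all depths.  For example (the
 * values follow directly from the multiplications in the code):
 *
 *     bpp 8:   filler = 0x000000ab -> 0xab * 0x01010101 = 0xabababab
 *     bpp 16:  filler = 0x0000abcd -> 0xabcd * 0x00010001 = 0xabcdabcd
 *
 * and the 64-bit fill value is then (filler << 32) | filler.
 */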
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
        return FALSE;

    if (bpp == 8)
    {
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    __asm__ (
        "movq %7, %0\n"
        "movq %7, %1\n"
        "movq %7, %2\n"
        "movq %7, %3\n"
        "movq %7, %4\n"
        "movq %7, %5\n"
        "movq %7, %6\n"
        : "=&y" (v1), "=&y" (v2), "=&y" (v3),
          "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
        : "y" (vfill));
#endif

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;

        byte_line += stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = (filler & 0xff);
            w--;
            d++;
        }

        if (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 7))
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        while (w >= 64)
        {
#if defined __GNUC__ && defined USE_X86_MMX
            __asm__ (
                "movq %1, (%0)\n"
                "movq %2, 8(%0)\n"
                "movq %3, 16(%0)\n"
                "movq %4, 24(%0)\n"
                "movq %5, 32(%0)\n"
                "movq %6, 40(%0)\n"
                "movq %7, 48(%0)\n"
                "movq %8, 56(%0)\n"
                :
                : "r" (d),
                  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
                  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
                : "memory");
#else
            *(__m64*) (d + 0) = vfill;
            *(__m64*) (d + 8) = vfill;
            *(__m64*) (d + 16) = vfill;
            *(__m64*) (d + 24) = vfill;
            *(__m64*) (d + 32) = vfill;
            *(__m64*) (d + 40) = vfill;
            *(__m64*) (d + 48) = vfill;
            *(__m64*) (d + 56) = vfill;
#endif
            w -= 64;
            d += 64;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }
        if (w >= 1)
        {
            *(uint8_t *)d = (filler & 0xff);
            w--;
            d++;
        }

    }

    _mm_empty ();
    return TRUE;
}

static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }

        while (w >= 4)
        {
            __m64 vdest;
            __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
            __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

            vdest = pack_4xpacked565 (vsrc0, vsrc1);

            *(__m64 *)dst = vdest;

            w -= 4;
            src += 4;
            dst += 4;
        }

        while (w)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
                  PIXMAN_FORMAT_BPP (dest_image->bits.format),
                  dest_x, dest_y, width, height, 0);
        return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;
            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 dest0, dest1;

                dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }
            else
            {
                *(uint64_t *)dst = 0;
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (dst);

                vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint64_t m0, m1, m2, m3;
            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                *(__m64 *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;
                __m64 vm0, vm1, vm2, vm3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                vm0 = to_m64 (m0);
                v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

                vm1 = to_m64 (m1);
                v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

                vm2 = to_m64 (m2);
                v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

                vm3 = to_m64 (m3);
                v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
                                       expand565 (vd, 0));
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}
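/* The "pixbuf" paths below read a non-premultiplied source whose R and B
 * channels are stored in the opposite order, so each pixel goes through
 * over_rev_non_pre(): invert_colors() swaps the R and B lanes, the color
 * lanes are multiplied by the source alpha (with the alpha lane itself
 * forced to full), and the result is composited with a normal OVER.
 */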
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            s0 = *src;
            s1 = *(src + 1);
            s2 = *(src + 2);
            s3 = *(src + 3);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);
            a2 = (s2 >> 24);
            a3 = (s3 >> 24);

            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                __m64 v0 = invert_colors (load8888 (&s0));
                __m64 v1 = invert_colors (load8888 (&s1));
                __m64 v2 = invert_colors (load8888 (&s2));
                __m64 v3 = invert_colors (load8888 (&s3));

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }
            else if (s0 | s1 | s2 | s3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;

                __m64 vsrc0 = load8888 (&s0);
                __m64 vsrc1 = load8888 (&s1);
                __m64 vsrc2 = load8888 (&s2);
                __m64 vsrc3 = load8888 (&s3);

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                v0 = over_rev_non_pre (vsrc0, v0);
                v1 = over_rev_non_pre (vsrc1, v1);
                v2 = over_rev_non_pre (vsrc2, v2);
                v3 = over_rev_non_pre (vsrc3, v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            uint32_t s0, s1;
            unsigned char a0, a1;
            __m64 d0, d1;

            s0 = *src;
            s1 = *(src + 1);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);

            if ((a0 & a1) == 0xFF)
            {
                d0 = invert_colors (load8888 (&s0));
                d1 = invert_colors (load8888 (&s1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }
            else if (s0 | s1)
            {
                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint16_t *q = (uint16_t *)dst_line;

        while (twidth && ((uintptr_t)q & 7))
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 4)
        {
            uint32_t m0, m1, m2, m3;

            m0 = *p;
            m1 = *(p + 1);
            m2 = *(p + 2);
            m3 = *(p + 3);

            if ((m0 | m1 | m2 | m3))
            {
                __m64 vdest = *(__m64 *)q;
                __m64 v0, v1, v2, v3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
                v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
                v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
                v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

                *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
            }
            twidth -= 4;
            p += 4;
            q += 4;
        }

        while (twidth)
        {
            uint32_t m;

            m = *(uint32_t *)p;
            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        mask_line += mask_stride;
        dst_line += dst_stride;
    }

    _mm_empty ();
}
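/* IN on a8 surfaces: dest = src_alpha * mask * dest.  The vector loop
 * below handles four a8 pixels at a time by treating them as the four
 * channels of a single expanded 8888 pixel, which lets it reuse the
 * per-channel in () helper; the scalar head and tail use MUL_UN8.
 */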
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
            w--;
        }

        while (w >= 4)
        {
            __m64 vmask;
            __m64 vdest;

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w--)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 3)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
            w--;
        }

        while (w >= 4)
        {
            uint32_t *s = (uint32_t *)src;
            uint32_t *d = (uint32_t *)dst;

            store8888 (d, in (load8888u (s), load8888 (d)));

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w--)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
        }
    }

    _mm_empty ();
}
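/* The ADD paths rely on the saturating unsigned adds (_mm_adds_pu8) in
 * their vector loops; the scalar head and tail loops get the same
 * clamping from ADD_UN8 or from the open-coded form
 * t | (0 - (t >> 8)), which ORs in 0xff whenever the 16-bit sum
 * overflows a byte.
 */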
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)
        return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 3)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
            w--;
        }

        while (w >= 4)
        {
            __m64 vmask;
            __m64 vdest;

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w--)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8)); /* saturate: becomes 0xff on overflow */
            *dst = s;

            dst++;
            src++;
            w--;
        }

        while (w >= 8)
        {
            *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
            dst += 8;
            src += 8;
            w -= 8;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t d;
    uint16_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            if (s)
            {
                d = *dst;
                s = convert_0565_to_8888 (s);
                if (d)
                {
                    d = convert_0565_to_8888 (d);
                    UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = convert_8888_to_0565 (s);
            }
            dst++;
            w--;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 vsrc = ldq_u ((__m64 *)src);
            __m64 vd0, vd1;
            __m64 vs0, vs1;

            expand_4xpacked565 (vdest, &vd0, &vd1, 0);
            expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

            vd0 = _mm_adds_pu8 (vd0, vs0);
            vd1 = _mm_adds_pu8 (vd1, vs1);

            *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w--)
        {
            s = *src++;
            if (s)
            {
                d = *dst;
                s = convert_0565_to_8888 (s);
                if (d)
                {
                    d = convert_0565_to_8888 (d);
                    UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = convert_8888_to_0565 (s);
            }
            dst++;
        }
    }

    _mm_empty ();
}
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));
            dst++;
            src++;
            w--;
        }

        while (w >= 2)
        {
            *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
            dst += 2;
            src += 2;
            w -= 2;
        }

        if (w)
        {
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));
        }
    }

    _mm_empty ();
}
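/* A plain copy for 16 and 32 bpp images.  Each scanline is brought to
 * 8-byte destination alignment with 1-, 2- and 4-byte steps, then
 * copied in 64-byte blocks; on GCC-style compilers the block copy is
 * written in inline assembly so all eight MMX registers carry data in
 * flight, while other compilers fall back to eight ldq_u/store pairs.
 * Source accesses go through ldq_u ()/ldl_u () because only the
 * destination is guaranteed to be aligned.
 */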
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *src_bytes;
    uint8_t *dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = *(uint8_t *)s;
            w -= 1;
            s += 1;
            d += 1;
        }

        if (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 7))
        {
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            w -= 4;
            s += 4;
            d += 4;
        }

        while (w >= 64)
        {
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
            __asm__ (
                "movq     (%1), %%mm0\n"
                "movq    8(%1), %%mm1\n"
                "movq   16(%1), %%mm2\n"
                "movq   24(%1), %%mm3\n"
                "movq   32(%1), %%mm4\n"
                "movq   40(%1), %%mm5\n"
                "movq   48(%1), %%mm6\n"
                "movq   56(%1), %%mm7\n"

                "movq   %%mm0,   (%0)\n"
                "movq   %%mm1,  8(%0)\n"
                "movq   %%mm2, 16(%0)\n"
                "movq   %%mm3, 24(%0)\n"
                "movq   %%mm4, 32(%0)\n"
                "movq   %%mm5, 40(%0)\n"
                "movq   %%mm6, 48(%0)\n"
                "movq   %%mm7, 56(%0)\n"
                :
                : "r" (d), "r" (s)
                : "memory",
                  "%mm0", "%mm1", "%mm2", "%mm3",
                  "%mm4", "%mm5", "%mm6", "%mm7");
#else
            __m64 v0 = ldq_u ((__m64 *)(s + 0));
            __m64 v1 = ldq_u ((__m64 *)(s + 8));
            __m64 v2 = ldq_u ((__m64 *)(s + 16));
            __m64 v3 = ldq_u ((__m64 *)(s + 24));
            __m64 v4 = ldq_u ((__m64 *)(s + 32));
            __m64 v5 = ldq_u ((__m64 *)(s + 40));
            __m64 v6 = ldq_u ((__m64 *)(s + 48));
            __m64 v7 = ldq_u ((__m64 *)(s + 56));
            *(__m64 *)(d + 0)  = v0;
            *(__m64 *)(d + 8)  = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;
#endif

            w -= 64;
            s += 64;
            d += 64;
        }
        while (w >= 4)
        {
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            w -= 4;
            s += 4;
            d += 4;
        }
        /* byte_width is even for the supported 16 and 32 bpp formats,
         * so no single-byte tail is needed. */
        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}

static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);

    mmx_blt (imp, src_image->bits.bits,
             dest_image->bits.bits,
             src_image->bits.rowstride,
             dest_image->bits.rowstride,
             PIXMAN_FORMAT_BPP (src_image->bits.format),
             PIXMAN_FORMAT_BPP (dest_image->bits.format),
             src_x, src_y, dest_x, dest_y, width, height);
}

static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w--)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint32_t ssrc = *src | 0xff000000;
                __m64 s = load8888 (&ssrc);

                if (m == 0xff)
                {
                    store8888 (dst, s);
                }
                else
                {
                    __m64 sa = expand_alpha (s);
                    __m64 vm = expand_alpha_rev (to_m64 (m));
                    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

                    store8888 (dst, vdest);
                }
            }

            mask++;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
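/* OVER_REVERSE with a solid source: the destination is composited over
 * the source, so the roles are swapped and every pixel computes
 * over (dest, dest_alpha, src) with the constant source underneath.
 */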
static void
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vdest = load8888 (dst);

            store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 dest0 = expand8888 (vdest, 0);
            __m64 dest1 = expand8888 (vdest, 1);

            dest0 = over (dest0, expand_alpha (dest0), vsrc);
            dest1 = over (dest1, expand_alpha (dest1), vsrc);

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            __m64 vdest = load8888 (dst);

            store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
        }
    }

    _mm_empty ();
}

static force_inline void
scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t       *pd,
                                            const uint32_t *ps,
                                            int32_t         w,
                                            pixman_fixed_t  vx,
                                            pixman_fixed_t  unit_x,
                                            pixman_fixed_t  src_width_fixed,
                                            pixman_bool_t   fully_transparent_src)
{
    if (fully_transparent_src)
        return;

    while (w)
    {
        __m64 d = load (pd);
        __m64 s = load (ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
        pd++;

        w--;
    }

    _mm_empty ();
}

FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
                       scaled_nearest_scanline_mmx_8888_8888_OVER,
                       uint32_t, uint32_t, NORMAL)

static force_inline void
scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t *mask,
                                              uint32_t       *dst,
                                              const uint32_t *src,
                                              int32_t         w,
                                              pixman_fixed_t  vx,
                                              pixman_fixed_t  unit_x,
                                              pixman_fixed_t  src_width_fixed,
                                              pixman_bool_t   zero_src)
{
    __m64 mm_mask;

    if (zero_src || (*mask >> 24) == 0)
    {
        /* A workaround for https://gcc.gnu.org/PR47759 */
        _mm_empty ();
        return;
    }

    mm_mask = expand_alpha (load8888 (mask));

    while (w)
    {
        uint32_t s = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        if (s)
        {
            __m64 ms = load8888 (&s);
            __m64 alpha = expand_alpha (ms);
            __m64 dest = load8888 (dst);

            store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
        }

        dst++;
        w--;
    }

    _mm_empty ();
}

FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
                              scaled_nearest_scanline_mmx_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
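/* Bilinear scaling.  Each output pixel is a weighted average of a 2x2
 * block: the two rows are blended first with the 16-bit weights wt and
 * wb (which sum to BSHIFT), then pmaddwd (_mm_madd_pi16) combines the
 * left and right columns.  The horizontal weights come from the
 * addc7/xorc7 trick below: with x = vx >> (16 -
 * BILINEAR_INTERPOLATION_BITS), (BMSK ^ x) + 1 == BSHIFT - x in the
 * lanes holding the left pixel while the other lanes pass x through,
 * giving left * (BSHIFT - x) + right * x per channel before the final
 * shift by 2 * BILINEAR_INTERPOLATION_BITS.
 */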
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK   (BSHIFT - 1)

#define BILINEAR_DECLARE_VARIABLES                                      \
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);                  \
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);                  \
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);                   \
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);             \
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);  \
    const __m64 mm_zero = _mm_setzero_si64 ();                          \
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                             \
do {                                                                    \
    /* fetch 2x2 pixel block into 2 mmx registers */                    \
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);     \
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);  \
    /* vertical interpolation */                                        \
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);                               \
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);                               \
    /* calculate horizontal weights */                                  \
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,       \
                      _mm_srli_pi16 (mm_x,                              \
                                     16 - BILINEAR_INTERPOLATION_BITS))); \
    /* horizontal interpolation */                                      \
    __m64 p = _mm_unpacklo_pi16 (lo, hi);                               \
    __m64 q = _mm_unpackhi_pi16 (lo, hi);                               \
    vx += unit_x;                                                       \
    lo = _mm_madd_pi16 (p, mm_wh);                                      \
    hi = _mm_madd_pi16 (q, mm_wh);                                      \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                  \
    /* shift and pack the result */                                     \
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);           \
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);           \
    lo = _mm_packs_pi32 (lo, hi);                                       \
    lo = _mm_packs_pu16 (lo, lo);                                       \
    pix = lo;                                                           \
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()                                       \
do {                                                                    \
    vx += unit_x;                                                       \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                  \
} while (0)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
                                            const uint32_t * mask,
                                            const uint32_t * src_top,
                                            const uint32_t * src_bottom,
                                            int32_t          w,
                                            int              wt,
                                            int              wb,
                                            pixman_fixed_t   vx,
                                            pixman_fixed_t   unit_x,
                                            pixman_fixed_t   max_vx,
                                            pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
        store (dst, pix);
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)
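/* The bilinear OVER variants add per-pixel short cuts on top of the
 * SRC loop: a fully transparent interpolated pixel (is_zero) leaves
 * the destination untouched, and in the masked variant an 0xff mask
 * byte with an opaque result (is_opaque) is stored directly instead of
 * going through in_over ().
 */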
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx,
                                             pixman_fixed_t   unit_x,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

        if (!is_zero (pix1))
        {
            pix2 = load (dst);
            store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
        }

        w--;
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
                                               const uint8_t *  mask,
                                               const uint32_t * src_top,
                                               const uint32_t * src_bottom,
                                               int32_t          w,
                                               int              wt,
                                               int              wb,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
        m = (uint32_t) *mask++;

        if (m)
        {
            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

            if (m == 0xff && is_opaque (pix1))
            {
                store (dst, pix1);
            }
            else
            {
                __m64 ms, md, ma, msa;

                pix2 = load (dst);
                ma = expand_alpha_rev (to_m64 (m));
                ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
                md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

                msa = expand_alpha (ms);

                store8888 (dst, (in_over (ms, msa, ma, md)));
            }
        }
        else
        {
            BILINEAR_SKIP_ONE_PIXEL ();
        }

        w--;
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               NORMAL, FLAG_HAVE_NON_SOLID_MASK)
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 7)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 8)
    {
        __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
        __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
        __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
        __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

        *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
        *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
        *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
        *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    while (w >= 4)
    {
        __m64 vsrc = ldq_u ((__m64 *)src);
        __m64 mm0, mm1;

        expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

        *(__m64 *)(dst + 0) = mm0;
        *(__m64 *)(dst + 2) = mm1;

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = (uint32_t)*(src++) << 24;
        w--;
    }

    while (w >= 8)
    {
        __m64 mm0 = ldq_u ((__m64 *)src);

        __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64 (), mm0);
        __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64 (), mm0);
        __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
        __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
        __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
        __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);

        *(__m64 *)(dst + 0) = mm3;
        *(__m64 *)(dst + 2) = mm4;
        *(__m64 *)(dst + 4) = mm5;
        *(__m64 *)(dst + 6) = mm6;

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        *dst++ = (uint32_t)*(src++) << 24;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

#define IMAGE_FLAGS                                                     \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};
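/* The fast path table: pixman scans this in order and uses the first
 * entry whose operator, formats and flags match, so more specialized
 * entries are listed before more general ones.
 */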
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888),

    { PIXMAN_OP_NONE },
};

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}

#endif /* USE_X86_MMX || USE_LOONGSON_MMI */