tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-mmx.c (101356B)


      1 /*
      2 * Copyright © 2004, 2005 Red Hat, Inc.
      3 * Copyright © 2004 Nicholas Miell
      4 * Copyright © 2005 Trolltech AS
      5 *
      6 * Permission to use, copy, modify, distribute, and sell this software and its
      7 * documentation for any purpose is hereby granted without fee, provided that
      8 * the above copyright notice appear in all copies and that both that
      9 * copyright notice and this permission notice appear in supporting
     10 * documentation, and that the name of Red Hat not be used in advertising or
     11 * publicity pertaining to distribution of the software without specific,
     12 * written prior permission.  Red Hat makes no representations about the
     13 * suitability of this software for any purpose.  It is provided "as is"
     14 * without express or implied warranty.
     15 *
     16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     23 * SOFTWARE.
     24 *
     25 * Author:  Søren Sandmann (sandmann@redhat.com)
     26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
     27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
     28 *
     29 * Based on work by Owen Taylor
     30 */
     31 
     32 #ifdef HAVE_CONFIG_H
     33 #include <pixman-config.h>
     34 #endif
     35 
     36 #if defined USE_X86_MMX || defined USE_LOONGSON_MMI
     37 
     38 #ifdef USE_LOONGSON_MMI
     39 #include <loongson-mmintrin.h>
     40 #else
     41 #include <mmintrin.h>
     42 #endif
     43 #include "pixman-private.h"
     44 #include "pixman-combine32.h"
     45 #include "pixman-inlines.h"
     46 
     47 #ifdef VERBOSE
     48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
     49 #else
     50 #define CHECKPOINT()
     51 #endif
     52 
     53 #ifdef USE_X86_MMX
     54 # if (defined(__SSE2__) || defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
     55 #  include <xmmintrin.h>
     56 # else
     57 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
     58 * instructions to be generated that we don't want. Just duplicate the
     59 * functions we want to use.  */
     60 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     61 _mm_movemask_pi8 (__m64 __A)
     62 {
     63    int ret;
     64 
     65    asm ("pmovmskb %1, %0\n\t"
     66 : "=r" (ret)
     67 : "y" (__A)
     68    );
     69 
     70    return ret;
     71 }
     72 
     73 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     74 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     75 {
     76    asm ("pmulhuw %1, %0\n\t"
     77 : "+y" (__A)
     78 : "y" (__B)
     79    );
     80    return __A;
     81 }
     82 
     83 # define _mm_shuffle_pi16(A, N)						\
     84    ({									\
     85 __m64 ret;							\
     86 								\
     87 asm ("pshufw %2, %1, %0\n\t"					\
     88      : "=y" (ret)						\
     89      : "y" (A), "K" ((const int8_t)N)				\
     90 );								\
     91 								\
     92 ret;								\
     93    })
     94 # endif
     95 #endif
     96 
     97 #ifndef _MM_SHUFFLE
     98 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
     99 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
    100 #endif
    101 
/* Notes about writing mmx code
 *
 * Give memory operands as the second operand. If you give one as the
 * first, gcc will first load it into a register, then use that
 * register.
 *
 *   i.e. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies, i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
    120 
    121 /* --------------- MMX primitives ------------------------------------- */
    122 
    123 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
    124 * the name of the member used to access the data.
    125 * If __m64 requires using mm_cvt* intrinsics functions to convert between
    126 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
    127 * If __m64 and uint64_t values can just be cast to each other directly,
    128 * then define USE_M64_CASTS.
    129 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
    130 */
    131 #ifdef _MSC_VER
    132 # ifdef __clang__
    133 #  define USE_CVT_INTRINSICS
    134 # else
    135 #  define M64_MEMBER m64_u64
    136 # endif
    137 #elif defined(__ICC)
    138 # define USE_CVT_INTRINSICS
    139 #elif defined(USE_LOONGSON_MMI)
    140 # define USE_M64_DOUBLE
    141 #elif defined(__GNUC__)
    142 # define USE_M64_CASTS
    143 #elif defined(__SUNPRO_C)
    144 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
    145 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
    146 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
    147 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
    148 */
    149 #  define USE_CVT_INTRINSICS
    150 # else
    151 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
    152 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
    153 */
    154 #  define M64_MEMBER l_
    155 # endif
    156 #endif
    157 
    158 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
    159 typedef uint64_t mmxdatafield;
    160 #else
    161 typedef __m64 mmxdatafield;
    162 #endif
    163 
    164 typedef struct
    165 {
    166    mmxdatafield mmx_4x00ff;
    167    mmxdatafield mmx_4x0080;
    168    mmxdatafield mmx_565_rgb;
    169    mmxdatafield mmx_565_unpack_multiplier;
    170    mmxdatafield mmx_565_pack_multiplier;
    171    mmxdatafield mmx_565_r;
    172    mmxdatafield mmx_565_g;
    173    mmxdatafield mmx_565_b;
    174    mmxdatafield mmx_packed_565_rb;
    175    mmxdatafield mmx_packed_565_g;
    176    mmxdatafield mmx_expand_565_g;
    177    mmxdatafield mmx_expand_565_b;
    178    mmxdatafield mmx_expand_565_r;
    179 #ifndef USE_LOONGSON_MMI
    180    mmxdatafield mmx_mask_0;
    181    mmxdatafield mmx_mask_1;
    182    mmxdatafield mmx_mask_2;
    183    mmxdatafield mmx_mask_3;
    184 #endif
    185    mmxdatafield mmx_full_alpha;
    186    mmxdatafield mmx_4x0101;
    187    mmxdatafield mmx_ff000000;
    188 } mmx_data_t;
    189 
    190 #if defined(_MSC_VER)
    191 # define MMXDATA_INIT(field, val) { val ## UI64 }
    192 #elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
    193 # define MMXDATA_INIT(field, val) field =   { val ## ULL }
    194 #else                           /* mmxdatafield is an integral type */
    195 # define MMXDATA_INIT(field, val) field =   val ## ULL
    196 #endif
    197 
    198 static const mmx_data_t c =
    199 {
    200    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    201    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    202    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    203    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    204    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    205    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    206    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    207    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    208    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    209    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    210    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    211    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    212    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
    213 #ifndef USE_LOONGSON_MMI
    214    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    215    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    216    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    217    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
    218 #endif
    219    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    220    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    221    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
    222 };
    223 
    224 #ifdef USE_CVT_INTRINSICS
    225 #    define MC(x) to_m64 (c.mmx_ ## x)
    226 #elif defined(USE_M64_CASTS)
    227 #    define MC(x) ((__m64)c.mmx_ ## x)
    228 #elif defined(USE_M64_DOUBLE)
    229 #    define MC(x) (*(__m64 *)&c.mmx_ ## x)
    230 #else
    231 #    define MC(x) c.mmx_ ## x
    232 #endif
    233 
    234 static force_inline __m64
    235 to_m64 (uint64_t x)
    236 {
    237 #ifdef USE_CVT_INTRINSICS
    238    return _mm_cvtsi64_m64 (x);
    239 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    240    __m64 res;
    241 
    242    res.M64_MEMBER = x;
    243    return res;
    244 #elif defined USE_M64_DOUBLE
    245    return *(__m64 *)&x;
    246 #else /* USE_M64_CASTS */
    247    return (__m64)x;
    248 #endif
    249 }
    250 
    251 static force_inline uint64_t
    252 to_uint64 (__m64 x)
    253 {
    254 #ifdef USE_CVT_INTRINSICS
    255    return _mm_cvtm64_si64 (x);
    256 #elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    257    uint64_t res = x.M64_MEMBER;
    258    return res;
    259 #elif defined USE_M64_DOUBLE
    260    return *(uint64_t *)&x;
    261 #else /* USE_M64_CASTS */
    262    return (uint64_t)x;
    263 #endif
    264 }
    265 
    266 static force_inline __m64
    267 shift (__m64 v,
    268       int   s)
    269 {
    270    if (s > 0)
    271 return _mm_slli_si64 (v, s);
    272    else if (s < 0)
    273 return _mm_srli_si64 (v, -s);
    274    else
    275 return v;
    276 }
    277 
    278 static force_inline __m64
    279 negate (__m64 mask)
    280 {
    281    return _mm_xor_si64 (mask, MC (4x00ff));
    282 }
    283 
    284 /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
    285 * and maps its result to the same range.
    286 *
    287 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
    288 * Notation, Notation, Notation", the first of which is
    289 *
    290 *   prod(a, b) = (a * b + 128) / 255.
    291 *
    292 * By approximating the division by 255 as 257/65536 it can be replaced by a
    293 * multiply and a right shift. This is the implementation that we use in
    294 * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
    295 * 3DNow!, and unavailable at the time of the book's publication) to perform
    296 * the multiplication and right shift in a single operation.
    297 *
    298 *   prod(a, b) = ((a * b + 128) * 257) >> 16.
    299 *
    300 * A third way (how pix_multiply() was implemented prior to 14208344) exists
    301 * also that performs the multiplication by 257 with adds and shifts.
    302 *
    303 * Where temp = a * b + 128
    304 *
    305 *   prod(a, b) = (temp + (temp >> 8)) >> 8.
    306 */
    307 static force_inline __m64
    308 pix_multiply (__m64 a, __m64 b)
    309 {
    310    __m64 res;
    311 
    312    res = _mm_mullo_pi16 (a, b);
    313    res = _mm_adds_pu16 (res, MC (4x0080));
    314    res = _mm_mulhi_pu16 (res, MC (4x0101));
    315 
    316    return res;
    317 }
    318 
    319 static force_inline __m64
    320 pix_add (__m64 a, __m64 b)
    321 {
    322    return _mm_adds_pu8 (a, b);
    323 }
    324 
    325 static force_inline __m64
    326 expand_alpha (__m64 pixel)
    327 {
    328    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
    329 }
    330 
    331 static force_inline __m64
    332 expand_alpha_rev (__m64 pixel)
    333 {
    334    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
    335 }
    336 
    337 static force_inline __m64
    338 invert_colors (__m64 pixel)
    339 {
    340    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
    341 }
    342 
    343 static force_inline __m64
    344 over (__m64 src,
    345      __m64 srca,
    346      __m64 dest)
    347 {
    348    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
    349 }
    350 
    351 static force_inline __m64
    352 over_rev_non_pre (__m64 src, __m64 dest)
    353 {
    354    __m64 srca = expand_alpha (src);
    355    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
    356 
    357    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
    358 }
    359 
    360 static force_inline __m64
    361 in (__m64 src, __m64 mask)
    362 {
    363    return pix_multiply (src, mask);
    364 }
    365 
    366 #ifndef _MSC_VER
    367 static force_inline __m64
    368 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
    369 {
    370    return over (in (src, mask), pix_multiply (srca, mask), dest);
    371 }
    372 
    373 #else
    374 
    375 #define in_over(src, srca, mask, dest)					\
    376    over (in (src, mask), pix_multiply (srca, mask), dest)
    377 
    378 #endif
    379 
    380 /* Elemental unaligned loads */
    381 
    382 static force_inline __m64 ldq_u(__m64 *p)
    383 {
    384 #ifdef USE_X86_MMX
    385    /* x86's alignment restrictions are very relaxed, but that's no excuse */
    386    __m64 r;
    387    memcpy(&r, p, sizeof(__m64));
    388    return r;
    389 #else
    390    struct __una_u64 { __m64 x __attribute__((packed)); };
    391    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    392    return (__m64) ptr->x;
    393 #endif
    394 }
    395 
    396 static force_inline uint32_t ldl_u(const uint32_t *p)
    397 {
    398 #ifdef USE_X86_MMX
    399    /* x86's alignment restrictions are very relaxed. */
    400    uint32_t r;
    401    memcpy(&r, p, sizeof(uint32_t));
    402    return r;
    403 #else
    404    struct __una_u32 { uint32_t x __attribute__((packed)); };
    405    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    406    return ptr->x;
    407 #endif
    408 }
    409 
    410 static force_inline __m64
    411 load (const uint32_t *v)
    412 {
    413 #ifdef USE_LOONGSON_MMI
    414    __m64 ret;
    415    asm ("lwc1 %0, %1\n\t"
    416 : "=f" (ret)
    417 : "m" (*v)
    418    );
    419    return ret;
    420 #else
    421    return _mm_cvtsi32_si64 (*v);
    422 #endif
    423 }
    424 
    425 static force_inline __m64
    426 load8888 (const uint32_t *v)
    427 {
    428 #ifdef USE_LOONGSON_MMI
    429    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
    430 #else
    431    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
    432 #endif
    433 }
    434 
    435 static force_inline __m64
    436 load8888u (const uint32_t *v)
    437 {
    438    uint32_t l = ldl_u (v);
    439    return load8888 (&l);
    440 }
    441 
    442 static force_inline __m64
    443 pack8888 (__m64 lo, __m64 hi)
    444 {
    445    return _mm_packs_pu16 (lo, hi);
    446 }
    447 
    448 static force_inline void
    449 store (uint32_t *dest, __m64 v)
    450 {
    451 #ifdef USE_LOONGSON_MMI
    452    asm ("swc1 %1, %0\n\t"
    453 : "=m" (*dest)
    454 : "f" (v)
    455 : "memory"
    456    );
    457 #else
    458    *dest = _mm_cvtsi64_si32 (v);
    459 #endif
    460 }
    461 
    462 static force_inline void
    463 store8888 (uint32_t *dest, __m64 v)
    464 {
    465    v = pack8888 (v, _mm_setzero_si64 ());
    466    store (dest, v);
    467 }
    468 
    469 static force_inline pixman_bool_t
    470 is_equal (__m64 a, __m64 b)
    471 {
    472 #ifdef USE_LOONGSON_MMI
    473    /* __m64 is double, we can compare directly. */
    474    return a == b;
    475 #else
    476    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
    477 #endif
    478 }
    479 
    480 static force_inline pixman_bool_t
    481 is_opaque (__m64 v)
    482 {
    483 #ifdef USE_LOONGSON_MMI
    484    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
    485 #else
    486    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    487    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
    488 #endif
    489 }
    490 
    491 static force_inline pixman_bool_t
    492 is_zero (__m64 v)
    493 {
    494    return is_equal (v, _mm_setzero_si64 ());
    495 }
    496 
    497 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
    498 *
    499 *    00RR00GG00BB
    500 *
    501 * --- Expanding 565 in the low word ---
    502 *
    503 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
    504 * m = m & (01f0003f001f);
    505 * m = m * (008404100840);
    506 * m = m >> 8;
    507 *
    508 * Note the trick here - the top word is shifted by another nibble to
    509 * avoid it bumping into the middle word
    510 */
    511 static force_inline __m64
    512 expand565 (__m64 pixel, int pos)
    513 {
    514    __m64 p = pixel;
    515    __m64 t1, t2;
    516 
    517    /* move pixel to low 16 bit and zero the rest */
    518 #ifdef USE_LOONGSON_MMI
    519    p = loongson_extract_pi16 (p, pos);
    520 #else
    521    p = shift (shift (p, (3 - pos) * 16), -48);
    522 #endif
    523 
    524    t1 = shift (p, 36 - 11);
    525    t2 = shift (p, 16 - 5);
    526 
    527    p = _mm_or_si64 (t1, p);
    528    p = _mm_or_si64 (t2, p);
    529    p = _mm_and_si64 (p, MC (565_rgb));
    530 
    531    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    532    return _mm_srli_pi16 (pixel, 8);
    533 }
    534 
    535 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
    536 *
    537 *    AARRGGBBRRGGBB
    538 */
    539 static force_inline void
    540 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
    541 {
    542    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    543    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    544    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    545    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    546    if (full_alpha)
    547 alpha = _mm_cmpeq_pi32 (alpha, alpha);
    548 
    549    /* Replicate high bits into empty low bits. */
    550    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    551    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    552    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
    553 
    554    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
    555    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
    556    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
    557 
    558    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
    559    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
    560 
    561    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
    562    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
    563 }
    564 
    565 static force_inline __m64
    566 expand8888 (__m64 in, int pos)
    567 {
    568    if (pos == 0)
    569 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    570    else
    571 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
    572 }
    573 
    574 static force_inline __m64
    575 expandx888 (__m64 in, int pos)
    576 {
    577    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
    578 }
    579 
    580 static force_inline void
    581 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
    582 {
    583    __m64 v0, v1;
    584    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    585    *vout0 = expand8888 (v0, 0);
    586    *vout1 = expand8888 (v0, 1);
    587    *vout2 = expand8888 (v1, 0);
    588    *vout3 = expand8888 (v1, 1);
    589 }
    590 
    591 static force_inline __m64
    592 pack_565 (__m64 pixel, __m64 target, int pos)
    593 {
    594    __m64 p = pixel;
    595    __m64 t = target;
    596    __m64 r, g, b;
    597 
    598    r = _mm_and_si64 (p, MC (565_r));
    599    g = _mm_and_si64 (p, MC (565_g));
    600    b = _mm_and_si64 (p, MC (565_b));
    601 
    602 #ifdef USE_LOONGSON_MMI
    603    r = shift (r, -(32 - 8));
    604    g = shift (g, -(16 - 3));
    605    b = shift (b, -(0  + 3));
    606 
    607    p = _mm_or_si64 (r, g);
    608    p = _mm_or_si64 (p, b);
    609    return loongson_insert_pi16 (t, p, pos);
    610 #else
    611    r = shift (r, -(32 - 8) + pos * 16);
    612    g = shift (g, -(16 - 3) + pos * 16);
    613    b = shift (b, -(0  + 3) + pos * 16);
    614 
    615    if (pos == 0)
    616 t = _mm_and_si64 (t, MC (mask_0));
    617    else if (pos == 1)
    618 t = _mm_and_si64 (t, MC (mask_1));
    619    else if (pos == 2)
    620 t = _mm_and_si64 (t, MC (mask_2));
    621    else if (pos == 3)
    622 t = _mm_and_si64 (t, MC (mask_3));
    623 
    624    p = _mm_or_si64 (r, t);
    625    p = _mm_or_si64 (g, p);
    626 
    627    return _mm_or_si64 (b, p);
    628 #endif
    629 }
    630 
    631 static force_inline __m64
    632 pack_4xpacked565 (__m64 a, __m64 b)
    633 {
    634    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    635    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
    636 
    637    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    638    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
    639 
    640    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    641    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
    642 
    643    t0 = _mm_or_si64 (t0, g0);
    644    t1 = _mm_or_si64 (t1, g1);
    645 
    646    t0 = shift(t0, -5);
    647    t1 = shift(t1, -5 + 16);
    648    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
    649 }
    650 
    651 #ifndef _MSC_VER
    652 
    653 static force_inline __m64
    654 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
    655 {
    656    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
    657 }
    658 
    659 static force_inline __m64
    660 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
    661 {
    662    x = pix_multiply (x, a);
    663    y = pix_multiply (y, b);
    664 
    665    return pix_add (x, y);
    666 }
    667 
    668 #else
    669 
    670 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
    671 
    672 #define pack_4x565(v0, v1, v2, v3) \
    673    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
    674 
    675 #define pix_add_mul(x, a, y, b)	 \
    676    ( x = pix_multiply (x, a),	 \
    677      y = pix_multiply (y, b),	 \
    678      pix_add (x, y) )
    679 
    680 #endif
    681 
/* --------------- MMX code paths for fbcompose.c --------------------- */
    683 
    684 static force_inline __m64
    685 combine (const uint32_t *src, const uint32_t *mask)
    686 {
    687    __m64 vsrc = load8888 (src);
    688 
    689    if (mask)
    690    {
    691 __m64 m = load8888 (mask);
    692 
    693 m = expand_alpha (m);
    694 vsrc = pix_multiply (vsrc, m);
    695    }
    696 
    697    return vsrc;
    698 }
    699 
    700 static force_inline __m64
    701 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
    702 {
    703    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
    704 
    705    if (is_opaque (vsrc))
    706    {
    707 return vsrc;
    708    }
    709    else if (!is_zero (vsrc))
    710    {
    711 return over (vsrc, expand_alpha (vsrc),
    712 	     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    713    }
    714 
    715    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
    716 }
    717 
    718 static void
    719 mmx_combine_over_u (pixman_implementation_t *imp,
    720                    pixman_op_t              op,
    721                    uint32_t *               dest,
    722                    const uint32_t *         src,
    723                    const uint32_t *         mask,
    724                    int                      width)
    725 {
    726    const uint32_t *end = dest + width;
    727 
    728    while (dest < end)
    729    {
    730 __m64 vsrc = combine (src, mask);
    731 
    732 if (is_opaque (vsrc))
    733 {
    734     store8888 (dest, vsrc);
    735 }
    736 else if (!is_zero (vsrc))
    737 {
    738     __m64 sa = expand_alpha (vsrc);
    739     store8888 (dest, over (vsrc, sa, load8888 (dest)));
    740 }
    741 
    742 ++dest;
    743 ++src;
    744 if (mask)
    745     ++mask;
    746    }
    747    _mm_empty ();
    748 }
    749 
    750 static void
    751 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
    752                            pixman_op_t              op,
    753                            uint32_t *               dest,
    754                            const uint32_t *         src,
    755                            const uint32_t *         mask,
    756                            int                      width)
    757 {
    758    const uint32_t *end = dest + width;
    759 
    760    while (dest < end)
    761    {
    762 __m64 d, da;
    763 __m64 s = combine (src, mask);
    764 
    765 d = load8888 (dest);
    766 da = expand_alpha (d);
    767 store8888 (dest, over (d, da, s));
    768 
    769 ++dest;
    770 ++src;
    771 if (mask)
    772     mask++;
    773    }
    774    _mm_empty ();
    775 }
    776 
    777 static void
    778 mmx_combine_in_u (pixman_implementation_t *imp,
    779                  pixman_op_t              op,
    780                  uint32_t *               dest,
    781                  const uint32_t *         src,
    782                  const uint32_t *         mask,
    783                  int                      width)
    784 {
    785    const uint32_t *end = dest + width;
    786 
    787    while (dest < end)
    788    {
    789 __m64 a;
    790 __m64 x = combine (src, mask);
    791 
    792 a = load8888 (dest);
    793 a = expand_alpha (a);
    794 x = pix_multiply (x, a);
    795 
    796 store8888 (dest, x);
    797 
    798 ++dest;
    799 ++src;
    800 if (mask)
    801     mask++;
    802    }
    803    _mm_empty ();
    804 }
    805 
    806 static void
    807 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
    808                          pixman_op_t              op,
    809                          uint32_t *               dest,
    810                          const uint32_t *         src,
    811                          const uint32_t *         mask,
    812                          int                      width)
    813 {
    814    const uint32_t *end = dest + width;
    815 
    816    while (dest < end)
    817    {
    818 __m64 a = combine (src, mask);
    819 __m64 x;
    820 
    821 x = load8888 (dest);
    822 a = expand_alpha (a);
    823 x = pix_multiply (x, a);
    824 store8888 (dest, x);
    825 
    826 ++dest;
    827 ++src;
    828 if (mask)
    829     mask++;
    830    }
    831    _mm_empty ();
    832 }
    833 
    834 static void
    835 mmx_combine_out_u (pixman_implementation_t *imp,
    836                   pixman_op_t              op,
    837                   uint32_t *               dest,
    838                   const uint32_t *         src,
    839                   const uint32_t *         mask,
    840                   int                      width)
    841 {
    842    const uint32_t *end = dest + width;
    843 
    844    while (dest < end)
    845    {
    846 __m64 a;
    847 __m64 x = combine (src, mask);
    848 
    849 a = load8888 (dest);
    850 a = expand_alpha (a);
    851 a = negate (a);
    852 x = pix_multiply (x, a);
    853 store8888 (dest, x);
    854 
    855 ++dest;
    856 ++src;
    857 if (mask)
    858     mask++;
    859    }
    860    _mm_empty ();
    861 }
    862 
    863 static void
    864 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
    865                           pixman_op_t              op,
    866                           uint32_t *               dest,
    867                           const uint32_t *         src,
    868                           const uint32_t *         mask,
    869                           int                      width)
    870 {
    871    const uint32_t *end = dest + width;
    872 
    873    while (dest < end)
    874    {
    875 __m64 a = combine (src, mask);
    876 __m64 x;
    877 
    878 x = load8888 (dest);
    879 a = expand_alpha (a);
    880 a = negate (a);
    881 x = pix_multiply (x, a);
    882 
    883 store8888 (dest, x);
    884 
    885 ++dest;
    886 ++src;
    887 if (mask)
    888     mask++;
    889    }
    890    _mm_empty ();
    891 }
    892 
    893 static void
    894 mmx_combine_atop_u (pixman_implementation_t *imp,
    895                    pixman_op_t              op,
    896                    uint32_t *               dest,
    897                    const uint32_t *         src,
    898                    const uint32_t *         mask,
    899                    int                      width)
    900 {
    901    const uint32_t *end = dest + width;
    902 
    903    while (dest < end)
    904    {
    905 __m64 da, d, sia;
    906 __m64 s = combine (src, mask);
    907 
    908 d = load8888 (dest);
    909 sia = expand_alpha (s);
    910 sia = negate (sia);
    911 da = expand_alpha (d);
    912 s = pix_add_mul (s, da, d, sia);
    913 store8888 (dest, s);
    914 
    915 ++dest;
    916 ++src;
    917 if (mask)
    918     mask++;
    919    }
    920    _mm_empty ();
    921 }
    922 
    923 static void
    924 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
    925                            pixman_op_t              op,
    926                            uint32_t *               dest,
    927                            const uint32_t *         src,
    928                            const uint32_t *         mask,
    929                            int                      width)
    930 {
    931    const uint32_t *end;
    932 
    933    end = dest + width;
    934 
    935    while (dest < end)
    936    {
    937 __m64 dia, d, sa;
    938 __m64 s = combine (src, mask);
    939 
    940 d = load8888 (dest);
    941 sa = expand_alpha (s);
    942 dia = expand_alpha (d);
    943 dia = negate (dia);
    944 s = pix_add_mul (s, dia, d, sa);
    945 store8888 (dest, s);
    946 
    947 ++dest;
    948 ++src;
    949 if (mask)
    950     mask++;
    951    }
    952    _mm_empty ();
    953 }
    954 
    955 static void
    956 mmx_combine_xor_u (pixman_implementation_t *imp,
    957                   pixman_op_t              op,
    958                   uint32_t *               dest,
    959                   const uint32_t *         src,
    960                   const uint32_t *         mask,
    961                   int                      width)
    962 {
    963    const uint32_t *end = dest + width;
    964 
    965    while (dest < end)
    966    {
    967 __m64 dia, d, sia;
    968 __m64 s = combine (src, mask);
    969 
    970 d = load8888 (dest);
    971 sia = expand_alpha (s);
    972 dia = expand_alpha (d);
    973 sia = negate (sia);
    974 dia = negate (dia);
    975 s = pix_add_mul (s, dia, d, sia);
    976 store8888 (dest, s);
    977 
    978 ++dest;
    979 ++src;
    980 if (mask)
    981     mask++;
    982    }
    983    _mm_empty ();
    984 }
    985 
    986 static void
    987 mmx_combine_add_u (pixman_implementation_t *imp,
    988                   pixman_op_t              op,
    989                   uint32_t *               dest,
    990                   const uint32_t *         src,
    991                   const uint32_t *         mask,
    992                   int                      width)
    993 {
    994    const uint32_t *end = dest + width;
    995 
    996    while (dest < end)
    997    {
    998 __m64 d;
    999 __m64 s = combine (src, mask);
   1000 
   1001 d = load8888 (dest);
   1002 s = pix_add (s, d);
   1003 store8888 (dest, s);
   1004 
   1005 ++dest;
   1006 ++src;
   1007 if (mask)
   1008     mask++;
   1009    }
   1010    _mm_empty ();
   1011 }
   1012 
   1013 static void
   1014 mmx_combine_saturate_u (pixman_implementation_t *imp,
   1015                        pixman_op_t              op,
   1016                        uint32_t *               dest,
   1017                        const uint32_t *         src,
   1018                        const uint32_t *         mask,
   1019                        int                      width)
   1020 {
   1021    const uint32_t *end = dest + width;
   1022 
   1023    while (dest < end)
   1024    {
   1025 uint32_t s, sa, da;
   1026 uint32_t d = *dest;
   1027 __m64 ms = combine (src, mask);
   1028 __m64 md = load8888 (dest);
   1029 
   1030 store8888(&s, ms);
   1031 da = ~d >> 24;
   1032 sa = s >> 24;
   1033 
   1034 if (sa > da)
   1035 {
   1036     uint32_t quot = DIV_UN8 (da, sa) << 24;
   1037     __m64 msa = load8888 (&quot);
   1038     msa = expand_alpha (msa);
   1039     ms = pix_multiply (ms, msa);
   1040 }
   1041 
   1042 md = pix_add (md, ms);
   1043 store8888 (dest, md);
   1044 
   1045 ++src;
   1046 ++dest;
   1047 if (mask)
   1048     mask++;
   1049    }
   1050    _mm_empty ();
   1051 }
   1052 
   1053 static void
   1054 mmx_combine_src_ca (pixman_implementation_t *imp,
   1055                    pixman_op_t              op,
   1056                    uint32_t *               dest,
   1057                    const uint32_t *         src,
   1058                    const uint32_t *         mask,
   1059                    int                      width)
   1060 {
   1061    const uint32_t *end = src + width;
   1062 
   1063    while (src < end)
   1064    {
   1065 __m64 a = load8888 (mask);
   1066 __m64 s = load8888 (src);
   1067 
   1068 s = pix_multiply (s, a);
   1069 store8888 (dest, s);
   1070 
   1071 ++src;
   1072 ++mask;
   1073 ++dest;
   1074    }
   1075    _mm_empty ();
   1076 }
   1077 
   1078 static void
   1079 mmx_combine_over_ca (pixman_implementation_t *imp,
   1080                     pixman_op_t              op,
   1081                     uint32_t *               dest,
   1082                     const uint32_t *         src,
   1083                     const uint32_t *         mask,
   1084                     int                      width)
   1085 {
   1086    const uint32_t *end = src + width;
   1087 
   1088    while (src < end)
   1089    {
   1090 __m64 a = load8888 (mask);
   1091 __m64 s = load8888 (src);
   1092 __m64 d = load8888 (dest);
   1093 __m64 sa = expand_alpha (s);
   1094 
   1095 store8888 (dest, in_over (s, sa, a, d));
   1096 
   1097 ++src;
   1098 ++dest;
   1099 ++mask;
   1100    }
   1101    _mm_empty ();
   1102 }
   1103 
   1104 static void
   1105 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
   1106                             pixman_op_t              op,
   1107                             uint32_t *               dest,
   1108                             const uint32_t *         src,
   1109                             const uint32_t *         mask,
   1110                             int                      width)
   1111 {
   1112    const uint32_t *end = src + width;
   1113 
   1114    while (src < end)
   1115    {
   1116 __m64 a = load8888 (mask);
   1117 __m64 s = load8888 (src);
   1118 __m64 d = load8888 (dest);
   1119 __m64 da = expand_alpha (d);
   1120 
   1121 store8888 (dest, over (d, da, in (s, a)));
   1122 
   1123 ++src;
   1124 ++dest;
   1125 ++mask;
   1126    }
   1127    _mm_empty ();
   1128 }
   1129 
   1130 static void
   1131 mmx_combine_in_ca (pixman_implementation_t *imp,
   1132                   pixman_op_t              op,
   1133                   uint32_t *               dest,
   1134                   const uint32_t *         src,
   1135                   const uint32_t *         mask,
   1136                   int                      width)
   1137 {
   1138    const uint32_t *end = src + width;
   1139 
   1140    while (src < end)
   1141    {
   1142 __m64 a = load8888 (mask);
   1143 __m64 s = load8888 (src);
   1144 __m64 d = load8888 (dest);
   1145 __m64 da = expand_alpha (d);
   1146 
   1147 s = pix_multiply (s, a);
   1148 s = pix_multiply (s, da);
   1149 store8888 (dest, s);
   1150 
   1151 ++src;
   1152 ++dest;
   1153 ++mask;
   1154    }
   1155    _mm_empty ();
   1156 }
   1157 
   1158 static void
   1159 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
   1160                           pixman_op_t              op,
   1161                           uint32_t *               dest,
   1162                           const uint32_t *         src,
   1163                           const uint32_t *         mask,
   1164                           int                      width)
   1165 {
   1166    const uint32_t *end = src + width;
   1167 
   1168    while (src < end)
   1169    {
   1170 __m64 a = load8888 (mask);
   1171 __m64 s = load8888 (src);
   1172 __m64 d = load8888 (dest);
   1173 __m64 sa = expand_alpha (s);
   1174 
   1175 a = pix_multiply (a, sa);
   1176 d = pix_multiply (d, a);
   1177 store8888 (dest, d);
   1178 
   1179 ++src;
   1180 ++dest;
   1181 ++mask;
   1182    }
   1183    _mm_empty ();
   1184 }
   1185 
   1186 static void
   1187 mmx_combine_out_ca (pixman_implementation_t *imp,
   1188                    pixman_op_t              op,
   1189                    uint32_t *               dest,
   1190                    const uint32_t *         src,
   1191                    const uint32_t *         mask,
   1192                    int                      width)
   1193 {
   1194    const uint32_t *end = src + width;
   1195 
   1196    while (src < end)
   1197    {
   1198 __m64 a = load8888 (mask);
   1199 __m64 s = load8888 (src);
   1200 __m64 d = load8888 (dest);
   1201 __m64 da = expand_alpha (d);
   1202 
   1203 da = negate (da);
   1204 s = pix_multiply (s, a);
   1205 s = pix_multiply (s, da);
   1206 store8888 (dest, s);
   1207 
   1208 ++src;
   1209 ++dest;
   1210 ++mask;
   1211    }
   1212    _mm_empty ();
   1213 }
   1214 
   1215 static void
   1216 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
   1217                            pixman_op_t              op,
   1218                            uint32_t *               dest,
   1219                            const uint32_t *         src,
   1220                            const uint32_t *         mask,
   1221                            int                      width)
   1222 {
   1223    const uint32_t *end = src + width;
   1224 
   1225    while (src < end)
   1226    {
   1227 __m64 a = load8888 (mask);
   1228 __m64 s = load8888 (src);
   1229 __m64 d = load8888 (dest);
   1230 __m64 sa = expand_alpha (s);
   1231 
   1232 a = pix_multiply (a, sa);
   1233 a = negate (a);
   1234 d = pix_multiply (d, a);
   1235 store8888 (dest, d);
   1236 
   1237 ++src;
   1238 ++dest;
   1239 ++mask;
   1240    }
   1241    _mm_empty ();
   1242 }
   1243 
   1244 static void
   1245 mmx_combine_atop_ca (pixman_implementation_t *imp,
   1246                     pixman_op_t              op,
   1247                     uint32_t *               dest,
   1248                     const uint32_t *         src,
   1249                     const uint32_t *         mask,
   1250                     int                      width)
   1251 {
   1252    const uint32_t *end = src + width;
   1253 
   1254    while (src < end)
   1255    {
   1256 __m64 a = load8888 (mask);
   1257 __m64 s = load8888 (src);
   1258 __m64 d = load8888 (dest);
   1259 __m64 da = expand_alpha (d);
   1260 __m64 sa = expand_alpha (s);
   1261 
   1262 s = pix_multiply (s, a);
   1263 a = pix_multiply (a, sa);
   1264 a = negate (a);
   1265 d = pix_add_mul (d, a, s, da);
   1266 store8888 (dest, d);
   1267 
   1268 ++src;
   1269 ++dest;
   1270 ++mask;
   1271    }
   1272    _mm_empty ();
   1273 }
   1274 
   1275 static void
   1276 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
   1277                             pixman_op_t              op,
   1278                             uint32_t *               dest,
   1279                             const uint32_t *         src,
   1280                             const uint32_t *         mask,
   1281                             int                      width)
   1282 {
   1283    const uint32_t *end = src + width;
   1284 
   1285    while (src < end)
   1286    {
   1287 __m64 a = load8888 (mask);
   1288 __m64 s = load8888 (src);
   1289 __m64 d = load8888 (dest);
   1290 __m64 da = expand_alpha (d);
   1291 __m64 sa = expand_alpha (s);
   1292 
   1293 s = pix_multiply (s, a);
   1294 a = pix_multiply (a, sa);
   1295 da = negate (da);
   1296 d = pix_add_mul (d, a, s, da);
   1297 store8888 (dest, d);
   1298 
   1299 ++src;
   1300 ++dest;
   1301 ++mask;
   1302    }
   1303    _mm_empty ();
   1304 }
   1305 
   1306 static void
   1307 mmx_combine_xor_ca (pixman_implementation_t *imp,
   1308                    pixman_op_t              op,
   1309                    uint32_t *               dest,
   1310                    const uint32_t *         src,
   1311                    const uint32_t *         mask,
   1312                    int                      width)
   1313 {
   1314    const uint32_t *end = src + width;
   1315 
   1316    while (src < end)
   1317    {
   1318 __m64 a = load8888 (mask);
   1319 __m64 s = load8888 (src);
   1320 __m64 d = load8888 (dest);
   1321 __m64 da = expand_alpha (d);
   1322 __m64 sa = expand_alpha (s);
   1323 
   1324 s = pix_multiply (s, a);
   1325 a = pix_multiply (a, sa);
   1326 da = negate (da);
   1327 a = negate (a);
   1328 d = pix_add_mul (d, a, s, da);
   1329 store8888 (dest, d);
   1330 
   1331 ++src;
   1332 ++dest;
   1333 ++mask;
   1334    }
   1335    _mm_empty ();
   1336 }
   1337 
   1338 static void
   1339 mmx_combine_add_ca (pixman_implementation_t *imp,
   1340                    pixman_op_t              op,
   1341                    uint32_t *               dest,
   1342                    const uint32_t *         src,
   1343                    const uint32_t *         mask,
   1344                    int                      width)
   1345 {
   1346    const uint32_t *end = src + width;
   1347 
   1348    while (src < end)
   1349    {
   1350 __m64 a = load8888 (mask);
   1351 __m64 s = load8888 (src);
   1352 __m64 d = load8888 (dest);
   1353 
   1354 s = pix_multiply (s, a);
   1355 d = pix_add (s, d);
   1356 store8888 (dest, d);
   1357 
   1358 ++src;
   1359 ++dest;
   1360 ++mask;
   1361    }
   1362    _mm_empty ();
   1363 }
   1364 
   1365 /* ------------- MMX code paths called from fbpict.c -------------------- */
   1366 
   1367 static void
   1368 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
   1369                           pixman_composite_info_t *info)
   1370 {
   1371    PIXMAN_COMPOSITE_ARGS (info);
   1372    uint32_t src;
   1373    uint32_t    *dst_line, *dst;
   1374    int32_t w;
   1375    int dst_stride;
   1376    __m64 vsrc, vsrca;
   1377 
   1378    CHECKPOINT ();
   1379 
   1380    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1381 
   1382    if (src == 0)
   1383 return;
   1384 
   1385    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1386 
   1387    vsrc = load8888 (&src);
   1388    vsrca = expand_alpha (vsrc);
   1389 
   1390    while (height--)
   1391    {
   1392 dst = dst_line;
   1393 dst_line += dst_stride;
   1394 w = width;
   1395 
   1396 CHECKPOINT ();
   1397 
   1398 while (w && (uintptr_t)dst & 7)
   1399 {
   1400     store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
   1401 
   1402     w--;
   1403     dst++;
   1404 }
   1405 
   1406 while (w >= 2)
   1407 {
   1408     __m64 vdest;
   1409     __m64 dest0, dest1;
   1410 
   1411     vdest = *(__m64 *)dst;
   1412 
   1413     dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
   1414     dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
   1415 
   1416     *(__m64 *)dst = pack8888 (dest0, dest1);
   1417 
   1418     dst += 2;
   1419     w -= 2;
   1420 }
   1421 
   1422 CHECKPOINT ();
   1423 
   1424 if (w)
   1425 {
   1426     store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
   1427 }
   1428    }
   1429 
   1430    _mm_empty ();
   1431 }
   1432 
   1433 static void
   1434 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
   1435                           pixman_composite_info_t *info)
   1436 {
   1437    PIXMAN_COMPOSITE_ARGS (info);
   1438    uint32_t src;
   1439    uint16_t    *dst_line, *dst;
   1440    int32_t w;
   1441    int dst_stride;
   1442    __m64 vsrc, vsrca;
   1443 
   1444    CHECKPOINT ();
   1445 
   1446    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1447 
   1448    if (src == 0)
   1449 return;
   1450 
   1451    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   1452 
   1453    vsrc = load8888 (&src);
   1454    vsrca = expand_alpha (vsrc);
   1455 
   1456    while (height--)
   1457    {
   1458 dst = dst_line;
   1459 dst_line += dst_stride;
   1460 w = width;
   1461 
   1462 CHECKPOINT ();
   1463 
   1464 while (w && (uintptr_t)dst & 7)
   1465 {
   1466     uint64_t d = *dst;
   1467     __m64 vdest = expand565 (to_m64 (d), 0);
   1468 
   1469     vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
   1470     *dst = to_uint64 (vdest);
   1471 
   1472     w--;
   1473     dst++;
   1474 }
   1475 
   1476 while (w >= 4)
   1477 {
   1478     __m64 vdest = *(__m64 *)dst;
   1479     __m64 v0, v1, v2, v3;
   1480 
   1481     expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   1482 
   1483     v0 = over (vsrc, vsrca, v0);
   1484     v1 = over (vsrc, vsrca, v1);
   1485     v2 = over (vsrc, vsrca, v2);
   1486     v3 = over (vsrc, vsrca, v3);
   1487 
   1488     *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   1489 
   1490     dst += 4;
   1491     w -= 4;
   1492 }
   1493 
   1494 CHECKPOINT ();
   1495 
   1496 while (w)
   1497 {
   1498     uint64_t d = *dst;
   1499     __m64 vdest = expand565 (to_m64 (d), 0);
   1500 
   1501     vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
   1502     *dst = to_uint64 (vdest);
   1503 
   1504     w--;
   1505     dst++;
   1506 }
   1507    }
   1508 
   1509    _mm_empty ();
   1510 }
   1511 
   1512 static void
   1513 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
   1514                                   pixman_composite_info_t *info)
   1515 {
   1516    PIXMAN_COMPOSITE_ARGS (info);
   1517    uint32_t src;
   1518    uint32_t    *dst_line;
   1519    uint32_t    *mask_line;
   1520    int dst_stride, mask_stride;
   1521    __m64 vsrc, vsrca;
   1522 
   1523    CHECKPOINT ();
   1524 
   1525    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1526 
   1527    if (src == 0)
   1528 return;
   1529 
   1530    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1531    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   1532 
   1533    vsrc = load8888 (&src);
   1534    vsrca = expand_alpha (vsrc);
   1535 
   1536    while (height--)
   1537    {
   1538 int twidth = width;
   1539 uint32_t *p = (uint32_t *)mask_line;
   1540 uint32_t *q = (uint32_t *)dst_line;
   1541 
   1542 while (twidth && (uintptr_t)q & 7)
   1543 {
   1544     uint32_t m = *(uint32_t *)p;
   1545 
   1546     if (m)
   1547     {
   1548 	__m64 vdest = load8888 (q);
   1549 	vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
   1550 	store8888 (q, vdest);
   1551     }
   1552 
   1553     twidth--;
   1554     p++;
   1555     q++;
   1556 }
   1557 
   1558 while (twidth >= 2)
   1559 {
   1560     uint32_t m0, m1;
   1561     m0 = *p;
   1562     m1 = *(p + 1);
   1563 
   1564     if (m0 | m1)
   1565     {
   1566 	__m64 dest0, dest1;
   1567 	__m64 vdest = *(__m64 *)q;
   1568 
   1569 	dest0 = in_over (vsrc, vsrca, load8888 (&m0),
   1570 	                 expand8888 (vdest, 0));
   1571 	dest1 = in_over (vsrc, vsrca, load8888 (&m1),
   1572 	                 expand8888 (vdest, 1));
   1573 
   1574 	*(__m64 *)q = pack8888 (dest0, dest1);
   1575     }
   1576 
   1577     p += 2;
   1578     q += 2;
   1579     twidth -= 2;
   1580 }
   1581 
   1582 if (twidth)
   1583 {
   1584     uint32_t m = *(uint32_t *)p;
   1585 
   1586     if (m)
   1587     {
   1588 	__m64 vdest = load8888 (q);
   1589 	vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
   1590 	store8888 (q, vdest);
   1591     }
   1592 
   1593     twidth--;
   1594     p++;
   1595     q++;
   1596 }
   1597 
   1598 dst_line += dst_stride;
   1599 mask_line += mask_stride;
   1600    }
   1601 
   1602    _mm_empty ();
   1603 }
   1604 
   1605 static void
   1606 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
   1607                                pixman_composite_info_t *info)
   1608 {
   1609    PIXMAN_COMPOSITE_ARGS (info);
   1610    uint32_t    *dst_line, *dst;
   1611    uint32_t    *src_line, *src;
   1612    uint32_t mask;
   1613    __m64 vmask;
   1614    int dst_stride, src_stride;
   1615    int32_t w;
   1616 
   1617    CHECKPOINT ();
   1618 
   1619    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1620    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1621 
   1622    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
   1623    vmask = expand_alpha (load8888 (&mask));
   1624 
   1625    while (height--)
   1626    {
   1627 dst = dst_line;
   1628 dst_line += dst_stride;
   1629 src = src_line;
   1630 src_line += src_stride;
   1631 w = width;
   1632 
   1633 while (w && (uintptr_t)dst & 7)
   1634 {
   1635     __m64 s = load8888 (src);
   1636     __m64 d = load8888 (dst);
   1637 
   1638     store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
   1639 
   1640     w--;
   1641     dst++;
   1642     src++;
   1643 }
   1644 
   1645 while (w >= 2)
   1646 {
   1647     __m64 vs = ldq_u ((__m64 *)src);
   1648     __m64 vd = *(__m64 *)dst;
   1649     __m64 vsrc0 = expand8888 (vs, 0);
   1650     __m64 vsrc1 = expand8888 (vs, 1);
   1651 
   1652     *(__m64 *)dst = pack8888 (
   1653         in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
   1654         in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
   1655 
   1656     w -= 2;
   1657     dst += 2;
   1658     src += 2;
   1659 }
   1660 
   1661 if (w)
   1662 {
   1663     __m64 s = load8888 (src);
   1664     __m64 d = load8888 (dst);
   1665 
   1666     store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
   1667 }
   1668    }
   1669 
   1670    _mm_empty ();
   1671 }
   1672 
   1673 static void
   1674 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
   1675                                pixman_composite_info_t *info)
   1676 {
   1677    PIXMAN_COMPOSITE_ARGS (info);
   1678    uint32_t *dst_line, *dst;
   1679    uint32_t *src_line, *src;
   1680    uint32_t mask;
   1681    __m64 vmask;
   1682    int dst_stride, src_stride;
   1683    int32_t w;
   1684    __m64 srca;
   1685 
   1686    CHECKPOINT ();
   1687 
   1688    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1689    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1690    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
   1691 
   1692    vmask = expand_alpha (load8888 (&mask));
   1693    srca = MC (4x00ff);
   1694 
   1695    while (height--)
   1696    {
   1697 dst = dst_line;
   1698 dst_line += dst_stride;
   1699 src = src_line;
   1700 src_line += src_stride;
   1701 w = width;
   1702 
   1703 while (w && (uintptr_t)dst & 7)
   1704 {
   1705     uint32_t ssrc = *src | 0xff000000;
   1706     __m64 s = load8888 (&ssrc);
   1707     __m64 d = load8888 (dst);
   1708 
   1709     store8888 (dst, in_over (s, srca, vmask, d));
   1710 
   1711     w--;
   1712     dst++;
   1713     src++;
   1714 }
   1715 
   1716 while (w >= 16)
   1717 {
   1718     __m64 vd0 = *(__m64 *)(dst + 0);
   1719     __m64 vd1 = *(__m64 *)(dst + 2);
   1720     __m64 vd2 = *(__m64 *)(dst + 4);
   1721     __m64 vd3 = *(__m64 *)(dst + 6);
   1722     __m64 vd4 = *(__m64 *)(dst + 8);
   1723     __m64 vd5 = *(__m64 *)(dst + 10);
   1724     __m64 vd6 = *(__m64 *)(dst + 12);
   1725     __m64 vd7 = *(__m64 *)(dst + 14);
   1726 
   1727     __m64 vs0 = ldq_u ((__m64 *)(src + 0));
   1728     __m64 vs1 = ldq_u ((__m64 *)(src + 2));
   1729     __m64 vs2 = ldq_u ((__m64 *)(src + 4));
   1730     __m64 vs3 = ldq_u ((__m64 *)(src + 6));
   1731     __m64 vs4 = ldq_u ((__m64 *)(src + 8));
   1732     __m64 vs5 = ldq_u ((__m64 *)(src + 10));
   1733     __m64 vs6 = ldq_u ((__m64 *)(src + 12));
   1734     __m64 vs7 = ldq_u ((__m64 *)(src + 14));
   1735 
   1736     vd0 = pack8888 (
   1737         in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
   1738         in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
   1739 
   1740     vd1 = pack8888 (
   1741         in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
   1742         in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
   1743 
   1744     vd2 = pack8888 (
   1745         in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
   1746         in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
   1747 
   1748     vd3 = pack8888 (
   1749         in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
   1750         in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
   1751 
   1752     vd4 = pack8888 (
   1753         in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
   1754         in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
   1755 
   1756     vd5 = pack8888 (
   1757         in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
   1758         in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
   1759 
   1760     vd6 = pack8888 (
   1761         in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
   1762         in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
   1763 
   1764     vd7 = pack8888 (
   1765         in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
   1766         in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
   1767 
   1768     *(__m64 *)(dst + 0) = vd0;
   1769     *(__m64 *)(dst + 2) = vd1;
   1770     *(__m64 *)(dst + 4) = vd2;
   1771     *(__m64 *)(dst + 6) = vd3;
   1772     *(__m64 *)(dst + 8) = vd4;
   1773     *(__m64 *)(dst + 10) = vd5;
   1774     *(__m64 *)(dst + 12) = vd6;
   1775     *(__m64 *)(dst + 14) = vd7;
   1776 
   1777     w -= 16;
   1778     dst += 16;
   1779     src += 16;
   1780 }
   1781 
   1782 while (w)
   1783 {
   1784     uint32_t ssrc = *src | 0xff000000;
   1785     __m64 s = load8888 (&ssrc);
   1786     __m64 d = load8888 (dst);
   1787 
   1788     store8888 (dst, in_over (s, srca, vmask, d));
   1789 
   1790     w--;
   1791     dst++;
   1792     src++;
   1793 }
   1794    }
   1795 
   1796    _mm_empty ();
   1797 }
   1798 
   1799 static void
   1800 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
   1801                              pixman_composite_info_t *info)
   1802 {
   1803    PIXMAN_COMPOSITE_ARGS (info);
   1804    uint32_t *dst_line, *dst;
   1805    uint32_t *src_line, *src;
   1806    uint32_t s;
   1807    int dst_stride, src_stride;
   1808    uint8_t a;
   1809    int32_t w;
   1810 
   1811    CHECKPOINT ();
   1812 
   1813    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1814    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1815 
   1816    while (height--)
   1817    {
   1818 dst = dst_line;
   1819 dst_line += dst_stride;
   1820 src = src_line;
   1821 src_line += src_stride;
   1822 w = width;
   1823 
   1824 while (w--)
   1825 {
   1826     s = *src++;
   1827     a = s >> 24;
   1828 
   1829     if (a == 0xff)
   1830     {
   1831 	*dst = s;
   1832     }
   1833     else if (s)
   1834     {
   1835 	__m64 ms, sa;
   1836 	ms = load8888 (&s);
   1837 	sa = expand_alpha (ms);
   1838 	store8888 (dst, over (ms, sa, load8888 (dst)));
   1839     }
   1840 
   1841     dst++;
   1842 }
   1843    }
   1844    _mm_empty ();
   1845 }
   1846 
   1847 static void
   1848 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
   1849                              pixman_composite_info_t *info)
   1850 {
   1851    PIXMAN_COMPOSITE_ARGS (info);
   1852    uint16_t    *dst_line, *dst;
   1853    uint32_t    *src_line, *src;
   1854    int dst_stride, src_stride;
   1855    int32_t w;
   1856 
   1857    CHECKPOINT ();
   1858 
   1859    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   1860    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   1861 
   1862 #if 0
   1863    /* FIXME */
   1864    assert (src_image->drawable == mask_image->drawable);
   1865 #endif
   1866 
   1867    while (height--)
   1868    {
   1869 dst = dst_line;
   1870 dst_line += dst_stride;
   1871 src = src_line;
   1872 src_line += src_stride;
   1873 w = width;
   1874 
   1875 CHECKPOINT ();
   1876 
   1877 while (w && (uintptr_t)dst & 7)
   1878 {
   1879     __m64 vsrc = load8888 (src);
   1880     uint64_t d = *dst;
   1881     __m64 vdest = expand565 (to_m64 (d), 0);
   1882 
   1883     vdest = pack_565 (
   1884 	over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
   1885 
   1886     *dst = to_uint64 (vdest);
   1887 
   1888     w--;
   1889     dst++;
   1890     src++;
   1891 }
   1892 
   1893 CHECKPOINT ();
   1894 
   1895 while (w >= 4)
   1896 {
   1897     __m64 vdest = *(__m64 *)dst;
   1898     __m64 v0, v1, v2, v3;
   1899     __m64 vsrc0, vsrc1, vsrc2, vsrc3;
   1900 
   1901     expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   1902 
   1903     vsrc0 = load8888 ((src + 0));
   1904     vsrc1 = load8888 ((src + 1));
   1905     vsrc2 = load8888 ((src + 2));
   1906     vsrc3 = load8888 ((src + 3));
   1907 
   1908     v0 = over (vsrc0, expand_alpha (vsrc0), v0);
   1909     v1 = over (vsrc1, expand_alpha (vsrc1), v1);
   1910     v2 = over (vsrc2, expand_alpha (vsrc2), v2);
   1911     v3 = over (vsrc3, expand_alpha (vsrc3), v3);
   1912 
   1913     *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   1914 
   1915     w -= 4;
   1916     dst += 4;
   1917     src += 4;
   1918 }
   1919 
   1920 CHECKPOINT ();
   1921 
   1922 while (w)
   1923 {
   1924     __m64 vsrc = load8888 (src);
   1925     uint64_t d = *dst;
   1926     __m64 vdest = expand565 (to_m64 (d), 0);
   1927 
   1928     vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
   1929 
   1930     *dst = to_uint64 (vdest);
   1931 
   1932     w--;
   1933     dst++;
   1934     src++;
   1935 }
   1936    }
   1937 
   1938    _mm_empty ();
   1939 }
   1940 
   1941 static void
   1942 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
   1943                             pixman_composite_info_t *info)
   1944 {
   1945    PIXMAN_COMPOSITE_ARGS (info);
   1946    uint32_t src, srca;
   1947    uint32_t *dst_line, *dst;
   1948    uint8_t *mask_line, *mask;
   1949    int dst_stride, mask_stride;
   1950    int32_t w;
   1951    __m64 vsrc, vsrca;
   1952    uint64_t srcsrc;
   1953 
   1954    CHECKPOINT ();
   1955 
   1956    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   1957 
   1958    srca = src >> 24;
   1959    if (src == 0)
   1960 return;
   1961 
   1962    srcsrc = (uint64_t)src << 32 | src;
   1963 
   1964    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   1965    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   1966 
   1967    vsrc = load8888 (&src);
   1968    vsrca = expand_alpha (vsrc);
   1969 
   1970    while (height--)
   1971    {
   1972 dst = dst_line;
   1973 dst_line += dst_stride;
   1974 mask = mask_line;
   1975 mask_line += mask_stride;
   1976 w = width;
   1977 
   1978 CHECKPOINT ();
   1979 
   1980 while (w && (uintptr_t)dst & 7)
   1981 {
   1982     uint64_t m = *mask;
   1983 
   1984     if (m)
   1985     {
   1986 	__m64 vdest = in_over (vsrc, vsrca,
   1987 			       expand_alpha_rev (to_m64 (m)),
   1988 			       load8888 (dst));
   1989 
   1990 	store8888 (dst, vdest);
   1991     }
   1992 
   1993     w--;
   1994     mask++;
   1995     dst++;
   1996 }
   1997 
   1998 CHECKPOINT ();
   1999 
   2000 while (w >= 2)
   2001 {
   2002     uint64_t m0, m1;
   2003 
   2004     m0 = *mask;
   2005     m1 = *(mask + 1);
   2006 
   2007     if (srca == 0xff && (m0 & m1) == 0xff)
   2008     {
   2009 	*(uint64_t *)dst = srcsrc;
   2010     }
   2011     else if (m0 | m1)
   2012     {
   2013 	__m64 vdest;
   2014 	__m64 dest0, dest1;
   2015 
   2016 	vdest = *(__m64 *)dst;
   2017 
   2018 	dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
   2019 			 expand8888 (vdest, 0));
   2020 	dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
   2021 			 expand8888 (vdest, 1));
   2022 
   2023 	*(__m64 *)dst = pack8888 (dest0, dest1);
   2024     }
   2025 
   2026     mask += 2;
   2027     dst += 2;
   2028     w -= 2;
   2029 }
   2030 
   2031 CHECKPOINT ();
   2032 
   2033 if (w)
   2034 {
   2035     uint64_t m = *mask;
   2036 
   2037     if (m)
   2038     {
   2039 	__m64 vdest = load8888 (dst);
   2040 
   2041 	vdest = in_over (
   2042 	    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
   2043 	store8888 (dst, vdest);
   2044     }
   2045 }
   2046    }
   2047 
   2048    _mm_empty ();
   2049 }
   2050 
/*
 * Fill a width x height rectangle whose top-left pixel is (x, y) inside the
 * buffer `bits` with the pixel value `filler`.
 *
 * `stride` is the row pitch expressed in uint32_t units (it is scaled by
 * sizeof (uint32_t) below); `bpp` selects the pixel size.  Only 8, 16 and 32
 * bits per pixel are handled: any other value makes the function return
 * FALSE before anything is written.  Otherwise TRUE is returned.
 *
 * `imp` is not referenced in the body.
 */
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t		   filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t     *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    /* Seven extra copies of the fill pattern, used by the asm store loop. */
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    /*
     * Per-depth setup: compute the address of the first byte to write
     * (byte_line), the number of bytes per row (byte_width), convert stride
     * to bytes, and replicate narrow pixels so that `filler` holds the
     * pattern across all 32 bits.
     */
    if (bpp == 8)
    {
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
        /* Low byte copied into all four bytes. */
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
        /* Low 16 bits copied into both halves. */
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    /* 64-bit pattern: the 32-bit filler repeated twice. */
    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    /*
     * Copy vfill (operand %7) into v1..v7 (operands %0..%6) so the store
     * loop below has eight operands that all hold the pattern.
     */
    __asm__ (
        "movq		%7,	%0\n"
        "movq		%7,	%1\n"
        "movq		%7,	%2\n"
        "movq		%7,	%3\n"
        "movq		%7,	%4\n"
        "movq		%7,	%5\n"
        "movq		%7,	%6\n"
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	/*
	 * Head: step with 1-, 2- and then 4-byte stores until d is 8-byte
	 * aligned (or too few bytes remain for the next store size).
	 */
	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	/* Bulk: 64 bytes per iteration as eight 8-byte stores. */
	while (w >= 64)
	{
#if defined __GNUC__ && defined USE_X86_MMX
	    /* %0 is the destination pointer, %1..%8 are the pattern copies. */
	    __asm__ (
	        "movq	%1,	  (%0)\n"
	        "movq	%2,	 8(%0)\n"
	        "movq	%3,	16(%0)\n"
	        "movq	%4,	24(%0)\n"
	        "movq	%5,	32(%0)\n"
	        "movq	%6,	40(%0)\n"
	        "movq	%7,	48(%0)\n"
	        "movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64*) (d +  0) = vfill;
	    *(__m64*) (d +  8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	/* Tail: finish the row with 4-, 2- and 1-byte stores. */
	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}
	if (w >= 1)
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

    }

    /* As in every routine in this file, _mm_empty () is called on exit. */
    _mm_empty ();
    return TRUE;
}
   2201 
   2202 static void
   2203 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
   2204                             pixman_composite_info_t *info)
   2205 {
   2206    PIXMAN_COMPOSITE_ARGS (info);
   2207    uint16_t    *dst_line, *dst;
   2208    uint32_t    *src_line, *src, s;
   2209    int dst_stride, src_stride;
   2210    int32_t w;
   2211 
   2212    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2213    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2214 
   2215    while (height--)
   2216    {
   2217 dst = dst_line;
   2218 dst_line += dst_stride;
   2219 src = src_line;
   2220 src_line += src_stride;
   2221 w = width;
   2222 
   2223 while (w && (uintptr_t)dst & 7)
   2224 {
   2225     s = *src++;
   2226     *dst = convert_8888_to_0565 (s);
   2227     dst++;
   2228     w--;
   2229 }
   2230 
   2231 while (w >= 4)
   2232 {
   2233     __m64 vdest;
   2234     __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
   2235     __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
   2236 
   2237     vdest = pack_4xpacked565 (vsrc0, vsrc1);
   2238 
   2239     *(__m64 *)dst = vdest;
   2240 
   2241     w -= 4;
   2242     src += 4;
   2243     dst += 4;
   2244 }
   2245 
   2246 while (w)
   2247 {
   2248     s = *src++;
   2249     *dst = convert_8888_to_0565 (s);
   2250     dst++;
   2251     w--;
   2252 }
   2253    }
   2254 
   2255    _mm_empty ();
   2256 }
   2257 
   2258 static void
   2259 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
   2260                            pixman_composite_info_t *info)
   2261 {
   2262    PIXMAN_COMPOSITE_ARGS (info);
   2263    uint32_t src, srca;
   2264    uint32_t    *dst_line, *dst;
   2265    uint8_t     *mask_line, *mask;
   2266    int dst_stride, mask_stride;
   2267    int32_t w;
   2268    __m64 vsrc;
   2269    uint64_t srcsrc;
   2270 
   2271    CHECKPOINT ();
   2272 
   2273    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2274 
   2275    srca = src >> 24;
   2276    if (src == 0)
   2277    {
   2278 mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
   2279 	  PIXMAN_FORMAT_BPP (dest_image->bits.format),
   2280 	  dest_x, dest_y, width, height, 0);
   2281 return;
   2282    }
   2283 
   2284    srcsrc = (uint64_t)src << 32 | src;
   2285 
   2286    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2287    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2288 
   2289    vsrc = load8888 (&src);
   2290 
   2291    while (height--)
   2292    {
   2293 dst = dst_line;
   2294 dst_line += dst_stride;
   2295 mask = mask_line;
   2296 mask_line += mask_stride;
   2297 w = width;
   2298 
   2299 CHECKPOINT ();
   2300 
   2301 while (w && (uintptr_t)dst & 7)
   2302 {
   2303     uint64_t m = *mask;
   2304 
   2305     if (m)
   2306     {
   2307 	__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
   2308 
   2309 	store8888 (dst, vdest);
   2310     }
   2311     else
   2312     {
   2313 	*dst = 0;
   2314     }
   2315 
   2316     w--;
   2317     mask++;
   2318     dst++;
   2319 }
   2320 
   2321 CHECKPOINT ();
   2322 
   2323 while (w >= 2)
   2324 {
   2325     uint64_t m0, m1;
   2326     m0 = *mask;
   2327     m1 = *(mask + 1);
   2328 
   2329     if (srca == 0xff && (m0 & m1) == 0xff)
   2330     {
   2331 	*(uint64_t *)dst = srcsrc;
   2332     }
   2333     else if (m0 | m1)
   2334     {
   2335 	__m64 dest0, dest1;
   2336 
   2337 	dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
   2338 	dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
   2339 
   2340 	*(__m64 *)dst = pack8888 (dest0, dest1);
   2341     }
   2342     else
   2343     {
   2344 	*(uint64_t *)dst = 0;
   2345     }
   2346 
   2347     mask += 2;
   2348     dst += 2;
   2349     w -= 2;
   2350 }
   2351 
   2352 CHECKPOINT ();
   2353 
   2354 if (w)
   2355 {
   2356     uint64_t m = *mask;
   2357 
   2358     if (m)
   2359     {
   2360 	__m64 vdest = load8888 (dst);
   2361 
   2362 	vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
   2363 	store8888 (dst, vdest);
   2364     }
   2365     else
   2366     {
   2367 	*dst = 0;
   2368     }
   2369 }
   2370    }
   2371 
   2372    _mm_empty ();
   2373 }
   2374 
   2375 static void
   2376 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
   2377                             pixman_composite_info_t *info)
   2378 {
   2379    PIXMAN_COMPOSITE_ARGS (info);
   2380    uint32_t src, srca;
   2381    uint16_t *dst_line, *dst;
   2382    uint8_t *mask_line, *mask;
   2383    int dst_stride, mask_stride;
   2384    int32_t w;
   2385    __m64 vsrc, vsrca, tmp;
   2386    __m64 srcsrcsrcsrc;
   2387 
   2388    CHECKPOINT ();
   2389 
   2390    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2391 
   2392    srca = src >> 24;
   2393    if (src == 0)
   2394 return;
   2395 
   2396    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2397    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2398 
   2399    vsrc = load8888 (&src);
   2400    vsrca = expand_alpha (vsrc);
   2401 
   2402    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
   2403    srcsrcsrcsrc = expand_alpha_rev (tmp);
   2404 
   2405    while (height--)
   2406    {
   2407 dst = dst_line;
   2408 dst_line += dst_stride;
   2409 mask = mask_line;
   2410 mask_line += mask_stride;
   2411 w = width;
   2412 
   2413 CHECKPOINT ();
   2414 
   2415 while (w && (uintptr_t)dst & 7)
   2416 {
   2417     uint64_t m = *mask;
   2418 
   2419     if (m)
   2420     {
   2421 	uint64_t d = *dst;
   2422 	__m64 vd = to_m64 (d);
   2423 	__m64 vdest = in_over (
   2424 	    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
   2425 
   2426 	vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
   2427 	*dst = to_uint64 (vd);
   2428     }
   2429 
   2430     w--;
   2431     mask++;
   2432     dst++;
   2433 }
   2434 
   2435 CHECKPOINT ();
   2436 
   2437 while (w >= 4)
   2438 {
   2439     uint64_t m0, m1, m2, m3;
   2440     m0 = *mask;
   2441     m1 = *(mask + 1);
   2442     m2 = *(mask + 2);
   2443     m3 = *(mask + 3);
   2444 
   2445     if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
   2446     {
   2447 	*(__m64 *)dst = srcsrcsrcsrc;
   2448     }
   2449     else if (m0 | m1 | m2 | m3)
   2450     {
   2451 	__m64 vdest = *(__m64 *)dst;
   2452 	__m64 v0, v1, v2, v3;
   2453 	__m64 vm0, vm1, vm2, vm3;
   2454 
   2455 	expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   2456 
   2457 	vm0 = to_m64 (m0);
   2458 	v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
   2459 
   2460 	vm1 = to_m64 (m1);
   2461 	v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
   2462 
   2463 	vm2 = to_m64 (m2);
   2464 	v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
   2465 
   2466 	vm3 = to_m64 (m3);
   2467 	v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
   2468 
   2469 	*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
   2470     }
   2471 
   2472     w -= 4;
   2473     mask += 4;
   2474     dst += 4;
   2475 }
   2476 
   2477 CHECKPOINT ();
   2478 
   2479 while (w)
   2480 {
   2481     uint64_t m = *mask;
   2482 
   2483     if (m)
   2484     {
   2485 	uint64_t d = *dst;
   2486 	__m64 vd = to_m64 (d);
   2487 	__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
   2488 			       expand565 (vd, 0));
   2489 	vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
   2490 	*dst = to_uint64 (vd);
   2491     }
   2492 
   2493     w--;
   2494     mask++;
   2495     dst++;
   2496 }
   2497    }
   2498 
   2499    _mm_empty ();
   2500 }
   2501 
   2502 static void
   2503 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
   2504                                pixman_composite_info_t *info)
   2505 {
   2506    PIXMAN_COMPOSITE_ARGS (info);
   2507    uint16_t    *dst_line, *dst;
   2508    uint32_t    *src_line, *src;
   2509    int dst_stride, src_stride;
   2510    int32_t w;
   2511 
   2512    CHECKPOINT ();
   2513 
   2514    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2515    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2516 
   2517 #if 0
   2518    /* FIXME */
   2519    assert (src_image->drawable == mask_image->drawable);
   2520 #endif
   2521 
   2522    while (height--)
   2523    {
   2524 dst = dst_line;
   2525 dst_line += dst_stride;
   2526 src = src_line;
   2527 src_line += src_stride;
   2528 w = width;
   2529 
   2530 CHECKPOINT ();
   2531 
   2532 while (w && (uintptr_t)dst & 7)
   2533 {
   2534     __m64 vsrc = load8888 (src);
   2535     uint64_t d = *dst;
   2536     __m64 vdest = expand565 (to_m64 (d), 0);
   2537 
   2538     vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
   2539 
   2540     *dst = to_uint64 (vdest);
   2541 
   2542     w--;
   2543     dst++;
   2544     src++;
   2545 }
   2546 
   2547 CHECKPOINT ();
   2548 
   2549 while (w >= 4)
   2550 {
   2551     uint32_t s0, s1, s2, s3;
   2552     unsigned char a0, a1, a2, a3;
   2553 
   2554     s0 = *src;
   2555     s1 = *(src + 1);
   2556     s2 = *(src + 2);
   2557     s3 = *(src + 3);
   2558 
   2559     a0 = (s0 >> 24);
   2560     a1 = (s1 >> 24);
   2561     a2 = (s2 >> 24);
   2562     a3 = (s3 >> 24);
   2563 
   2564     if ((a0 & a1 & a2 & a3) == 0xFF)
   2565     {
   2566 	__m64 v0 = invert_colors (load8888 (&s0));
   2567 	__m64 v1 = invert_colors (load8888 (&s1));
   2568 	__m64 v2 = invert_colors (load8888 (&s2));
   2569 	__m64 v3 = invert_colors (load8888 (&s3));
   2570 
   2571 	*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   2572     }
   2573     else if (s0 | s1 | s2 | s3)
   2574     {
   2575 	__m64 vdest = *(__m64 *)dst;
   2576 	__m64 v0, v1, v2, v3;
   2577 
   2578 	__m64 vsrc0 = load8888 (&s0);
   2579 	__m64 vsrc1 = load8888 (&s1);
   2580 	__m64 vsrc2 = load8888 (&s2);
   2581 	__m64 vsrc3 = load8888 (&s3);
   2582 
   2583 	expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   2584 
   2585 	v0 = over_rev_non_pre (vsrc0, v0);
   2586 	v1 = over_rev_non_pre (vsrc1, v1);
   2587 	v2 = over_rev_non_pre (vsrc2, v2);
   2588 	v3 = over_rev_non_pre (vsrc3, v3);
   2589 
   2590 	*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
   2591     }
   2592 
   2593     w -= 4;
   2594     dst += 4;
   2595     src += 4;
   2596 }
   2597 
   2598 CHECKPOINT ();
   2599 
   2600 while (w)
   2601 {
   2602     __m64 vsrc = load8888 (src);
   2603     uint64_t d = *dst;
   2604     __m64 vdest = expand565 (to_m64 (d), 0);
   2605 
   2606     vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
   2607 
   2608     *dst = to_uint64 (vdest);
   2609 
   2610     w--;
   2611     dst++;
   2612     src++;
   2613 }
   2614    }
   2615 
   2616    _mm_empty ();
   2617 }
   2618 
   2619 static void
   2620 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
   2621                                pixman_composite_info_t *info)
   2622 {
   2623    PIXMAN_COMPOSITE_ARGS (info);
   2624    uint32_t    *dst_line, *dst;
   2625    uint32_t    *src_line, *src;
   2626    int dst_stride, src_stride;
   2627    int32_t w;
   2628 
   2629    CHECKPOINT ();
   2630 
   2631    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2632    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2633 
   2634 #if 0
   2635    /* FIXME */
   2636    assert (src_image->drawable == mask_image->drawable);
   2637 #endif
   2638 
   2639    while (height--)
   2640    {
   2641 dst = dst_line;
   2642 dst_line += dst_stride;
   2643 src = src_line;
   2644 src_line += src_stride;
   2645 w = width;
   2646 
   2647 while (w && (uintptr_t)dst & 7)
   2648 {
   2649     __m64 s = load8888 (src);
   2650     __m64 d = load8888 (dst);
   2651 
   2652     store8888 (dst, over_rev_non_pre (s, d));
   2653 
   2654     w--;
   2655     dst++;
   2656     src++;
   2657 }
   2658 
   2659 while (w >= 2)
   2660 {
   2661     uint32_t s0, s1;
   2662     unsigned char a0, a1;
   2663     __m64 d0, d1;
   2664 
   2665     s0 = *src;
   2666     s1 = *(src + 1);
   2667 
   2668     a0 = (s0 >> 24);
   2669     a1 = (s1 >> 24);
   2670 
   2671     if ((a0 & a1) == 0xFF)
   2672     {
   2673 	d0 = invert_colors (load8888 (&s0));
   2674 	d1 = invert_colors (load8888 (&s1));
   2675 
   2676 	*(__m64 *)dst = pack8888 (d0, d1);
   2677     }
   2678     else if (s0 | s1)
   2679     {
   2680 	__m64 vdest = *(__m64 *)dst;
   2681 
   2682 	d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
   2683 	d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
   2684 
   2685 	*(__m64 *)dst = pack8888 (d0, d1);
   2686     }
   2687 
   2688     w -= 2;
   2689     dst += 2;
   2690     src += 2;
   2691 }
   2692 
   2693 if (w)
   2694 {
   2695     __m64 s = load8888 (src);
   2696     __m64 d = load8888 (dst);
   2697 
   2698     store8888 (dst, over_rev_non_pre (s, d));
   2699 }
   2700    }
   2701 
   2702    _mm_empty ();
   2703 }
   2704 
   2705 static void
   2706 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
   2707                                   pixman_composite_info_t *info)
   2708 {
   2709    PIXMAN_COMPOSITE_ARGS (info);
   2710    uint32_t src;
   2711    uint16_t    *dst_line;
   2712    uint32_t    *mask_line;
   2713    int dst_stride, mask_stride;
   2714    __m64 vsrc, vsrca;
   2715 
   2716    CHECKPOINT ();
   2717 
   2718    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2719 
   2720    if (src == 0)
   2721 return;
   2722 
   2723    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2724    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   2725 
   2726    vsrc = load8888 (&src);
   2727    vsrca = expand_alpha (vsrc);
   2728 
   2729    while (height--)
   2730    {
   2731 int twidth = width;
   2732 uint32_t *p = (uint32_t *)mask_line;
   2733 uint16_t *q = (uint16_t *)dst_line;
   2734 
   2735 while (twidth && ((uintptr_t)q & 7))
   2736 {
   2737     uint32_t m = *(uint32_t *)p;
   2738 
   2739     if (m)
   2740     {
   2741 	uint64_t d = *q;
   2742 	__m64 vdest = expand565 (to_m64 (d), 0);
   2743 	vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
   2744 	*q = to_uint64 (vdest);
   2745     }
   2746 
   2747     twidth--;
   2748     p++;
   2749     q++;
   2750 }
   2751 
   2752 while (twidth >= 4)
   2753 {
   2754     uint32_t m0, m1, m2, m3;
   2755 
   2756     m0 = *p;
   2757     m1 = *(p + 1);
   2758     m2 = *(p + 2);
   2759     m3 = *(p + 3);
   2760 
   2761     if ((m0 | m1 | m2 | m3))
   2762     {
   2763 	__m64 vdest = *(__m64 *)q;
   2764 	__m64 v0, v1, v2, v3;
   2765 
   2766 	expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
   2767 
   2768 	v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
   2769 	v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
   2770 	v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
   2771 	v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
   2772 
   2773 	*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
   2774     }
   2775     twidth -= 4;
   2776     p += 4;
   2777     q += 4;
   2778 }
   2779 
   2780 while (twidth)
   2781 {
   2782     uint32_t m;
   2783 
   2784     m = *(uint32_t *)p;
   2785     if (m)
   2786     {
   2787 	uint64_t d = *q;
   2788 	__m64 vdest = expand565 (to_m64 (d), 0);
   2789 	vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
   2790 	*q = to_uint64 (vdest);
   2791     }
   2792 
   2793     twidth--;
   2794     p++;
   2795     q++;
   2796 }
   2797 
   2798 mask_line += mask_stride;
   2799 dst_line += dst_stride;
   2800    }
   2801 
   2802    _mm_empty ();
   2803 }
   2804 
   2805 static void
   2806 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
   2807                        pixman_composite_info_t *info)
   2808 {
   2809    PIXMAN_COMPOSITE_ARGS (info);
   2810    uint8_t *dst_line, *dst;
   2811    uint8_t *mask_line, *mask;
   2812    int dst_stride, mask_stride;
   2813    int32_t w;
   2814    uint32_t src;
   2815    uint8_t sa;
   2816    __m64 vsrc, vsrca;
   2817 
   2818    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   2819    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2820 
   2821    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2822 
   2823    sa = src >> 24;
   2824 
   2825    vsrc = load8888 (&src);
   2826    vsrca = expand_alpha (vsrc);
   2827 
   2828    while (height--)
   2829    {
   2830 dst = dst_line;
   2831 dst_line += dst_stride;
   2832 mask = mask_line;
   2833 mask_line += mask_stride;
   2834 w = width;
   2835 
   2836 while (w && (uintptr_t)dst & 7)
   2837 {
   2838     uint16_t tmp;
   2839     uint8_t a;
   2840     uint32_t m, d;
   2841 
   2842     a = *mask++;
   2843     d = *dst;
   2844 
   2845     m = MUL_UN8 (sa, a, tmp);
   2846     d = MUL_UN8 (m, d, tmp);
   2847 
   2848     *dst++ = d;
   2849     w--;
   2850 }
   2851 
   2852 while (w >= 4)
   2853 {
   2854     __m64 vmask;
   2855     __m64 vdest;
   2856 
   2857     vmask = load8888u ((uint32_t *)mask);
   2858     vdest = load8888 ((uint32_t *)dst);
   2859 
   2860     store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
   2861 
   2862     dst += 4;
   2863     mask += 4;
   2864     w -= 4;
   2865 }
   2866 
   2867 while (w--)
   2868 {
   2869     uint16_t tmp;
   2870     uint8_t a;
   2871     uint32_t m, d;
   2872 
   2873     a = *mask++;
   2874     d = *dst;
   2875 
   2876     m = MUL_UN8 (sa, a, tmp);
   2877     d = MUL_UN8 (m, d, tmp);
   2878 
   2879     *dst++ = d;
   2880 }
   2881    }
   2882 
   2883    _mm_empty ();
   2884 }
   2885 
   2886 static void
   2887 mmx_composite_in_8_8 (pixman_implementation_t *imp,
   2888                      pixman_composite_info_t *info)
   2889 {
   2890    PIXMAN_COMPOSITE_ARGS (info);
   2891    uint8_t     *dst_line, *dst;
   2892    uint8_t     *src_line, *src;
   2893    int src_stride, dst_stride;
   2894    int32_t w;
   2895 
   2896    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   2897    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   2898 
   2899    while (height--)
   2900    {
   2901 dst = dst_line;
   2902 dst_line += dst_stride;
   2903 src = src_line;
   2904 src_line += src_stride;
   2905 w = width;
   2906 
   2907 while (w && (uintptr_t)dst & 3)
   2908 {
   2909     uint8_t s, d;
   2910     uint16_t tmp;
   2911 
   2912     s = *src;
   2913     d = *dst;
   2914 
   2915     *dst = MUL_UN8 (s, d, tmp);
   2916 
   2917     src++;
   2918     dst++;
   2919     w--;
   2920 }
   2921 
   2922 while (w >= 4)
   2923 {
   2924     uint32_t *s = (uint32_t *)src;
   2925     uint32_t *d = (uint32_t *)dst;
   2926 
   2927     store8888 (d, in (load8888u (s), load8888 (d)));
   2928 
   2929     w -= 4;
   2930     dst += 4;
   2931     src += 4;
   2932 }
   2933 
   2934 while (w--)
   2935 {
   2936     uint8_t s, d;
   2937     uint16_t tmp;
   2938 
   2939     s = *src;
   2940     d = *dst;
   2941 
   2942     *dst = MUL_UN8 (s, d, tmp);
   2943 
   2944     src++;
   2945     dst++;
   2946 }
   2947    }
   2948 
   2949    _mm_empty ();
   2950 }
   2951 
   2952 static void
   2953 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
   2954 		 pixman_composite_info_t *info)
   2955 {
   2956    PIXMAN_COMPOSITE_ARGS (info);
   2957    uint8_t     *dst_line, *dst;
   2958    uint8_t     *mask_line, *mask;
   2959    int dst_stride, mask_stride;
   2960    int32_t w;
   2961    uint32_t src;
   2962    uint8_t sa;
   2963    __m64 vsrc, vsrca;
   2964 
   2965    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   2966    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   2967 
   2968    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2969 
   2970    sa = src >> 24;
   2971 
   2972    if (src == 0)
   2973 return;
   2974 
   2975    vsrc = load8888 (&src);
   2976    vsrca = expand_alpha (vsrc);
   2977 
   2978    while (height--)
   2979    {
   2980 dst = dst_line;
   2981 dst_line += dst_stride;
   2982 mask = mask_line;
   2983 mask_line += mask_stride;
   2984 w = width;
   2985 
   2986 while (w && (uintptr_t)dst & 3)
   2987 {
   2988     uint16_t tmp;
   2989     uint16_t a;
   2990     uint32_t m, d;
   2991     uint32_t r;
   2992 
   2993     a = *mask++;
   2994     d = *dst;
   2995 
   2996     m = MUL_UN8 (sa, a, tmp);
   2997     r = ADD_UN8 (m, d, tmp);
   2998 
   2999     *dst++ = r;
   3000     w--;
   3001 }
   3002 
   3003 while (w >= 4)
   3004 {
   3005     __m64 vmask;
   3006     __m64 vdest;
   3007 
   3008     vmask = load8888u ((uint32_t *)mask);
   3009     vdest = load8888 ((uint32_t *)dst);
   3010 
   3011     store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
   3012 
   3013     dst += 4;
   3014     mask += 4;
   3015     w -= 4;
   3016 }
   3017 
   3018 while (w--)
   3019 {
   3020     uint16_t tmp;
   3021     uint16_t a;
   3022     uint32_t m, d;
   3023     uint32_t r;
   3024 
   3025     a = *mask++;
   3026     d = *dst;
   3027 
   3028     m = MUL_UN8 (sa, a, tmp);
   3029     r = ADD_UN8 (m, d, tmp);
   3030 
   3031     *dst++ = r;
   3032 }
   3033    }
   3034 
   3035    _mm_empty ();
   3036 }
   3037 
   3038 static void
   3039 mmx_composite_add_8_8 (pixman_implementation_t *imp,
   3040 	       pixman_composite_info_t *info)
   3041 {
   3042    PIXMAN_COMPOSITE_ARGS (info);
   3043    uint8_t *dst_line, *dst;
   3044    uint8_t *src_line, *src;
   3045    int dst_stride, src_stride;
   3046    int32_t w;
   3047    uint8_t s, d;
   3048    uint16_t t;
   3049 
   3050    CHECKPOINT ();
   3051 
   3052    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   3053    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   3054 
   3055    while (height--)
   3056    {
   3057 dst = dst_line;
   3058 dst_line += dst_stride;
   3059 src = src_line;
   3060 src_line += src_stride;
   3061 w = width;
   3062 
   3063 while (w && (uintptr_t)dst & 7)
   3064 {
   3065     s = *src;
   3066     d = *dst;
   3067     t = d + s;
   3068     s = t | (0 - (t >> 8));
   3069     *dst = s;
   3070 
   3071     dst++;
   3072     src++;
   3073     w--;
   3074 }
   3075 
   3076 while (w >= 8)
   3077 {
   3078     *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
   3079     dst += 8;
   3080     src += 8;
   3081     w -= 8;
   3082 }
   3083 
   3084 while (w)
   3085 {
   3086     s = *src;
   3087     d = *dst;
   3088     t = d + s;
   3089     s = t | (0 - (t >> 8));
   3090     *dst = s;
   3091 
   3092     dst++;
   3093     src++;
   3094     w--;
   3095 }
   3096    }
   3097 
   3098    _mm_empty ();
   3099 }
   3100 
   3101 static void
   3102 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
   3103                             pixman_composite_info_t *info)
   3104 {
   3105    PIXMAN_COMPOSITE_ARGS (info);
   3106    uint16_t    *dst_line, *dst;
   3107    uint32_t	d;
   3108    uint16_t    *src_line, *src;
   3109    uint32_t	s;
   3110    int dst_stride, src_stride;
   3111    int32_t w;
   3112 
   3113    CHECKPOINT ();
   3114 
   3115    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
   3116    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   3117 
   3118    while (height--)
   3119    {
   3120 dst = dst_line;
   3121 dst_line += dst_stride;
   3122 src = src_line;
   3123 src_line += src_stride;
   3124 w = width;
   3125 
   3126 while (w && (uintptr_t)dst & 7)
   3127 {
   3128     s = *src++;
   3129     if (s)
   3130     {
   3131 	d = *dst;
   3132 	s = convert_0565_to_8888 (s);
   3133 	if (d)
   3134 	{
   3135 	    d = convert_0565_to_8888 (d);
   3136 	    UN8x4_ADD_UN8x4 (s, d);
   3137 	}
   3138 	*dst = convert_8888_to_0565 (s);
   3139     }
   3140     dst++;
   3141     w--;
   3142 }
   3143 
   3144 while (w >= 4)
   3145 {
   3146     __m64 vdest = *(__m64 *)dst;
   3147     __m64 vsrc = ldq_u ((__m64 *)src);
   3148     __m64 vd0, vd1;
   3149     __m64 vs0, vs1;
   3150 
   3151     expand_4xpacked565 (vdest, &vd0, &vd1, 0);
   3152     expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
   3153 
   3154     vd0 = _mm_adds_pu8 (vd0, vs0);
   3155     vd1 = _mm_adds_pu8 (vd1, vs1);
   3156 
   3157     *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
   3158 
   3159     dst += 4;
   3160     src += 4;
   3161     w -= 4;
   3162 }
   3163 
   3164 while (w--)
   3165 {
   3166     s = *src++;
   3167     if (s)
   3168     {
   3169 	d = *dst;
   3170 	s = convert_0565_to_8888 (s);
   3171 	if (d)
   3172 	{
   3173 	    d = convert_0565_to_8888 (d);
   3174 	    UN8x4_ADD_UN8x4 (s, d);
   3175 	}
   3176 	*dst = convert_8888_to_0565 (s);
   3177     }
   3178     dst++;
   3179 }
   3180    }
   3181 
   3182    _mm_empty ();
   3183 }
   3184 
   3185 static void
   3186 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
   3187                             pixman_composite_info_t *info)
   3188 {
   3189    PIXMAN_COMPOSITE_ARGS (info);
   3190    uint32_t    *dst_line, *dst;
   3191    uint32_t    *src_line, *src;
   3192    int dst_stride, src_stride;
   3193    int32_t w;
   3194 
   3195    CHECKPOINT ();
   3196 
   3197    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3198    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3199 
   3200    while (height--)
   3201    {
   3202 dst = dst_line;
   3203 dst_line += dst_stride;
   3204 src = src_line;
   3205 src_line += src_stride;
   3206 w = width;
   3207 
   3208 while (w && (uintptr_t)dst & 7)
   3209 {
   3210     store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
   3211                               load ((const uint32_t *)dst)));
   3212     dst++;
   3213     src++;
   3214     w--;
   3215 }
   3216 
   3217 while (w >= 2)
   3218 {
   3219     *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
   3220     dst += 2;
   3221     src += 2;
   3222     w -= 2;
   3223 }
   3224 
   3225 if (w)
   3226 {
   3227     store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
   3228                               load ((const uint32_t *)dst)));
   3229 
   3230 }
   3231    }
   3232 
   3233    _mm_empty ();
   3234 }
   3235 
   3236 static pixman_bool_t
   3237 mmx_blt (pixman_implementation_t *imp,
   3238         uint32_t *               src_bits,
   3239         uint32_t *               dst_bits,
   3240         int                      src_stride,
   3241         int                      dst_stride,
   3242         int                      src_bpp,
   3243         int                      dst_bpp,
   3244         int                      src_x,
   3245         int                      src_y,
   3246         int                      dest_x,
   3247         int                      dest_y,
   3248         int                      width,
   3249         int                      height)
   3250 {
   3251    uint8_t *   src_bytes;
   3252    uint8_t *   dst_bytes;
   3253    int byte_width;
   3254 
   3255    if (src_bpp != dst_bpp)
   3256 return FALSE;
   3257 
   3258    if (src_bpp == 16)
   3259    {
   3260 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
   3261 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
   3262 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
   3263 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
   3264 byte_width = 2 * width;
   3265 src_stride *= 2;
   3266 dst_stride *= 2;
   3267    }
   3268    else if (src_bpp == 32)
   3269    {
   3270 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
   3271 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
   3272 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
   3273 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
   3274 byte_width = 4 * width;
   3275 src_stride *= 4;
   3276 dst_stride *= 4;
   3277    }
   3278    else
   3279    {
   3280 return FALSE;
   3281    }
   3282 
   3283    while (height--)
   3284    {
   3285 int w;
   3286 uint8_t *s = src_bytes;
   3287 uint8_t *d = dst_bytes;
   3288 src_bytes += src_stride;
   3289 dst_bytes += dst_stride;
   3290 w = byte_width;
   3291 
   3292 if (w >= 1 && ((uintptr_t)d & 1))
   3293 {
   3294     *(uint8_t *)d = *(uint8_t *)s;
   3295     w -= 1;
   3296     s += 1;
   3297     d += 1;
   3298 }
   3299 
   3300 if (w >= 2 && ((uintptr_t)d & 3))
   3301 {
   3302     *(uint16_t *)d = *(uint16_t *)s;
   3303     w -= 2;
   3304     s += 2;
   3305     d += 2;
   3306 }
   3307 
   3308 while (w >= 4 && ((uintptr_t)d & 7))
   3309 {
   3310     *(uint32_t *)d = ldl_u ((uint32_t *)s);
   3311 
   3312     w -= 4;
   3313     s += 4;
   3314     d += 4;
   3315 }
   3316 
   3317 while (w >= 64)
   3318 {
   3319 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
   3320     __asm__ (
   3321         "movq	  (%1),	  %%mm0\n"
   3322         "movq	 8(%1),	  %%mm1\n"
   3323         "movq	16(%1),	  %%mm2\n"
   3324         "movq	24(%1),	  %%mm3\n"
   3325         "movq	32(%1),	  %%mm4\n"
   3326         "movq	40(%1),	  %%mm5\n"
   3327         "movq	48(%1),	  %%mm6\n"
   3328         "movq	56(%1),	  %%mm7\n"
   3329 
   3330         "movq	%%mm0,	  (%0)\n"
   3331         "movq	%%mm1,	 8(%0)\n"
   3332         "movq	%%mm2,	16(%0)\n"
   3333         "movq	%%mm3,	24(%0)\n"
   3334         "movq	%%mm4,	32(%0)\n"
   3335         "movq	%%mm5,	40(%0)\n"
   3336         "movq	%%mm6,	48(%0)\n"
   3337         "movq	%%mm7,	56(%0)\n"
   3338 	:
   3339 	: "r" (d), "r" (s)
   3340 	: "memory",
   3341 	  "%mm0", "%mm1", "%mm2", "%mm3",
   3342 	  "%mm4", "%mm5", "%mm6", "%mm7");
   3343 #else
   3344     __m64 v0 = ldq_u ((__m64 *)(s + 0));
   3345     __m64 v1 = ldq_u ((__m64 *)(s + 8));
   3346     __m64 v2 = ldq_u ((__m64 *)(s + 16));
   3347     __m64 v3 = ldq_u ((__m64 *)(s + 24));
   3348     __m64 v4 = ldq_u ((__m64 *)(s + 32));
   3349     __m64 v5 = ldq_u ((__m64 *)(s + 40));
   3350     __m64 v6 = ldq_u ((__m64 *)(s + 48));
   3351     __m64 v7 = ldq_u ((__m64 *)(s + 56));
   3352     *(__m64 *)(d + 0)  = v0;
   3353     *(__m64 *)(d + 8)  = v1;
   3354     *(__m64 *)(d + 16) = v2;
   3355     *(__m64 *)(d + 24) = v3;
   3356     *(__m64 *)(d + 32) = v4;
   3357     *(__m64 *)(d + 40) = v5;
   3358     *(__m64 *)(d + 48) = v6;
   3359     *(__m64 *)(d + 56) = v7;
   3360 #endif
   3361 
   3362     w -= 64;
   3363     s += 64;
   3364     d += 64;
   3365 }
   3366 while (w >= 4)
   3367 {
   3368     *(uint32_t *)d = ldl_u ((uint32_t *)s);
   3369 
   3370     w -= 4;
   3371     s += 4;
   3372     d += 4;
   3373 }
   3374 if (w >= 2)
   3375 {
   3376     *(uint16_t *)d = *(uint16_t *)s;
   3377     w -= 2;
   3378     s += 2;
   3379     d += 2;
   3380 }
   3381    }
   3382 
   3383    _mm_empty ();
   3384 
   3385    return TRUE;
   3386 }
   3387 
   3388 static void
   3389 mmx_composite_copy_area (pixman_implementation_t *imp,
   3390                         pixman_composite_info_t *info)
   3391 {
   3392    PIXMAN_COMPOSITE_ARGS (info);
   3393 
   3394    mmx_blt (imp, src_image->bits.bits,
   3395      dest_image->bits.bits,
   3396      src_image->bits.rowstride,
   3397      dest_image->bits.rowstride,
   3398      PIXMAN_FORMAT_BPP (src_image->bits.format),
   3399      PIXMAN_FORMAT_BPP (dest_image->bits.format),
   3400      src_x, src_y, dest_x, dest_y, width, height);
   3401 }
   3402 
   3403 static void
   3404 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
   3405                                pixman_composite_info_t *info)
   3406 {
   3407    PIXMAN_COMPOSITE_ARGS (info);
   3408    uint32_t  *src, *src_line;
   3409    uint32_t  *dst, *dst_line;
   3410    uint8_t  *mask, *mask_line;
   3411    int src_stride, mask_stride, dst_stride;
   3412    int32_t w;
   3413 
   3414    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3415    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   3416    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3417 
   3418    while (height--)
   3419    {
   3420 src = src_line;
   3421 src_line += src_stride;
   3422 dst = dst_line;
   3423 dst_line += dst_stride;
   3424 mask = mask_line;
   3425 mask_line += mask_stride;
   3426 
   3427 w = width;
   3428 
   3429 while (w--)
   3430 {
   3431     uint64_t m = *mask;
   3432 
   3433     if (m)
   3434     {
   3435 	uint32_t ssrc = *src | 0xff000000;
   3436 	__m64 s = load8888 (&ssrc);
   3437 
   3438 	if (m == 0xff)
   3439 	{
   3440 	    store8888 (dst, s);
   3441 	}
   3442 	else
   3443 	{
   3444 	    __m64 sa = expand_alpha (s);
   3445 	    __m64 vm = expand_alpha_rev (to_m64 (m));
   3446 	    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
   3447 
   3448 	    store8888 (dst, vdest);
   3449 	}
   3450     }
   3451 
   3452     mask++;
   3453     dst++;
   3454     src++;
   3455 }
   3456    }
   3457 
   3458    _mm_empty ();
   3459 }
   3460 
   3461 static void
   3462 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
   3463                                   pixman_composite_info_t *info)
   3464 {
   3465    PIXMAN_COMPOSITE_ARGS (info);
   3466    uint32_t src;
   3467    uint32_t    *dst_line, *dst;
   3468    int32_t w;
   3469    int dst_stride;
   3470    __m64 vsrc;
   3471 
   3472    CHECKPOINT ();
   3473 
   3474    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   3475 
   3476    if (src == 0)
   3477 return;
   3478 
   3479    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3480 
   3481    vsrc = load8888 (&src);
   3482 
   3483    while (height--)
   3484    {
   3485 dst = dst_line;
   3486 dst_line += dst_stride;
   3487 w = width;
   3488 
   3489 CHECKPOINT ();
   3490 
   3491 while (w && (uintptr_t)dst & 7)
   3492 {
   3493     __m64 vdest = load8888 (dst);
   3494 
   3495     store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
   3496 
   3497     w--;
   3498     dst++;
   3499 }
   3500 
   3501 while (w >= 2)
   3502 {
   3503     __m64 vdest = *(__m64 *)dst;
   3504     __m64 dest0 = expand8888 (vdest, 0);
   3505     __m64 dest1 = expand8888 (vdest, 1);
   3506 
   3507 
   3508     dest0 = over (dest0, expand_alpha (dest0), vsrc);
   3509     dest1 = over (dest1, expand_alpha (dest1), vsrc);
   3510 
   3511     *(__m64 *)dst = pack8888 (dest0, dest1);
   3512 
   3513     dst += 2;
   3514     w -= 2;
   3515 }
   3516 
   3517 CHECKPOINT ();
   3518 
   3519 if (w)
   3520 {
   3521     __m64 vdest = load8888 (dst);
   3522 
   3523     store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
   3524 }
   3525    }
   3526 
   3527    _mm_empty ();
   3528 }
   3529 
   3530 static force_inline void
   3531 scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t*       pd,
   3532                                            const uint32_t* ps,
   3533                                            int32_t         w,
   3534                                            pixman_fixed_t  vx,
   3535                                            pixman_fixed_t  unit_x,
   3536                                            pixman_fixed_t  src_width_fixed,
   3537                                            pixman_bool_t   fully_transparent_src)
   3538 {
   3539    if (fully_transparent_src)
   3540 return;
   3541 
   3542    while (w)
   3543    {
   3544 __m64 d = load (pd);
   3545 __m64 s = load (ps + pixman_fixed_to_int (vx));
   3546 vx += unit_x;
   3547 while (vx >= 0)
   3548     vx -= src_width_fixed;
   3549 
   3550 store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
   3551 pd++;
   3552 
   3553 w--;
   3554    }
   3555 
   3556    _mm_empty ();
   3557 }
   3558 
/*
 * Instantiate FAST_NEAREST_MAINLOOP four times around
 * scaled_nearest_scanline_mmx_8888_8888_OVER, once for each of the
 * repeat modes COVER, NONE, PAD and NORMAL.  Each invocation supplies a
 * name suffix, the scanline function and the two pixel types (uint32_t
 * for both).  The macro is defined outside this file; presumably these
 * are the functions the SIMPLE_NEAREST_FAST_PATH (..., mmx_8888_8888)
 * entries in mmx_fast_paths refer to -- confirm against the macro
 * definition.
 */
FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, NORMAL)
   3571 
   3572 static force_inline void
   3573 scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
   3574 				      uint32_t *       dst,
   3575 				      const uint32_t * src,
   3576 				      int32_t          w,
   3577 				      pixman_fixed_t   vx,
   3578 				      pixman_fixed_t   unit_x,
   3579 				      pixman_fixed_t   src_width_fixed,
   3580 				      pixman_bool_t    zero_src)
   3581 {
   3582    __m64 mm_mask;
   3583 
   3584    if (zero_src || (*mask >> 24) == 0)
   3585    {
   3586 /* A workaround for https://gcc.gnu.org/PR47759 */
   3587 _mm_empty ();
   3588 return;
   3589    }
   3590 
   3591    mm_mask = expand_alpha (load8888 (mask));
   3592 
   3593    while (w)
   3594    {
   3595 uint32_t s = *(src + pixman_fixed_to_int (vx));
   3596 vx += unit_x;
   3597 while (vx >= 0)
   3598     vx -= src_width_fixed;
   3599 
   3600 if (s)
   3601 {
   3602     __m64 ms = load8888 (&s);
   3603     __m64 alpha = expand_alpha (ms);
   3604     __m64 dest  = load8888 (dst);
   3605 
   3606     store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
   3607 }
   3608 
   3609 dst++;
   3610 w--;
   3611    }
   3612 
   3613    _mm_empty ();
   3614 }
   3615 
/*
 * Instantiate FAST_NEAREST_MAINLOOP_COMMON around
 * scaled_nearest_scanline_mmx_8888_n_8888_OVER for the repeat modes
 * COVER, PAD, NONE and NORMAL.  The three uint32_t arguments are the
 * pixel types handed to the macro.
 * NOTE(review): the two trailing TRUE arguments presumably tell the
 * macro that a mask is present and that it is solid -- confirm against
 * the macro definition, which is not part of this file.
 */
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
   3628 
/*
 * BSHIFT is 2^BILINEAR_INTERPOLATION_BITS, the value the two horizontal
 * weights built below add up to; BMSK is the matching bit mask
 * (BSHIFT - 1).
 */
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)

/*
 * Declares the per-scanline constants used by the two macros below.
 * Requires wt, wb, vx and unit_x to be in scope.
 *
 *   mm_wt, mm_wb  wt and wb replicated into all four 16-bit lanes
 *   mm_addc7,
 *   mm_xorc7      constants that turn a fraction f into the lane pattern
 *                 (BSHIFT - f, f, BSHIFT - f, f), lowest lane first:
 *                 1 + (BMSK ^ f) in the even lanes, 0 + (0 ^ f) in the
 *                 odd lanes
 *   mm_ux         unit_x truncated to 16 bits, in all four lanes
 *   mm_zero       all-zero value used for byte -> 16-bit unpacking
 *   mm_x          vx truncated to 16 bits, in all four lanes (presumably
 *                 the fractional part of a 16.16 coordinate -- confirm
 *                 against pixman_fixed_t); updated per pixel
 */
#define BILINEAR_DECLARE_VARIABLES						\
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
    const __m64 mm_zero = _mm_setzero_si64 ();					\
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

/*
 * Computes one bilinearly filtered pixel and assigns it to PIX.
 * Requires src_top, src_bottom, vx, unit_x and everything declared by
 * BILINEAR_DECLARE_VARIABLES to be in scope.
 *
 * Two horizontally adjacent pixels (one unaligned 64-bit load) are read
 * from each row at index pixman_fixed_to_int (vx).  The rows are mixed
 * with the weights wt and wb, then the left and right columns are mixed
 * with the weights (BSHIFT - f) and f, where f is taken from the top
 * BILINEAR_INTERPOLATION_BITS bits of the 16-bit lanes of mm_x.  The
 * 32-bit sums are shifted right by 2 * BILINEAR_INTERPOLATION_BITS and
 * packed with saturation down to bytes; because the final pack uses the
 * same operand twice, the pixel ends up in both 32-bit halves of PIX.
 *
 * Side effect: vx and mm_x are advanced by one step (unit_x / mm_ux).
 */
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
do {										\
    /* fetch 2x2 pixel block into 2 mmx registers */				\
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
    /* vertical interpolation */						\
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
    /* calculate horizontal weights */						\
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
			  _mm_srli_pi16 (mm_x,					\
					 16 - BILINEAR_INTERPOLATION_BITS)));	\
    /* horizontal interpolation */						\
    __m64 p = _mm_unpacklo_pi16 (lo, hi);					\
    __m64 q = _mm_unpackhi_pi16 (lo, hi);					\
    vx += unit_x;								\
    lo = _mm_madd_pi16 (p, mm_wh);						\
    hi = _mm_madd_pi16 (q, mm_wh);						\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
    /* shift and pack the result */						\
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_packs_pi32 (lo, hi);						\
    lo = _mm_packs_pu16 (lo, lo);						\
    pix = lo;									\
} while (0)

/*
 * Advances vx and mm_x by one step exactly as
 * BILINEAR_INTERPOLATE_ONE_PIXEL does, without fetching or computing a
 * pixel.
 */
#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x;								\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
} while(0)
   3677 
   3678 static force_inline void
   3679 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
   3680 				    const uint32_t * mask,
   3681 				    const uint32_t * src_top,
   3682 				    const uint32_t * src_bottom,
   3683 				    int32_t          w,
   3684 				    int              wt,
   3685 				    int              wb,
   3686 				    pixman_fixed_t   vx,
   3687 				    pixman_fixed_t   unit_x,
   3688 				    pixman_fixed_t   max_vx,
   3689 				    pixman_bool_t    zero_src)
   3690 {
   3691    BILINEAR_DECLARE_VARIABLES;
   3692    __m64 pix;
   3693 
   3694    while (w--)
   3695    {
   3696 BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
   3697 store (dst, pix);
   3698 dst++;
   3699    }
   3700 
   3701    _mm_empty ();
   3702 }
   3703 
/*
 * Instantiate FAST_BILINEAR_MAINLOOP_COMMON around
 * scaled_bilinear_scanline_mmx_8888_8888_SRC for the repeat modes
 * COVER, PAD, NONE and NORMAL.  All three pixel types are uint32_t and
 * FLAG_NONE is passed as the flags argument.  The macro is defined
 * outside this file; see its definition for what is generated.
 */
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
   3720 
   3721 static force_inline void
   3722 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
   3723 				     const uint32_t * mask,
   3724 				     const uint32_t * src_top,
   3725 				     const uint32_t * src_bottom,
   3726 				     int32_t          w,
   3727 				     int              wt,
   3728 				     int              wb,
   3729 				     pixman_fixed_t   vx,
   3730 				     pixman_fixed_t   unit_x,
   3731 				     pixman_fixed_t   max_vx,
   3732 				     pixman_bool_t    zero_src)
   3733 {
   3734    BILINEAR_DECLARE_VARIABLES;
   3735    __m64 pix1, pix2;
   3736 
   3737    while (w)
   3738    {
   3739 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   3740 
   3741 if (!is_zero (pix1))
   3742 {
   3743     pix2 = load (dst);
   3744     store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
   3745 }
   3746 
   3747 w--;
   3748 dst++;
   3749    }
   3750 
   3751    _mm_empty ();
   3752 }
   3753 
/*
 * Instantiate FAST_BILINEAR_MAINLOOP_COMMON around
 * scaled_bilinear_scanline_mmx_8888_8888_OVER for the repeat modes
 * COVER, PAD, NONE and NORMAL.  All three pixel types are uint32_t and
 * FLAG_NONE is passed as the flags argument.  The macro is defined
 * outside this file; see its definition for what is generated.
 */
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
   3770 
   3771 static force_inline void
   3772 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
   3773 				       const uint8_t  * mask,
   3774 				       const uint32_t * src_top,
   3775 				       const uint32_t * src_bottom,
   3776 				       int32_t          w,
   3777 				       int              wt,
   3778 				       int              wb,
   3779 				       pixman_fixed_t   vx,
   3780 				       pixman_fixed_t   unit_x,
   3781 				       pixman_fixed_t   max_vx,
   3782 				       pixman_bool_t    zero_src)
   3783 {
   3784    BILINEAR_DECLARE_VARIABLES;
   3785    __m64 pix1, pix2;
   3786    uint32_t m;
   3787 
   3788    while (w)
   3789    {
   3790 m = (uint32_t) *mask++;
   3791 
   3792 if (m)
   3793 {
   3794     BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   3795 
   3796     if (m == 0xff && is_opaque (pix1))
   3797     {
   3798 	store (dst, pix1);
   3799     }
   3800     else
   3801     {
   3802 	__m64 ms, md, ma, msa;
   3803 
   3804 	pix2 = load (dst);
   3805 	ma = expand_alpha_rev (to_m64 (m));
   3806 	ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
   3807 	md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
   3808 
   3809 	msa = expand_alpha (ms);
   3810 
   3811 	store8888 (dst, (in_over (ms, msa, ma, md)));
   3812     }
   3813 }
   3814 else
   3815 {
   3816     BILINEAR_SKIP_ONE_PIXEL ();
   3817 }
   3818 
   3819 w--;
   3820 dst++;
   3821    }
   3822 
   3823    _mm_empty ();
   3824 }
   3825 
/*
 * Instantiate FAST_BILINEAR_MAINLOOP_COMMON around
 * scaled_bilinear_scanline_mmx_8888_8_8888_OVER for the repeat modes
 * COVER, PAD, NONE and NORMAL.  The pixel types are uint32_t, uint8_t
 * and uint32_t, and FLAG_HAVE_NON_SOLID_MASK is passed as the flags
 * argument.  The macro is defined outside this file; see its definition
 * for the order and meaning of the type arguments.
 */
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
   3842 
   3843 static uint32_t *
   3844 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
   3845 {
   3846    int w = iter->width;
   3847    uint32_t *dst = iter->buffer;
   3848    uint32_t *src = (uint32_t *)iter->bits;
   3849 
   3850    iter->bits += iter->stride;
   3851 
   3852    while (w && ((uintptr_t)dst) & 7)
   3853    {
   3854 *dst++ = (*src++) | 0xff000000;
   3855 w--;
   3856    }
   3857 
   3858    while (w >= 8)
   3859    {
   3860 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
   3861 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
   3862 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
   3863 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
   3864 
   3865 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
   3866 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
   3867 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
   3868 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
   3869 
   3870 dst += 8;
   3871 src += 8;
   3872 w -= 8;
   3873    }
   3874 
   3875    while (w)
   3876    {
   3877 *dst++ = (*src++) | 0xff000000;
   3878 w--;
   3879    }
   3880 
   3881    _mm_empty ();
   3882    return iter->buffer;
   3883 }
   3884 
   3885 static uint32_t *
   3886 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
   3887 {
   3888    int w = iter->width;
   3889    uint32_t *dst = iter->buffer;
   3890    uint16_t *src = (uint16_t *)iter->bits;
   3891 
   3892    iter->bits += iter->stride;
   3893 
   3894    while (w && ((uintptr_t)dst) & 0x0f)
   3895    {
   3896 uint16_t s = *src++;
   3897 
   3898 *dst++ = convert_0565_to_8888 (s);
   3899 w--;
   3900    }
   3901 
   3902    while (w >= 4)
   3903    {
   3904 __m64 vsrc = ldq_u ((__m64 *)src);
   3905 __m64 mm0, mm1;
   3906 
   3907 expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
   3908 
   3909 *(__m64 *)(dst + 0) = mm0;
   3910 *(__m64 *)(dst + 2) = mm1;
   3911 
   3912 dst += 4;
   3913 src += 4;
   3914 w -= 4;
   3915    }
   3916 
   3917    while (w)
   3918    {
   3919 uint16_t s = *src++;
   3920 
   3921 *dst++ = convert_0565_to_8888 (s);
   3922 w--;
   3923    }
   3924 
   3925    _mm_empty ();
   3926    return iter->buffer;
   3927 }
   3928 
   3929 static uint32_t *
   3930 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
   3931 {
   3932    int w = iter->width;
   3933    uint32_t *dst = iter->buffer;
   3934    uint8_t *src = iter->bits;
   3935 
   3936    iter->bits += iter->stride;
   3937 
   3938    while (w && (((uintptr_t)dst) & 15))
   3939    {
   3940        *dst++ = (uint32_t)*(src++) << 24;
   3941        w--;
   3942    }
   3943 
   3944    while (w >= 8)
   3945    {
   3946 __m64 mm0 = ldq_u ((__m64 *)src);
   3947 
   3948 __m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
   3949 __m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
   3950 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
   3951 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
   3952 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
   3953 __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
   3954 
   3955 *(__m64 *)(dst + 0) = mm3;
   3956 *(__m64 *)(dst + 2) = mm4;
   3957 *(__m64 *)(dst + 4) = mm5;
   3958 *(__m64 *)(dst + 6) = mm6;
   3959 
   3960 dst += 8;
   3961 src += 8;
   3962 w -= 8;
   3963    }
   3964 
   3965    while (w)
   3966    {
   3967 *dst++ = (uint32_t)*(src++) << 24;
   3968 w--;
   3969    }
   3970 
   3971    _mm_empty ();
   3972    return iter->buffer;
   3973 }
   3974 
/*
 * Image flags shared by every entry of mmx_iters below: the standard
 * fast-path flags plus identity transform, bits image and
 * "samples cover clip (nearest)".
 */
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

/*
 * Iterator table.  Each entry lists a pixel format, IMAGE_FLAGS,
 * ITER_NARROW, _pixman_iter_init_bits_stride, the MMX fetch routine for
 * that format, and NULL as the last member.  The member names and their
 * exact roles come from pixman_iter_info_t, which is declared outside
 * this file -- presumably: format, image flags, iterator flags,
 * initializer, get-scanline and write-back; confirm against the struct.
 * The table ends with a PIXMAN_null entry.
 */
static const pixman_iter_info_t mmx_iters[] = 
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};
   3992 
/*
 * Fast-path table handed to _pixman_implementation_create () when the
 * MMX implementation is created.  Every entry names an operator, a
 * source / mask / destination format combination and the routine in
 * this file that handles it.  Note that several routines are listed for
 * more than one format combination (for example the a8r8g8b8 and
 * a8b8g8r8 variants share one function).  The table ends with a
 * PIXMAN_OP_NONE entry.
 */
static const pixman_fast_path_t mmx_fast_paths[] =
{
    /* OVER, solid source with a8 or component-alpha mask */
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    /* OVER, pixbuf / rpixbuf source and mask */
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    /* OVER, 32-bit source with solid or a8 mask */
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    /* OVER, no mask: solid source, or x888 -> x888 handled as a copy */
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    /* OVER, no mask, source with alpha */
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    /* OVER_REVERSE, solid source */
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    /* ADD */
    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    /* SRC: format conversion, solid with a8 mask, and same-depth copies */
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    /* IN */
    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    /* nearest-neighbour scaling, OVER */
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, x8r8g8b8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, x8b8g8r8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8r8g8b8, a8r8g8b8, mmx_8888_8888                            ),
    SIMPLE_NEAREST_FAST_PATH (OVER,   a8b8g8r8, a8b8g8r8, mmx_8888_8888                            ),

    /* nearest-neighbour scaling with solid mask, OVER */
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888                 ),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888                 ),

    /* bilinear scaling, SRC */
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),

    /* bilinear scaling, OVER */
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),

    /* bilinear scaling with a8 mask, OVER */
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),

    /* end-of-table marker */
    { PIXMAN_OP_NONE },
};
   4098 
   4099 pixman_implementation_t *
   4100 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
   4101 {
   4102    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
   4103 
   4104    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
   4105    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
   4106    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
   4107    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
   4108    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
   4109    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
   4110    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
   4111    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
   4112    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
   4113    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
   4114    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
   4115 
   4116    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
   4117    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
   4118    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
   4119    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
   4120    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
   4121    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
   4122    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
   4123    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
   4124    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
   4125    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
   4126    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
   4127 
   4128    imp->blt = mmx_blt;
   4129    imp->fill = mmx_fill;
   4130 
   4131    imp->iter_info = mmx_iters;
   4132 
   4133    return imp;
   4134 }
   4135 
   4136 #endif /* USE_X86_MMX || USE_LOONGSON_MMI */