tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-sse2.c (162877B)


      1 /*
      2 * Copyright © 2008 Rodrigo Kumpera
      3 * Copyright © 2008 André Tupinambá
      4 *
      5 * Permission to use, copy, modify, distribute, and sell this software and its
      6 * documentation for any purpose is hereby granted without fee, provided that
      7 * the above copyright notice appear in all copies and that both that
      8 * copyright notice and this permission notice appear in supporting
      9 * documentation, and that the name of Red Hat not be used in advertising or
     10 * publicity pertaining to distribution of the software without specific,
     11 * written prior permission.  Red Hat makes no representations about the
     12 * suitability of this software for any purpose.  It is provided "as is"
     13 * without express or implied warranty.
     14 *
     15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     22 * SOFTWARE.
     23 *
     24 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
     25 *          André Tupinambá (andrelrt@gmail.com)
     26 *
     27 * Based on work by Owen Taylor and Søren Sandmann
     28 */
     29 #ifdef HAVE_CONFIG_H
     30 #include <pixman-config.h>
     31 #endif
     32 
     33 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
     34 #define PSHUFD_IS_FAST 0
     35 
     36 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
     37 #include <emmintrin.h> /* for SSE2 intrinsics */
     38 #include "pixman-private.h"
     39 #include "pixman-combine32.h"
     40 #include "pixman-inlines.h"
     41 
/* Constant SIMD masks shared by all helpers in this file.  They are
 * filled in once by the SSE2 setup code (the initializer is outside
 * this chunk).  The names encode the intended per-lane pattern, e.g.
 * mask_0080 presumably holds 0x0080 in each 16-bit lane — confirm
 * against the initializer. */
static __m128i mask_0080;       /* rounding bias for div-by-255 multiply */
static __m128i mask_00ff;       /* 8-bit channel complement (255 - x via xor) */
static __m128i mask_0101;       /* mulhi multiplier for div-by-255 */
static __m128i mask_ffff;
static __m128i mask_ff000000;   /* alpha byte of each packed 32-bit pixel */
static __m128i mask_alpha;

/* masks used when packing 8888 down to r5g6b5 */
static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
/* channel masks used when expanding r5g6b5 up to 8888 */
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

/* bit-replication masks so 0x1f / 0x3f expand to a full 0xff */
static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* used by the PMADDWD-based 565 packer */
static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;
     61 
     62 static force_inline __m128i
     63 unpack_32_1x128 (uint32_t data)
     64 {
     65    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
     66 }
     67 
     68 static force_inline void
     69 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
     70 {
     71    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
     72    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
     73 }
     74 
     75 static force_inline __m128i
     76 unpack_565_to_8888 (__m128i lo)
     77 {
     78    __m128i r, g, b, rb, t;
     79 
     80    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
     81    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
     82    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
     83 
     84    rb = _mm_or_si128 (r, b);
     85    t  = _mm_and_si128 (rb, mask_565_fix_rb);
     86    t  = _mm_srli_epi32 (t, 5);
     87    rb = _mm_or_si128 (rb, t);
     88 
     89    t  = _mm_and_si128 (g, mask_565_fix_g);
     90    t  = _mm_srli_epi32 (t, 6);
     91    g  = _mm_or_si128 (g, t);
     92 
     93    return _mm_or_si128 (rb, g);
     94 }
     95 
     96 static force_inline void
     97 unpack_565_128_4x128 (__m128i  data,
     98                      __m128i* data0,
     99                      __m128i* data1,
    100                      __m128i* data2,
    101                      __m128i* data3)
    102 {
    103    __m128i lo, hi;
    104 
    105    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    106    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
    107 
    108    lo = unpack_565_to_8888 (lo);
    109    hi = unpack_565_to_8888 (hi);
    110 
    111    unpack_128_2x128 (lo, data0, data1);
    112    unpack_128_2x128 (hi, data2, data3);
    113 }
    114 
    115 static force_inline uint16_t
    116 pack_565_32_16 (uint32_t pixel)
    117 {
    118    return (uint16_t) (((pixel >> 8) & 0xf800) |
    119 	       ((pixel >> 5) & 0x07e0) |
    120 	       ((pixel >> 3) & 0x001f));
    121 }
    122 
    123 static force_inline __m128i
    124 pack_2x128_128 (__m128i lo, __m128i hi)
    125 {
    126    return _mm_packus_epi16 (lo, hi);
    127 }
    128 
/* Pack eight packed 8888 pixels (four per input register) into eight
 * r5g6b5 values in one register.
 *
 * Red and blue are moved into 565 position in a single PMADDWD per
 * register: mask_565_pack_multiplier is presumably chosen so that the
 * per-16-bit products of the masked r/b bits sum into the right bit
 * positions — confirm against the initializer.  Green is masked and
 * OR-ed in separately. */
static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 (SSE4.1 only): shift the result up so
     * the signed pack below saturates the same way the unsigned one
     * would, then pack the 32-bit lanes to 16-bit. */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}
    151 
    152 static force_inline __m128i
    153 pack_565_2x128_128 (__m128i lo, __m128i hi)
    154 {
    155    __m128i data;
    156    __m128i r, g1, g2, b;
    157 
    158    data = pack_2x128_128 (lo, hi);
    159 
    160    r  = _mm_and_si128 (data, mask_565_r);
    161    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    162    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    163    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
    164 
    165    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
    166 }
    167 
    168 static force_inline __m128i
    169 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
    170 {
    171    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
    172 		     pack_565_2x128_128 (*xmm2, *xmm3));
    173 }
    174 
    175 static force_inline int
    176 is_opaque (__m128i x)
    177 {
    178    __m128i ffs = _mm_cmpeq_epi8 (x, x);
    179 
    180    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
    181 }
    182 
    183 static force_inline int
    184 is_zero (__m128i x)
    185 {
    186    return _mm_movemask_epi8 (
    187 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
    188 }
    189 
    190 static force_inline int
    191 is_transparent (__m128i x)
    192 {
    193    return (_mm_movemask_epi8 (
    194 	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
    195 }
    196 
    197 static force_inline __m128i
    198 expand_pixel_32_1x128 (uint32_t data)
    199 {
    200    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
    201 }
    202 
    203 static force_inline __m128i
    204 expand_alpha_1x128 (__m128i data)
    205 {
    206    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
    207 					     _MM_SHUFFLE (3, 3, 3, 3)),
    208 			_MM_SHUFFLE (3, 3, 3, 3));
    209 }
    210 
    211 static force_inline void
    212 expand_alpha_2x128 (__m128i  data_lo,
    213                    __m128i  data_hi,
    214                    __m128i* alpha_lo,
    215                    __m128i* alpha_hi)
    216 {
    217    __m128i lo, hi;
    218 
    219    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    220    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
    221 
    222    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    223    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
    224 }
    225 
    226 static force_inline void
    227 expand_alpha_rev_2x128 (__m128i  data_lo,
    228                        __m128i  data_hi,
    229                        __m128i* alpha_lo,
    230                        __m128i* alpha_hi)
    231 {
    232    __m128i lo, hi;
    233 
    234    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    235    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    236    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    237    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
    238 }
    239 
    240 static force_inline void
    241 pix_multiply_2x128 (__m128i* data_lo,
    242                    __m128i* data_hi,
    243                    __m128i* alpha_lo,
    244                    __m128i* alpha_hi,
    245                    __m128i* ret_lo,
    246                    __m128i* ret_hi)
    247 {
    248    __m128i lo, hi;
    249 
    250    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    251    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    252    lo = _mm_adds_epu16 (lo, mask_0080);
    253    hi = _mm_adds_epu16 (hi, mask_0080);
    254    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    255    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
    256 }
    257 
    258 static force_inline void
    259 pix_add_multiply_2x128 (__m128i* src_lo,
    260                        __m128i* src_hi,
    261                        __m128i* alpha_dst_lo,
    262                        __m128i* alpha_dst_hi,
    263                        __m128i* dst_lo,
    264                        __m128i* dst_hi,
    265                        __m128i* alpha_src_lo,
    266                        __m128i* alpha_src_hi,
    267                        __m128i* ret_lo,
    268                        __m128i* ret_hi)
    269 {
    270    __m128i t1_lo, t1_hi;
    271    __m128i t2_lo, t2_hi;
    272 
    273    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    274    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
    275 
    276    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    277    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
    278 }
    279 
/* Per-channel complement of two unpacked registers: xor with
 * mask_00ff (presumably 0x00ff per 16-bit lane — confirm in the
 * initializer), i.e. 255 - x for 8-bit channel values. */
static force_inline void
negate_2x128 (__m128i  data_lo,
	      __m128i  data_hi,
	      __m128i* neg_lo,
	      __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}
    289 
    290 static force_inline void
    291 invert_colors_2x128 (__m128i  data_lo,
    292                     __m128i  data_hi,
    293                     __m128i* inv_lo,
    294                     __m128i* inv_hi)
    295 {
    296    __m128i lo, hi;
    297 
    298    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    299    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    300    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    301    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
    302 }
    303 
    304 static force_inline void
    305 over_2x128 (__m128i* src_lo,
    306            __m128i* src_hi,
    307            __m128i* alpha_lo,
    308            __m128i* alpha_hi,
    309            __m128i* dst_lo,
    310            __m128i* dst_hi)
    311 {
    312    __m128i t1, t2;
    313 
    314    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
    315 
    316    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
    317 
    318    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    319    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
    320 }
    321 
    322 static force_inline void
    323 over_rev_non_pre_2x128 (__m128i  src_lo,
    324                        __m128i  src_hi,
    325                        __m128i* dst_lo,
    326                        __m128i* dst_hi)
    327 {
    328    __m128i lo, hi;
    329    __m128i alpha_lo, alpha_hi;
    330 
    331    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
    332 
    333    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    334    hi = _mm_or_si128 (alpha_hi, mask_alpha);
    335 
    336    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
    337 
    338    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
    339 
    340    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
    341 }
    342 
    343 static force_inline void
    344 in_over_2x128 (__m128i* src_lo,
    345               __m128i* src_hi,
    346               __m128i* alpha_lo,
    347               __m128i* alpha_hi,
    348               __m128i* mask_lo,
    349               __m128i* mask_hi,
    350               __m128i* dst_lo,
    351               __m128i* dst_hi)
    352 {
    353    __m128i s_lo, s_hi;
    354    __m128i a_lo, a_hi;
    355 
    356    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    357    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
    358 
    359    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
    360 }
    361 
    362 /* load 4 pixels from a 16-byte boundary aligned address */
    363 static force_inline __m128i
    364 load_128_aligned (__m128i* src)
    365 {
    366    return _mm_load_si128 (src);
    367 }
    368 
    369 /* load 4 pixels from a unaligned address */
    370 static force_inline __m128i
    371 load_128_unaligned (const __m128i* src)
    372 {
    373    return _mm_loadu_si128 (src);
    374 }
    375 
    376 /* save 4 pixels on a 16-byte boundary aligned address */
    377 static force_inline void
    378 save_128_aligned (__m128i* dst,
    379                  __m128i  data)
    380 {
    381    _mm_store_si128 (dst, data);
    382 }
    383 
    384 static force_inline __m128i
    385 load_32_1x128 (uint32_t data)
    386 {
    387    return _mm_cvtsi32_si128 (data);
    388 }
    389 
    390 static force_inline __m128i
    391 expand_alpha_rev_1x128 (__m128i data)
    392 {
    393    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
    394 }
    395 
    396 static force_inline __m128i
    397 expand_pixel_8_1x128 (uint8_t data)
    398 {
    399    return _mm_shufflelo_epi16 (
    400 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
    401 }
    402 
    403 static force_inline __m128i
    404 pix_multiply_1x128 (__m128i data,
    405 	    __m128i alpha)
    406 {
    407    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
    408 				    mask_0080),
    409 		    mask_0101);
    410 }
    411 
    412 static force_inline __m128i
    413 pix_add_multiply_1x128 (__m128i* src,
    414 		__m128i* alpha_dst,
    415 		__m128i* dst,
    416 		__m128i* alpha_src)
    417 {
    418    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    419    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
    420 
    421    return _mm_adds_epu8 (t1, t2);
    422 }
    423 
/* Per-channel complement: xor with mask_00ff (presumably 0x00ff per
 * 16-bit lane — confirm in the initializer), i.e. 255 - x for 8-bit
 * channel values. */
static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}
    429 
    430 static force_inline __m128i
    431 invert_colors_1x128 (__m128i data)
    432 {
    433    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
    434 }
    435 
    436 static force_inline __m128i
    437 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
    438 {
    439    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
    440 }
    441 
    442 static force_inline __m128i
    443 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
    444 {
    445    return over_1x128 (pix_multiply_1x128 (*src, *mask),
    446 	       pix_multiply_1x128 (*alpha, *mask),
    447 	       *dst);
    448 }
    449 
    450 static force_inline __m128i
    451 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
    452 {
    453    __m128i alpha = expand_alpha_1x128 (src);
    454 
    455    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
    456 				   _mm_or_si128 (alpha, mask_alpha)),
    457 	       alpha,
    458 	       dst);
    459 }
    460 
    461 static force_inline uint32_t
    462 pack_1x128_32 (__m128i data)
    463 {
    464    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
    465 }
    466 
    467 static force_inline __m128i
    468 expand565_16_1x128 (uint16_t pixel)
    469 {
    470    __m128i m = _mm_cvtsi32_si128 (pixel);
    471 
    472    m = unpack_565_to_8888 (m);
    473 
    474    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
    475 }
    476 
    477 static force_inline uint32_t
    478 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
    479 {
    480    uint8_t a;
    481    __m128i xmms;
    482 
    483    a = src >> 24;
    484 
    485    if (a == 0xff)
    486    {
    487 return src;
    488    }
    489    else if (src)
    490    {
    491 xmms = unpack_32_1x128 (src);
    492 return pack_1x128_32 (
    493     over_1x128 (xmms, expand_alpha_1x128 (xmms),
    494 		unpack_32_1x128 (dst)));
    495    }
    496 
    497    return dst;
    498 }
    499 
    500 static force_inline uint32_t
    501 combine1 (const uint32_t *ps, const uint32_t *pm)
    502 {
    503    uint32_t s;
    504    memcpy(&s, ps, sizeof(uint32_t));
    505 
    506    if (pm)
    507    {
    508 __m128i ms, mm;
    509 
    510 mm = unpack_32_1x128 (*pm);
    511 mm = expand_alpha_1x128 (mm);
    512 
    513 ms = unpack_32_1x128 (s);
    514 ms = pix_multiply_1x128 (ms, mm);
    515 
    516 s = pack_1x128_32 (ms);
    517    }
    518 
    519    return s;
    520 }
    521 
    522 static force_inline __m128i
    523 combine4 (const __m128i *ps, const __m128i *pm)
    524 {
    525    __m128i xmm_src_lo, xmm_src_hi;
    526    __m128i xmm_msk_lo, xmm_msk_hi;
    527    __m128i s;
    528 
    529    if (pm)
    530    {
    531 xmm_msk_lo = load_128_unaligned (pm);
    532 
    533 if (is_transparent (xmm_msk_lo))
    534     return _mm_setzero_si128 ();
    535    }
    536 
    537    s = load_128_unaligned (ps);
    538 
    539    if (pm)
    540    {
    541 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
    542 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
    543 
    544 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
    545 
    546 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
    547 		    &xmm_msk_lo, &xmm_msk_hi,
    548 		    &xmm_src_lo, &xmm_src_hi);
    549 
    550 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    551    }
    552 
    553    return s;
    554 }
    555 
/* OVER combiner for w pixels with a mask: scalar loop until dst is
 * 16-byte aligned, then four pixels per iteration with fast paths for
 * an all-zero mask (skip) and an effectively opaque source (plain
 * copy), then a scalar tail. */
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *	  pd,
			       const uint32_t*    ps,
			       const uint32_t*    pm,
			       int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;
	pm++;
	w--;
    }

    while (w >= 4)
    {
	__m128i mask = load_128_unaligned ((__m128i *)pm);

	/* zero mask contributes nothing: dst stays as-is */
	if (!is_zero (mask))
	{
	    __m128i src;
	    __m128i src_hi, src_lo;
	    __m128i mask_hi, mask_lo;
	    __m128i alpha_hi, alpha_lo;

	    src = load_128_unaligned ((__m128i *)ps);

	    /* alpha of src AND mask is 0xff everywhere only when both
	     * are, so src IN mask is opaque and OVER is a copy */
	    if (is_opaque (_mm_and_si128 (src, mask)))
	    {
		save_128_aligned ((__m128i *)pd, src);
	    }
	    else
	    {
		__m128i dst = load_128_aligned ((__m128i *)pd);
		__m128i dst_hi, dst_lo;

		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
		unpack_128_2x128 (src, &src_lo, &src_hi);

		/* src IN mask */
		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
		pix_multiply_2x128 (&src_lo, &src_hi,
				    &mask_lo, &mask_hi,
				    &src_lo, &src_hi);

		unpack_128_2x128 (dst, &dst_lo, &dst_hi);

		expand_alpha_2x128 (src_lo, src_hi,
				    &alpha_lo, &alpha_hi);

		/* (src IN mask) OVER dst */
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
			    &dst_lo, &dst_hi);

		save_128_aligned (
		    (__m128i *)pd,
		    pack_2x128_128 (dst_lo, dst_hi));
	    }
	}

	pm += 4;
	ps += 4;
	pd += 4;
	w -= 4;
    }

    /* scalar tail for the remaining 0-3 pixels */
    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;
	pm++;

	w--;
    }
}
    641 
/* OVER combiner for w pixels without a mask: scalar loop until dst is
 * 16-byte aligned, then four pixels per iteration with fast paths for
 * an all-zero source (skip) and an all-opaque source (plain copy),
 * then a scalar tail. */
static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
			  const uint32_t*    ps,
			  int                w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = *ps;

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;
	w--;
    }

    while (w >= 4)
    {
	__m128i src;
	__m128i src_hi, src_lo, dst_hi, dst_lo;
	__m128i alpha_hi, alpha_lo;

	src = load_128_unaligned ((__m128i *)ps);

	/* zero source leaves dst untouched */
	if (!is_zero (src))
	{
	    /* opaque source: OVER degenerates to a copy */
	    if (is_opaque (src))
	    {
		save_128_aligned ((__m128i *)pd, src);
	    }
	    else
	    {
		__m128i dst = load_128_aligned ((__m128i *)pd);

		unpack_128_2x128 (src, &src_lo, &src_hi);
		unpack_128_2x128 (dst, &dst_lo, &dst_hi);

		expand_alpha_2x128 (src_lo, src_hi,
				    &alpha_lo, &alpha_hi);
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
			    &dst_lo, &dst_hi);

		save_128_aligned (
		    (__m128i *)pd,
		    pack_2x128_128 (dst_lo, dst_hi));
	    }
	}

	ps += 4;
	pd += 4;
	w -= 4;
    }

    /* scalar tail for the remaining 0-3 pixels */
    while (w)
    {
	d = *pd;
	s = *ps;

	if (s)
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
	pd++;
	ps++;

	w--;
    }
}
    711 
    712 static force_inline void
    713 sse2_combine_over_u (pixman_implementation_t *imp,
    714                     pixman_op_t              op,
    715                     uint32_t *               pd,
    716                     const uint32_t *         ps,
    717                     const uint32_t *         pm,
    718                     int                      w)
    719 {
    720    if (pm)
    721 core_combine_over_u_sse2_mask (pd, ps, pm, w);
    722    else
    723 core_combine_over_u_sse2_no_mask (pd, ps, w);
    724 }
    725 
/* OVER-reverse combiner: composites the destination over the source
 * (note the swapped operands to core_combine_over_u_pixel_sse2 and to
 * over_2x128) and stores the result back to dst. */
static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
			     pixman_op_t              op,
			     uint32_t *               pd,
			     const uint32_t *         ps,
			     const uint32_t *         pm,
			     int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
	   ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	/* swapped operands: d OVER s */
	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	/* I'm loading unaligned because I'm not sure
	 * about the address alignment.
	 */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* dst OVER src: the result accumulates into the src registers */
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_src_lo, &xmm_src_hi);

	/* rebuild the 4 pixel data and save */
	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));

	w -= 4;
	ps += 4;
	pd += 4;

	if (pm)
	    pm += 4;
    }

    /* scalar tail */
    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
    796 
    797 static force_inline uint32_t
    798 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
    799 {
    800    uint32_t maska = src >> 24;
    801 
    802    if (maska == 0)
    803    {
    804 return 0;
    805    }
    806    else if (maska != 0xff)
    807    {
    808 return pack_1x128_32 (
    809     pix_multiply_1x128 (unpack_32_1x128 (dst),
    810 			expand_alpha_1x128 (unpack_32_1x128 (src))));
    811    }
    812 
    813    return dst;
    814 }
    815 
/* IN combiner: dst = src scaled by dst's alpha.  Scalar loop until
 * dst is 16-byte aligned, four pixels per SSE2 iteration, scalar
 * tail.  Note the (d, s) argument order: core_combine_in_u_pixel_sse2
 * scales its second argument by the first one's alpha. */
static void
sse2_combine_in_u (pixman_implementation_t *imp,
		   pixman_op_t              op,
		   uint32_t *               pd,
		   const uint32_t *         ps,
		   const uint32_t *         pm,
		   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	/* (d, s): s scaled by d's alpha */
	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* four pixels at a time */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

	/* alpha(dst) in every channel */
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    /* scalar tail */
    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
    876 
/* IN-reverse combiner: dst = dst scaled by src's alpha.  Same loop
 * structure as sse2_combine_in_u, but with the pixel-helper operands
 * (s, d) and the multiply operands swapped. */
static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
			   pixman_op_t              op,
			   uint32_t *               pd,
			   const uint32_t *         ps,
			   const uint32_t *         pm,
			   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	/* (s, d): d scaled by s's alpha */
	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    /* four pixels at a time */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

	/* alpha(src) in every channel */
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    /* scalar tail */
    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
    937 
/* OUT-reverse combiner: dst = dst * (255 - alpha(src)).  Scalar loop
 * until dst is 16-byte aligned, four pixels per SSE2 iteration,
 * scalar tail. */
static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
			    pixman_op_t              op,
			    uint32_t *               pd,
			    const uint32_t *         ps,
			    const uint32_t *         pm,
			    int                      w)
{
    /* align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	/* d * (255 - alpha(s)) */
	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));

	if (pm)
	    pm++;
	ps++;
	w--;
    }

    /* four pixels at a time */
    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	/* 255 - alpha(src) in every channel */
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;

	w -= 4;
    }

    /* scalar tail */
    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (d), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
	ps++;
	if (pm)
	    pm++;
	w--;
    }
}
   1006 
/* OUT combiner: dst = src * (255 - alpha(dst)).  Scalar loop until
 * dst is 16-byte aligned, four pixels per SSE2 iteration, scalar
 * tail. */
static void
sse2_combine_out_u (pixman_implementation_t *imp,
		    pixman_op_t              op,
		    uint32_t *               pd,
		    const uint32_t *         ps,
		    const uint32_t *         pm,
		    int                      w)
{
    /* align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	/* s * (255 - alpha(d)) */
	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (s), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    /* four pixels at a time */
    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	/* 255 - alpha(dst) in every channel */
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    /* scalar tail */
    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x128_32 (
	    pix_multiply_1x128 (
		unpack_32_1x128 (s), negate_1x128 (
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
   1073 
   1074 static force_inline uint32_t
   1075 core_combine_atop_u_pixel_sse2 (uint32_t src,
   1076                                uint32_t dst)
   1077 {
   1078    __m128i s = unpack_32_1x128 (src);
   1079    __m128i d = unpack_32_1x128 (dst);
   1080 
   1081    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
   1082    __m128i da = expand_alpha_1x128 (d);
   1083 
   1084    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
   1085 }
   1086 
/* ATOP combiner, unified alpha:
 * dest = src * dest.alpha + dest * (1 - src.alpha).
 *
 * pm is an optional mask (may be NULL); combine1/combine4 fold it into
 * the source.  Head/tail loops handle unaligned edges one pixel at a
 * time, the middle loop does four pixels per iteration.
 */
static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
   uint32_t s, d;

   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

   /* Head: align pd to 16 bytes. */
   while (w && ((uintptr_t)pd & 15))
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
w--;
ps++;
if (pm)
    pm++;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);

unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* Invert source alpha: (1 - src.alpha). */
negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
	      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

/* dst = src * dst.a + dst * (1 - src.a), fused add-multiply. */
pix_add_multiply_2x128 (
    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
w -= 4;
if (pm)
    pm += 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
w--;
ps++;
if (pm)
    pm++;
   }
}
   1157 
   1158 static force_inline uint32_t
   1159 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
   1160                                        uint32_t dst)
   1161 {
   1162    __m128i s = unpack_32_1x128 (src);
   1163    __m128i d = unpack_32_1x128 (dst);
   1164 
   1165    __m128i sa = expand_alpha_1x128 (s);
   1166    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
   1167 
   1168    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
   1169 }
   1170 
/* ATOP_REVERSE combiner, unified alpha:
 * dest = src * (1 - dest.alpha) + dest * src.alpha.
 *
 * Same scanline structure as the other combiners: scalar head until pd
 * is 16-byte aligned, 4-wide SSE2 body, scalar tail.  pm may be NULL.
 */
static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
   uint32_t s, d;

   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

   /* Head: align pd. */
   while (w && ((uintptr_t)pd & 15))
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
ps++;
w--;
if (pm)
    pm++;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);

unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* Invert destination alpha: (1 - dst.alpha). */
negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
	      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* dst = src * (1 - dst.a) + dst * src.a. */
pix_add_multiply_2x128 (
    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
w -= 4;
if (pm)
    pm += 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
ps++;
w--;
if (pm)
    pm++;
   }
}
   1241 
   1242 static force_inline uint32_t
   1243 core_combine_xor_u_pixel_sse2 (uint32_t src,
   1244                               uint32_t dst)
   1245 {
   1246    __m128i s = unpack_32_1x128 (src);
   1247    __m128i d = unpack_32_1x128 (dst);
   1248 
   1249    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
   1250    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
   1251 
   1252    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
   1253 }
   1254 
/* XOR combiner, unified alpha:
 * dest = src * (1 - dest.alpha) + dest * (1 - src.alpha).
 *
 * mask may be NULL; combine1/combine4 fold it into the source when
 * present.  Scalar head to 16-byte-align dst, 4-wide body, scalar tail.
 */
static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dst,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
   int w = width;
   uint32_t s, d;
   uint32_t* pd = dst;
   const uint32_t* ps = src;
   const uint32_t* pm = mask;

   __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

   /* Head: align pd to 16 bytes. */
   while (w && ((uintptr_t)pd & 15))
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
w--;
ps++;
if (pm)
    pm++;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);

unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* Invert both alphas for the XOR formula. */
negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
	      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
	      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* dst = src * (1 - dst.a) + dst * (1 - src.a). */
pix_add_multiply_2x128 (
    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
w -= 4;
if (pm)
    pm += 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
w--;
ps++;
if (pm)
    pm++;
   }
}
   1331 
/* ADD combiner, unified alpha: dest = saturate (src + dest), computed
 * per 8-bit channel with _mm_adds_epu8 (unsigned saturating add).
 *
 * mask may be NULL; combine1/combine4 apply it to the source.  The
 * scalar paths use single-lane SSE registers so saturation semantics
 * match the vector path exactly.
 */
static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dst,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
   int w = width;
   uint32_t s, d;
   uint32_t* pd = dst;
   const uint32_t* ps = src;
   const uint32_t* pm = mask;

   /* Head: scalar pixels until pd is 16-byte aligned. */
   while (w && (uintptr_t)pd & 15)
   {
s = combine1 (ps, pm);
d = *pd;

ps++;
if (pm)
    pm++;
*pd++ = _mm_cvtsi128_si32 (
    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
w--;
   }

   /* Main loop: 16 channels (4 pixels) per saturating add. */
   while (w >= 4)
   {
__m128i s;

s = combine4 ((__m128i*)ps, (__m128i*)pm);

save_128_aligned (
    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));

pd += 4;
ps += 4;
if (pm)
    pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w--)
   {
s = combine1 (ps, pm);
d = *pd;

ps++;
*pd++ = _mm_cvtsi128_si32 (
    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
if (pm)
    pm++;
   }
}
   1387 
   1388 static force_inline uint32_t
   1389 core_combine_saturate_u_pixel_sse2 (uint32_t src,
   1390                                    uint32_t dst)
   1391 {
   1392    __m128i ms = unpack_32_1x128 (src);
   1393    __m128i md = unpack_32_1x128 (dst);
   1394    uint32_t sa = src >> 24;
   1395    uint32_t da = ~dst >> 24;
   1396 
   1397    if (sa > da)
   1398    {
   1399 ms = pix_multiply_1x128 (
   1400     ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
   1401    }
   1402 
   1403    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
   1404 }
   1405 
/* SATURATE combiner, unified alpha.  Fast path: when every src alpha in
 * a 4-pixel group fits into the corresponding dst headroom, the result
 * is a plain saturating add; otherwise each of the four pixels takes the
 * per-pixel rescaling path.  pm may be NULL.
 */
static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               pd,
                        const uint32_t *         ps,
                        const uint32_t *         pm,
                        int                      w)
{
   uint32_t s, d;

   uint32_t pack_cmp;
   __m128i xmm_src, xmm_dst;

   /* Head: scalar pixels until pd is 16-byte aligned. */
   while (w && (uintptr_t)pd & 15)
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
w--;
ps++;
if (pm)
    pm++;
   }

   /* Main loop: test four pixels at once. */
   while (w >= 4)
   {
xmm_dst = load_128_aligned  ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

/* Compare each src alpha against the matching ~dst alpha. */
pack_cmp = _mm_movemask_epi8 (
    _mm_cmpgt_epi32 (
	_mm_srli_epi32 (xmm_src, 24),
	_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

/* If any src alpha is greater than the respective ~dst alpha,
 * fall back to the per-pixel rescaling path for all four. */
if (pack_cmp)
{
    s = combine1 (ps++, pm);
    d = *pd;
    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    if (pm)
	pm++;

    s = combine1 (ps++, pm);
    d = *pd;
    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    if (pm)
	pm++;

    s = combine1 (ps++, pm);
    d = *pd;
    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    if (pm)
	pm++;

    s = combine1 (ps++, pm);
    d = *pd;
    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
    if (pm)
	pm++;
}
else
{
    /* All four fit: plain saturating add. */
    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

    pd += 4;
    ps += 4;
    if (pm)
	pm += 4;
}

w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w--)
   {
s = combine1 (ps, pm);
d = *pd;

*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
ps++;
if (pm)
    pm++;
   }
}
   1492 
   1493 static void
   1494 sse2_combine_src_ca (pixman_implementation_t *imp,
   1495                     pixman_op_t              op,
   1496                     uint32_t *               pd,
   1497                     const uint32_t *         ps,
   1498                     const uint32_t *         pm,
   1499                     int                      w)
   1500 {
   1501    uint32_t s, m;
   1502 
   1503    __m128i xmm_src_lo, xmm_src_hi;
   1504    __m128i xmm_mask_lo, xmm_mask_hi;
   1505    __m128i xmm_dst_lo, xmm_dst_hi;
   1506 
   1507    while (w && (uintptr_t)pd & 15)
   1508    {
   1509 s = *ps++;
   1510 m = *pm++;
   1511 *pd++ = pack_1x128_32 (
   1512     pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
   1513 w--;
   1514    }
   1515 
   1516    while (w >= 4)
   1517    {
   1518 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
   1519 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
   1520 
   1521 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1522 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
   1523 
   1524 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   1525 		    &xmm_mask_lo, &xmm_mask_hi,
   1526 		    &xmm_dst_lo, &xmm_dst_hi);
   1527 
   1528 save_128_aligned (
   1529     (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1530 
   1531 ps += 4;
   1532 pd += 4;
   1533 pm += 4;
   1534 w -= 4;
   1535    }
   1536 
   1537    while (w)
   1538    {
   1539 s = *ps++;
   1540 m = *pm++;
   1541 *pd++ = pack_1x128_32 (
   1542     pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
   1543 w--;
   1544    }
   1545 }
   1546 
   1547 static force_inline uint32_t
   1548 core_combine_over_ca_pixel_sse2 (uint32_t src,
   1549                                 uint32_t mask,
   1550                                 uint32_t dst)
   1551 {
   1552    __m128i s = unpack_32_1x128 (src);
   1553    __m128i expAlpha = expand_alpha_1x128 (s);
   1554    __m128i unpk_mask = unpack_32_1x128 (mask);
   1555    __m128i unpk_dst  = unpack_32_1x128 (dst);
   1556 
   1557    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
   1558 }
   1559 
/* OVER combiner, component alpha: dest = (src IN mask) OVER dest.
 *
 * pm is the per-channel mask scanline (never NULL here).  Scalar head
 * until pd is 16-byte aligned, then four pixels per iteration, then a
 * scalar tail.
 */
static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_alpha_lo, xmm_alpha_hi;
   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);

/* (src IN mask) OVER dst, result written into the dst registers. */
in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
	       &xmm_alpha_lo, &xmm_alpha_hi,
	       &xmm_mask_lo, &xmm_mask_hi,
	       &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
w--;
   }
}
   1622 
   1623 static force_inline uint32_t
   1624 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
   1625                                         uint32_t mask,
   1626                                         uint32_t dst)
   1627 {
   1628    __m128i d = unpack_32_1x128 (dst);
   1629 
   1630    return pack_1x128_32 (
   1631 over_1x128 (d, expand_alpha_1x128 (d),
   1632 	    pix_multiply_1x128 (unpack_32_1x128 (src),
   1633 				unpack_32_1x128 (mask))));
   1634 }
   1635 
/* OVER_REVERSE combiner, component alpha:
 * dest = dest OVER (src * mask).
 *
 * Same head/body/tail structure as the other _ca combiners; note the
 * SIMD result is accumulated into the mask registers before packing.
 */
static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_alpha_lo, xmm_alpha_hi;
   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);
/* mask = src * mask. */
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

/* mask = dst OVER mask (dst on top). */
over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
	    &xmm_alpha_lo, &xmm_alpha_hi,
	    &xmm_mask_lo, &xmm_mask_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
w--;
   }
}
   1700 
/* IN combiner, component alpha: dest = (src * mask) * dest.alpha.
 *
 * Scalar head to 16-byte-align pd, four pixels per SSE2 iteration,
 * scalar tail.  pm is never NULL for _ca combiners.
 */
static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_alpha_lo, xmm_alpha_hi;
   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
	expand_alpha_1x128 (unpack_32_1x128 (d))));

w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);

/* dst = src * mask ... */
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_dst_lo, &xmm_dst_hi);

/* ... then * dst.alpha. */
pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	pix_multiply_1x128 (
	    unpack_32_1x128 (s), unpack_32_1x128 (m)),
	expand_alpha_1x128 (unpack_32_1x128 (d))));

w--;
   }
}
   1775 
/* IN_REVERSE combiner, component alpha:
 * dest = dest * (mask * src.alpha).
 *
 * Scalar head to 16-byte-align pd, four pixels per SSE2 iteration,
 * scalar tail.
 */
static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_alpha_lo, xmm_alpha_hi;
   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	unpack_32_1x128 (d),
	pix_multiply_1x128 (unpack_32_1x128 (m),
			   expand_alpha_1x128 (unpack_32_1x128 (s)))));
w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

/* alpha = mask * src.alpha. */
expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);
pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);

/* dst = dst * alpha. */
pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	unpack_32_1x128 (d),
	pix_multiply_1x128 (unpack_32_1x128 (m),
			   expand_alpha_1x128 (unpack_32_1x128 (s)))));
w--;
   }
}
   1848 
/* OUT combiner, component alpha:
 * dest = (src * mask) * (1 - dest.alpha).
 *
 * Scalar head to 16-byte-align pd, four pixels per SSE2 iteration,
 * scalar tail.
 */
static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_alpha_lo, xmm_alpha_hi;
   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	pix_multiply_1x128 (
	    unpack_32_1x128 (s), unpack_32_1x128 (m)),
	negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

/* alpha = 1 - dst.alpha. */
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);
negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
	      &xmm_alpha_lo, &xmm_alpha_hi);

/* dst = (src * mask) * alpha. */
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_dst_lo, &xmm_dst_hi);
pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	pix_multiply_1x128 (
	    unpack_32_1x128 (s), unpack_32_1x128 (m)),
	negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

w--;
   }
}
   1924 
/* OUT_REVERSE combiner, component alpha:
 * dest = dest * (1 - mask * src.alpha).
 *
 * Scalar head to 16-byte-align pd, four pixels per SSE2 iteration,
 * scalar tail.
 */
static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_alpha_lo, xmm_alpha_hi;
   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	unpack_32_1x128 (d),
	negate_1x128 (pix_multiply_1x128 (
			 unpack_32_1x128 (m),
			 expand_alpha_1x128 (unpack_32_1x128 (s))))));
w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

/* mask = 1 - mask * src.alpha. */
expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi);

pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

negate_2x128 (xmm_mask_lo, xmm_mask_hi,
	      &xmm_mask_lo, &xmm_mask_hi);

/* dst = dst * mask. */
pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    pix_multiply_1x128 (
	unpack_32_1x128 (d),
	negate_1x128 (pix_multiply_1x128 (
			 unpack_32_1x128 (m),
			 expand_alpha_1x128 (unpack_32_1x128 (s))))));
w--;
   }
}
   2003 
   2004 static force_inline uint32_t
   2005 core_combine_atop_ca_pixel_sse2 (uint32_t src,
   2006                                 uint32_t mask,
   2007                                 uint32_t dst)
   2008 {
   2009    __m128i m = unpack_32_1x128 (mask);
   2010    __m128i s = unpack_32_1x128 (src);
   2011    __m128i d = unpack_32_1x128 (dst);
   2012    __m128i sa = expand_alpha_1x128 (s);
   2013    __m128i da = expand_alpha_1x128 (d);
   2014 
   2015    s = pix_multiply_1x128 (s, m);
   2016    m = negate_1x128 (pix_multiply_1x128 (m, sa));
   2017 
   2018    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
   2019 }
   2020 
/* ATOP combiner, component alpha:
 * dest = (src * mask) * dest.alpha + dest * (1 - mask * src.alpha).
 *
 * Scalar head to 16-byte-align pd, four pixels per SSE2 iteration,
 * scalar tail.
 */
static void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: align pd. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
w--;
   }

   /* Main loop: four pixels at a time. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* src = src * mask; mask = 1 - mask * src.alpha. */
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_src_lo, &xmm_src_hi);
pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

/* dst = dst * mask + src * dst.alpha. */
pix_add_multiply_2x128 (
    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 0-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
w--;
   }
}
   2095 
   2096 static force_inline uint32_t
   2097 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
   2098                                         uint32_t mask,
   2099                                         uint32_t dst)
   2100 {
   2101    __m128i m = unpack_32_1x128 (mask);
   2102    __m128i s = unpack_32_1x128 (src);
   2103    __m128i d = unpack_32_1x128 (dst);
   2104 
   2105    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
   2106    __m128i sa = expand_alpha_1x128 (s);
   2107 
   2108    s = pix_multiply_1x128 (s, m);
   2109    m = pix_multiply_1x128 (m, sa);
   2110 
   2111    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
   2112 }
   2113 
/* Component-alpha ATOP-reverse combiner (SSE2):
 *
 *     dest = src.mask.NOT alpha(dest) + dest.(mask.alpha(src))
 *
 * evaluated per channel.  Same head/body/tail structure as the other
 * CA combiners: scalar until pd is 16-byte aligned, then four pixels
 * per SSE2 iteration, then the scalar remainder.
 */
static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: scalar pixels until pd reaches 16-byte alignment. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
w--;
   }

   /* Body: four pixels per iteration. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* src *= mask; mask *= alpha(src); invert only the dest alpha. */
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_src_lo, &xmm_src_hi);
pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
	      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* dest = dest * mask + src * NOT alpha(dest) */
pix_add_multiply_2x128 (
    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 1-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
w--;
   }
}
   2189 
   2190 static force_inline uint32_t
   2191 core_combine_xor_ca_pixel_sse2 (uint32_t src,
   2192                                uint32_t mask,
   2193                                uint32_t dst)
   2194 {
   2195    __m128i a = unpack_32_1x128 (mask);
   2196    __m128i s = unpack_32_1x128 (src);
   2197    __m128i d = unpack_32_1x128 (dst);
   2198 
   2199    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
   2200 			       a, expand_alpha_1x128 (s)));
   2201    __m128i dest      = pix_multiply_1x128 (s, a);
   2202    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
   2203 
   2204    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
   2205                                                &alpha_dst,
   2206                                                &dest,
   2207                                                &alpha_src));
   2208 }
   2209 
/* Component-alpha XOR combiner (SSE2):
 *
 *     dest = src.mask.NOT alpha(dest) + dest.NOT (mask.alpha(src))
 *
 * evaluated per channel.  Scalar head until pd is 16-byte aligned,
 * four pixels per SSE2 iteration, then a scalar tail.
 */
static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
   __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: scalar pixels until pd reaches 16-byte alignment. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
w--;
   }

   /* Body: four pixels per iteration. */
   while (w >= 4)
   {
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
		    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

/* src *= mask; mask *= alpha(src). */
pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_src_lo, &xmm_src_hi);
pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
		    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

/* XOR complements both weighting factors. */
negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
	      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
negate_2x128 (xmm_mask_lo, xmm_mask_hi,
	      &xmm_mask_lo, &xmm_mask_hi);

/* dest = dest * NOT (mask.alpha(src)) + src * NOT alpha(dest) */
pix_add_multiply_2x128 (
    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
    &xmm_dst_lo, &xmm_dst_hi);

save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 1-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
w--;
   }
}
   2287 
/* Component-alpha ADD combiner (SSE2):
 *
 *     dest = saturate_add (src.mask, dest)
 *
 * using unsigned 8-bit saturating addition (_mm_adds_epu8).  Scalar
 * head until pd is 16-byte aligned, four pixels per SSE2 iteration,
 * then a scalar tail.
 */
static void
sse2_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
   uint32_t s, m, d;

   __m128i xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask_lo, xmm_mask_hi;

   /* Head: scalar pixels until pd reaches 16-byte alignment. */
   while (w && (uintptr_t)pd & 15)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
				       unpack_32_1x128 (m)),
		   unpack_32_1x128 (d)));
w--;
   }

   /* Body: four pixels per iteration. */
   while (w >= 4)
   {
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*)pd);

unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
		    &xmm_mask_lo, &xmm_mask_hi,
		    &xmm_src_lo, &xmm_src_hi);

/* Saturating add happens on the packed (repacked) halves. */
save_128_aligned (
    (__m128i*)pd, pack_2x128_128 (
	_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
	_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

ps += 4;
pd += 4;
pm += 4;
w -= 4;
   }

   /* Tail: remaining 1-3 pixels. */
   while (w)
   {
s = *ps++;
m = *pm++;
d = *pd;

*pd++ = pack_1x128_32 (
    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
				       unpack_32_1x128 (m)),
		   unpack_32_1x128 (d)));
w--;
   }
}
   2353 
/* Broadcast a 16-bit value into all eight 16-bit lanes of an XMM register. */
static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
   return _mm_set1_epi16 (mask);
}
   2359 
/* Build an XMM register holding the 32-bit pattern (mask0, mask1) repeated
 * twice: { mask0, mask1, mask0, mask1 }.
 *
 * Work around a code generation bug in Sun Studio 12: use a macro instead
 * of an inline function there.
 */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)				\
   (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                     uint32_t mask1)
{
   return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif
   2372 
/* OVER of a solid color onto an a8r8g8b8 destination.  The solid source
 * and its expanded alpha are computed once outside the scanline loop.
 * Each scanline: scalar pixels until dst is 16-byte aligned, then four
 * pixels per SSE2 iteration, then a scalar tail.
 */
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t src;
   uint32_t    *dst_line, *dst, d;
   int32_t w;
   int dst_stride;
   __m128i xmm_src, xmm_alpha;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

   /* A zero source leaves the destination unchanged under OVER. */
   if (src == 0)
return;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

   xmm_src = expand_pixel_32_1x128 (src);
   xmm_alpha = expand_alpha_1x128 (xmm_src);

   while (height--)
   {
dst = dst_line;

dst_line += dst_stride;
w = width;

/* Head: scalar pixels until dst reaches 16-byte alignment. */
while (w && (uintptr_t)dst & 15)
{
    d = *dst;
    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
					xmm_alpha,
					unpack_32_1x128 (d)));
    w--;
}

/* Body: four pixels per iteration. */
while (w >= 4)
{
    xmm_dst = load_128_aligned ((__m128i*)dst);

    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

    over_2x128 (&xmm_src, &xmm_src,
		&xmm_alpha, &xmm_alpha,
		&xmm_dst_lo, &xmm_dst_hi);

    /* rebuild the 4 pixel data and save */
    save_128_aligned (
	(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

    w -= 4;
    dst += 4;
}

/* Tail: remaining 1-3 pixels. */
while (w)
{
    d = *dst;
    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
					xmm_alpha,
					unpack_32_1x128 (d)));
    w--;
}

   }
}
   2441 
/* OVER of a solid color onto an r5g6b5 destination.  Destination pixels
 * are expanded from 565 to 8888, composited, and packed back.  Eight
 * 16-bit pixels (one XMM register) are processed per SSE2 iteration.
 */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t src;
   uint16_t    *dst_line, *dst, d;
   int32_t w;
   int dst_stride;
   __m128i xmm_src, xmm_alpha;
   __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

   /* A zero source leaves the destination unchanged under OVER. */
   if (src == 0)
return;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

   xmm_src = expand_pixel_32_1x128 (src);
   xmm_alpha = expand_alpha_1x128 (xmm_src);

   while (height--)
   {
dst = dst_line;

dst_line += dst_stride;
w = width;

/* Head: scalar pixels until dst reaches 16-byte alignment. */
while (w && (uintptr_t)dst & 15)
{
    d = *dst;

    *dst++ = pack_565_32_16 (
	pack_1x128_32 (over_1x128 (xmm_src,
				   xmm_alpha,
				   expand565_16_1x128 (d))));
    w--;
}

/* Body: eight 565 pixels per iteration (four 2-pixel halves). */
while (w >= 8)
{
    xmm_dst = load_128_aligned ((__m128i*)dst);

    unpack_565_128_4x128 (xmm_dst,
			  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

    over_2x128 (&xmm_src, &xmm_src,
		&xmm_alpha, &xmm_alpha,
		&xmm_dst0, &xmm_dst1);
    over_2x128 (&xmm_src, &xmm_src,
		&xmm_alpha, &xmm_alpha,
		&xmm_dst2, &xmm_dst3);

    xmm_dst = pack_565_4x128_128 (
	&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

    save_128_aligned ((__m128i*)dst, xmm_dst);

    dst += 8;
    w -= 8;
}

/* Tail: remaining pixels. */
while (w--)
{
    d = *dst;
    *dst++ = pack_565_32_16 (
	pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
				   expand565_16_1x128 (d))));
}
   }

}
   2516 
/* Saturating ADD of a solid color through a per-channel (component-alpha)
 * a8r8g8b8 mask onto an a8r8g8b8 destination:
 *
 *     dest = saturate_add (solid * mask, dest)
 *
 * Zero mask pixels (and all-zero 4-pixel mask chunks in the SSE2 loop)
 * are skipped since they leave the destination unchanged.
 */
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
			   pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t src;
   uint32_t    *dst_line, d;
   uint32_t    *mask_line, m;
   uint32_t pack_cmp;
   int dst_stride, mask_stride;

   __m128i xmm_src;
   __m128i xmm_dst;
   __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

   __m128i mmx_src, mmx_mask, mmx_dest;

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

   /* Adding zero is a no-op. */
   if (src == 0)
return;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

   /* Unpack the solid source once into 16-bit-per-channel form. */
   xmm_src = _mm_unpacklo_epi8 (
create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
   mmx_src   = xmm_src;

   while (height--)
   {
int w = width;
const uint32_t *pm = (uint32_t *)mask_line;
uint32_t *pd = (uint32_t *)dst_line;

dst_line += dst_stride;
mask_line += mask_stride;

/* Head: scalar pixels until pd reaches 16-byte alignment. */
while (w && (uintptr_t)pd & 15)
{
    m = *pm++;

    if (m)
    {
	d = *pd;

	mmx_mask = unpack_32_1x128 (m);
	mmx_dest = unpack_32_1x128 (d);

	*pd = pack_1x128_32 (
	    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
			   mmx_dest));
    }

    pd++;
    w--;
}

/* Body: four pixels per iteration. */
while (w >= 4)
{
    xmm_mask = load_128_unaligned ((__m128i*)pm);

    pack_cmp =
	_mm_movemask_epi8 (
	    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

    /* if all bits in mask are zero, pack_cmp is 0xffff and the chunk
     * can be skipped */
    if (pack_cmp != 0xffff)
    {
	xmm_dst = load_128_aligned ((__m128i*)pd);

	unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_src, &xmm_src,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_mask_lo, &xmm_mask_hi);
	xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

	save_128_aligned (
	    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
    }

    pd += 4;
    pm += 4;
    w -= 4;
}

/* Tail: remaining 1-3 pixels. */
while (w)
{
    m = *pm++;

    if (m)
    {
	d = *pd;

	mmx_mask = unpack_32_1x128 (m);
	mmx_dest = unpack_32_1x128 (d);

	*pd = pack_1x128_32 (
	    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
			   mmx_dest));
    }

    pd++;
    w--;
}
   }

}
   2628 
/* OVER of a solid color through a per-channel (component-alpha) a8r8g8b8
 * mask onto an a8r8g8b8 destination (via in_over: (solid IN mask) OVER
 * dest).  Zero mask pixels, and all-zero 4-pixel mask chunks in the SSE2
 * loop, are skipped since they leave the destination unchanged.
 */
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t src;
   uint32_t    *dst_line, d;
   uint32_t    *mask_line, m;
   uint32_t pack_cmp;
   int dst_stride, mask_stride;

   __m128i xmm_src, xmm_alpha;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

   __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

   /* A zero source leaves the destination unchanged under OVER. */
   if (src == 0)
return;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

   /* Unpack the solid source and its alpha once, outside the loops. */
   xmm_src = _mm_unpacklo_epi8 (
create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
   xmm_alpha = expand_alpha_1x128 (xmm_src);
   mmx_src   = xmm_src;
   mmx_alpha = xmm_alpha;

   while (height--)
   {
int w = width;
const uint32_t *pm = (uint32_t *)mask_line;
uint32_t *pd = (uint32_t *)dst_line;

dst_line += dst_stride;
mask_line += mask_stride;

/* Head: scalar pixels until pd reaches 16-byte alignment. */
while (w && (uintptr_t)pd & 15)
{
    m = *pm++;

    if (m)
    {
	d = *pd;
	mmx_mask = unpack_32_1x128 (m);
	mmx_dest = unpack_32_1x128 (d);

	*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
	                                  &mmx_alpha,
	                                  &mmx_mask,
	                                  &mmx_dest));
    }

    pd++;
    w--;
}

/* Body: four pixels per iteration. */
while (w >= 4)
{
    xmm_mask = load_128_unaligned ((__m128i*)pm);

    pack_cmp =
	_mm_movemask_epi8 (
	    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

    /* if all bits in mask are zero, pack_cmp is 0xffff and the chunk
     * can be skipped */
    if (pack_cmp != 0xffff)
    {
	xmm_dst = load_128_aligned ((__m128i*)pd);

	unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	in_over_2x128 (&xmm_src, &xmm_src,
		       &xmm_alpha, &xmm_alpha,
		       &xmm_mask_lo, &xmm_mask_hi,
		       &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    }

    pd += 4;
    pm += 4;
    w -= 4;
}

/* Tail: remaining 1-3 pixels. */
while (w)
{
    m = *pm++;

    if (m)
    {
	d = *pd;
	mmx_mask = unpack_32_1x128 (m);
	mmx_dest = unpack_32_1x128 (d);

	*pd = pack_1x128_32 (
	    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
    }

    pd++;
    w--;
}
   }

}
   2741 
/* OVER of an a8r8g8b8 source, attenuated by the alpha of a solid mask,
 * onto an a8r8g8b8 destination (via in_over: (src IN mask.alpha) OVER
 * dest).  Only the top byte of the solid mask is used, replicated into
 * every 16-bit lane.  Zero source pixels, and all-zero 4-pixel source
 * chunks in the SSE2 loop, are skipped.
 */
static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t    *dst_line, *dst;
   uint32_t    *src_line, *src;
   uint32_t mask;
   int32_t w;
   int dst_stride, src_stride;

   __m128i xmm_mask;
   __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_alpha_lo, xmm_alpha_hi;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

   mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

   /* Broadcast the mask's alpha byte into all lanes. */
   xmm_mask = create_mask_16_128 (mask >> 24);

   while (height--)
   {
dst = dst_line;
dst_line += dst_stride;
src = src_line;
src_line += src_stride;
w = width;

/* Head: scalar pixels until dst reaches 16-byte alignment. */
while (w && (uintptr_t)dst & 15)
{
    uint32_t s = *src++;

    if (s)
    {
	uint32_t d = *dst;

	__m128i ms = unpack_32_1x128 (s);
	__m128i alpha    = expand_alpha_1x128 (ms);
	__m128i dest     = xmm_mask;
	__m128i alpha_dst = unpack_32_1x128 (d);

	*dst = pack_1x128_32 (
	    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
    }
    dst++;
    w--;
}

/* Body: four pixels per iteration; skip all-transparent chunks. */
while (w >= 4)
{
    xmm_src = load_128_unaligned ((__m128i*)src);

    if (!is_zero (xmm_src))
    {
	xmm_dst = load_128_aligned ((__m128i*)dst);

	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
		       &xmm_alpha_lo, &xmm_alpha_hi,
		       &xmm_mask, &xmm_mask,
		       &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    }

    dst += 4;
    src += 4;
    w -= 4;
}

/* Tail: remaining 1-3 pixels. */
while (w)
{
    uint32_t s = *src++;

    if (s)
    {
	uint32_t d = *dst;

	__m128i ms = unpack_32_1x128 (s);
	__m128i alpha = expand_alpha_1x128 (ms);
	__m128i mask  = xmm_mask;
	__m128i dest  = unpack_32_1x128 (d);

	*dst = pack_1x128_32 (
	    in_over_1x128 (&ms, &alpha, &mask, &dest));
    }

    dst++;
    w--;
}
   }

}
   2845 
   2846 static void
   2847 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
   2848                              pixman_composite_info_t *info)
   2849 {
   2850    PIXMAN_COMPOSITE_ARGS (info);
   2851    uint16_t    *dst_line, *dst;
   2852    uint32_t    *src_line, *src, s;
   2853    int dst_stride, src_stride;
   2854    int32_t w;
   2855 
   2856    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2857    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
   2858 
   2859    while (height--)
   2860    {
   2861 dst = dst_line;
   2862 dst_line += dst_stride;
   2863 src = src_line;
   2864 src_line += src_stride;
   2865 w = width;
   2866 
   2867 while (w && (uintptr_t)dst & 15)
   2868 {
   2869     s = *src++;
   2870     *dst = convert_8888_to_0565 (s);
   2871     dst++;
   2872     w--;
   2873 }
   2874 
   2875 while (w >= 8)
   2876 {
   2877     __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
   2878     __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
   2879 
   2880     save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
   2881 
   2882     w -= 8;
   2883     src += 8;
   2884     dst += 8;
   2885 }
   2886 
   2887 while (w)
   2888 {
   2889     s = *src++;
   2890     *dst = convert_8888_to_0565 (s);
   2891     dst++;
   2892     w--;
   2893 }
   2894    }
   2895 }
   2896 
   2897 static void
   2898 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
   2899 		      pixman_composite_info_t *info)
   2900 {
   2901    PIXMAN_COMPOSITE_ARGS (info);
   2902    uint32_t    *dst_line, *dst;
   2903    uint32_t    *src_line, *src;
   2904    int32_t w;
   2905    int dst_stride, src_stride;
   2906 
   2907 
   2908    PIXMAN_IMAGE_GET_LINE (
   2909 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   2910    PIXMAN_IMAGE_GET_LINE (
   2911 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   2912 
   2913    while (height--)
   2914    {
   2915 dst = dst_line;
   2916 dst_line += dst_stride;
   2917 src = src_line;
   2918 src_line += src_stride;
   2919 w = width;
   2920 
   2921 while (w && (uintptr_t)dst & 15)
   2922 {
   2923     *dst++ = *src++ | 0xff000000;
   2924     w--;
   2925 }
   2926 
   2927 while (w >= 16)
   2928 {
   2929     __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
   2930     
   2931     xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
   2932     xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
   2933     xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
   2934     xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
   2935     
   2936     save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
   2937     save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
   2938     save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
   2939     save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
   2940     
   2941     dst += 16;
   2942     src += 16;
   2943     w -= 16;
   2944 }
   2945 
   2946 while (w)
   2947 {
   2948     *dst++ = *src++ | 0xff000000;
   2949     w--;
   2950 }
   2951    }
   2952 
   2953 }
   2954 
/* OVER of an x8r8g8b8 source (alpha forced to 0xff), attenuated by the
 * alpha of a solid mask, onto an a8r8g8b8 destination.  Because the
 * source is made opaque, its expanded alpha is the constant mask_00ff.
 */
static void
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t    *dst_line, *dst;
   uint32_t    *src_line, *src;
   uint32_t mask;
   int dst_stride, src_stride;
   int32_t w;

   __m128i xmm_mask, xmm_alpha;
   __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

   mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

   /* Broadcast the mask's alpha byte; source alpha is constant 0xff. */
   xmm_mask = create_mask_16_128 (mask >> 24);
   xmm_alpha = mask_00ff;

   while (height--)
   {
dst = dst_line;
dst_line += dst_stride;
src = src_line;
src_line += src_stride;
w = width;

/* Head: scalar pixels until dst reaches 16-byte alignment. */
while (w && (uintptr_t)dst & 15)
{
    uint32_t s = (*src++) | 0xff000000;
    uint32_t d = *dst;

    __m128i src   = unpack_32_1x128 (s);
    __m128i alpha = xmm_alpha;
    __m128i mask  = xmm_mask;
    __m128i dest  = unpack_32_1x128 (d);

    *dst++ = pack_1x128_32 (
	in_over_1x128 (&src, &alpha, &mask, &dest));

    w--;
}

/* Body: four pixels per iteration. */
while (w >= 4)
{
    xmm_src = _mm_or_si128 (
	load_128_unaligned ((__m128i*)src), mask_ff000000);
    xmm_dst = load_128_aligned ((__m128i*)dst);

    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
		   &xmm_alpha, &xmm_alpha,
		   &xmm_mask, &xmm_mask,
		   &xmm_dst_lo, &xmm_dst_hi);

    save_128_aligned (
	(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

    dst += 4;
    src += 4;
    w -= 4;

}

/* Tail: remaining 1-3 pixels. */
while (w)
{
    uint32_t s = (*src++) | 0xff000000;
    uint32_t d = *dst;

    __m128i src  = unpack_32_1x128 (s);
    __m128i alpha = xmm_alpha;
    __m128i mask  = xmm_mask;
    __m128i dest  = unpack_32_1x128 (d);

    *dst++ = pack_1x128_32 (
	in_over_1x128 (&src, &alpha, &mask, &dest));

    w--;
}
   }

}
   3045 
   3046 static void
   3047 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
   3048                               pixman_composite_info_t *info)
   3049 {
   3050    PIXMAN_COMPOSITE_ARGS (info);
   3051    int dst_stride, src_stride;
   3052    uint32_t    *dst_line, *dst;
   3053    uint32_t    *src_line, *src;
   3054 
   3055    PIXMAN_IMAGE_GET_LINE (
   3056 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   3057    PIXMAN_IMAGE_GET_LINE (
   3058 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   3059 
   3060    dst = dst_line;
   3061    src = src_line;
   3062 
   3063    while (height--)
   3064    {
   3065 sse2_combine_over_u (imp, op, dst, src, NULL, width);
   3066 
   3067 dst += dst_stride;
   3068 src += src_stride;
   3069    }
   3070 }
   3071 
   3072 static force_inline uint16_t
   3073 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
   3074 {
   3075    __m128i ms;
   3076 
   3077    ms = unpack_32_1x128 (src);
   3078    return pack_565_32_16 (
   3079 pack_1x128_32 (
   3080     over_1x128 (
   3081 	ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
   3082 }
   3083 
/* OVER-composite an a8r8g8b8 source image onto an r5g6b5 destination.
 * Scalar head/tail loops handle the unaligned edges; the main loop
 * processes 8 destination pixels (one aligned 16-byte 565 load) per
 * iteration, as two 4-pixel rounds over the 8888 source.
 */
static void
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	src = src_line;

	dst_line += dst_stride;
	src_line += src_stride;
	w = width;

	/* Align dst on a 16-byte boundary */
	while (w &&
	       ((uintptr_t)dst & 15))
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = composite_over_8888_0565pixel (s, d);
	    w--;
	}

	/* It's a 8 pixel loop */
	while (w >= 8)
	{
	    /* The source is loaded unaligned: only dst alignment is
	     * guaranteed by the head loop above.
	     */
	    xmm_src = load_128_unaligned ((__m128i*) src);
	    xmm_dst = load_128_aligned ((__m128i*) dst);

	    /* Unpacking: 8 x 565 pixels expand to four 2-pixel
	     * registers of 16-bit channels.
	     */
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    /* Preload the next 4 source pixels before the blend to
	     * overlap the memory read with computation.
	     */
	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst0, &xmm_dst1);

	    /* Unpacking the second 4-pixel round */
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst2, &xmm_dst3);

	    /* Repack all 8 results to 565 and store in one aligned write */
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	    src += 8;
	}

	/* Scalar tail for the final w < 8 pixels */
	while (w--)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = composite_over_8888_0565pixel (s, d);
	}
    }

}
   3176 
/* OVER-composite a solid color through an a8 mask onto an a8r8g8b8
 * destination.  The solid source and its alpha are expanded once
 * outside the loops; per pixel only the mask is expanded.  Zero mask
 * bytes leave the destination untouched, and a fully-opaque source
 * with a fully-set 4-byte mask is stored directly (no blend).
 */
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t d;

    __m128i xmm_src, xmm_alpha, xmm_def;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* xmm_def holds four copies of the raw source pixel for the
     * opaque fast path; xmm_src/xmm_alpha are the unpacked forms
     * used by the blend.
     */
    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_pixel_8_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
		                                     &mmx_alpha,
		                                     &mmx_mask,
		                                     &mmx_dest));
	    }

	    w--;
	    dst++;
	}

	/* 4 pixels per iteration; the mask is read as one 32-bit word
	 * via memcpy to avoid unaligned/aliasing issues.
	 */
	while (w >= 4)
	{
	    uint32_t m;
	    memcpy(&m, mask, sizeof(uint32_t));

	    if (srca == 0xff && m == 0xffffffff)
	    {
		/* Opaque source, full mask: plain store */
		save_128_aligned ((__m128i*)dst, xmm_def);
	    }
	    else if (m)
	    {
		xmm_dst = load_128_aligned ((__m128i*) dst);
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		/* Replicate each mask byte across its pixel's channels */
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    /* m == 0: destination is left unchanged */

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Scalar tail for the final w < 4 pixels */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_pixel_8_1x128 (m);
		mmx_dest = unpack_32_1x128 (d);

		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
		                                     &mmx_alpha,
		                                     &mmx_mask,
		                                     &mmx_dest));
	    }

	    w--;
	    dst++;
	}
    }

}
   3298 
/* On 32-bit GCC targets the incoming stack may only be 4-byte aligned;
 * force 16-byte alignment so the SSE2 spills below are safe.
 */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
/* Fill a width x height rectangle of an 8/16/32 bpp image with a solid
 * value, using 16-byte aligned SSE2 stores for the bulk of each row.
 * `stride` is in uint32_t units on entry and is converted to bytes per
 * bpp.  Returns FALSE for any other bpp so the caller can fall back.
 */
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t		    filler)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    if (bpp == 8)
    {
	uint32_t b;
	uint32_t w;

	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;

	/* Replicate the low filler byte into all four bytes */
	b = filler & 0xff;
	w = (b << 8) | b;
	filler = (w << 16) | w;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;

	/* Replicate the low filler halfword into both halfwords */
	filler = (filler & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }
    else
    {
	return FALSE;
    }

    xmm_def = create_mask_2x32_128 (filler, filler);

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;
	byte_line += stride;
	w = byte_width;

	/* Alignment ladder: emit 1-, 2- and 4-byte stores until d is
	 * 16-byte aligned (filler is byte/halfword replicated, so the
	 * narrow stores write the correct pattern).
	 */
	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = filler;
	    w -= 1;
	    d += 1;
	}

	while (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 15))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	/* Bulk body: progressively smaller runs of aligned 16-byte
	 * stores (128, 64, 32, 16 bytes per iteration/step).
	 */
	while (w >= 128)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 112), xmm_def);

	    d += 128;
	    w -= 128;
	}

	if (w >= 64)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);

	    d += 64;
	    w -= 64;
	}

	if (w >= 32)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);

	    d += 32;
	    w -= 32;
	}

	if (w >= 16)
	{
	    save_128_aligned ((__m128i*)(d),     xmm_def);

	    d += 16;
	    w -= 16;
	}

	/* Tail ladder: mirror of the head, down to a single byte */
	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	if (w >= 1)
	{
	    *(uint8_t *)d = filler;
	    w -= 1;
	    d += 1;
	}
    }

    return TRUE;
}
   3452 
/* SRC-composite a solid color through an a8 mask onto an a8r8g8b8
 * destination: dst = src * mask.  A zero source degenerates to a
 * solid fill with zero.  Unlike the OVER variant, a zero mask byte
 * writes 0 to the destination rather than leaving it unchanged.
 */
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;

    __m128i xmm_src, xmm_def;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
	/* SRC with a zero source is a plain clear of the rectangle */
	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
		   dest_x, dest_y, width, height, 0);
	return;
    }

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* xmm_def: four raw copies of src for the full-mask fast path;
     * xmm_src: unpacked 16-bit channels for the multiply.
     */
    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		*dst = pack_1x128_32 (
		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    dst++;
	}

	/* 4 pixels per iteration; mask read as one 32-bit word */
	while (w >= 4)
	{
	    uint32_t m;
	    memcpy(&m, mask, sizeof(uint32_t));

	    if (srca == 0xff && m == 0xffffffff)
	    {
		/* Opaque source, full mask: store src directly */
		save_128_aligned ((__m128i*)dst, xmm_def);
	    }
	    else if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		/* Replicate each mask byte across its pixel's channels */
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
	    }
	    else
	    {
		/* All four mask bytes zero: SRC semantics write zero */
		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
	    }

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Scalar tail for the final w < 4 pixels */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		*dst = pack_1x128_32 (
		    pix_multiply_1x128 (
			xmm_src, expand_pixel_8_1x128 (m)));
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    dst++;
	}
    }

}
   3570 
/* OVER-composite a solid color through an a8 mask onto an r5g6b5
 * destination.  The main loop handles 8 destination pixels (one
 * aligned 16-byte 565 load) per iteration as two 4-byte mask words;
 * an all-zero mask word skips the blend for its half.
 */
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* Expand the solid source and its alpha once, outside the loops */
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		mmx_dest = expand565_16_1x128 (d);

		*dst = pack_565_32_16 (
		    pack_1x128_32 (
			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	}

	/* 8 pixels per iteration: two 4-pixel mask words per 565 store */
	while (w >= 8)
	{
	    uint32_t m;

	    xmm_dst = load_128_aligned ((__m128i*) dst);
	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    /* First 4-pixel mask word */
	    memcpy(&m, mask, sizeof(uint32_t));
	    mask += 4;

	    if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst0, &xmm_dst1);
	    }

	    /* Second 4-pixel mask word */
	    memcpy(&m, mask, sizeof(uint32_t));
	    mask += 4;

	    if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst2, &xmm_dst3);
	    }

	    /* Repack all 8 results to 565 in one aligned store */
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	}

	/* Scalar tail for the final w < 8 pixels */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
		mmx_dest = expand565_16_1x128 (d);

		*dst = pack_565_32_16 (
		    pack_1x128_32 (
			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	}
    }

}
   3707 
/* OVER-composite a non-premultiplied ("pixbuf") a8r8g8b8 source onto
 * an r5g6b5 destination.  Each 4-pixel group is tested for the two
 * common extremes: fully opaque (just swap R/B, no blend) and fully
 * transparent (leave destination alone); otherwise the full
 * reverse-non-premultiplied OVER is applied.
 */
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i ms;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    s = *src++;
	    d = *dst;

	    ms = unpack_32_1x128 (s);

	    *dst++ = pack_565_32_16 (
		pack_1x128_32 (
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
	    w--;
	}

	/* 8 pixels per iteration, as two 4-pixel rounds */
	while (w >= 8)
	{
	    /* First round */
	    xmm_src = load_128_unaligned ((__m128i*)src);
	    xmm_dst = load_128_aligned  ((__m128i*)dst);

	    opaque = is_opaque (xmm_src);
	    zero = is_zero (xmm_src);

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

	    /* preload next round*/
	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));

	    if (opaque)
	    {
		/* Alpha is 0xff everywhere: only the R<->B swap is needed */
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst0, &xmm_dst1);
	    }
	    else if (!zero)
	    {
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst0, &xmm_dst1);
	    }
	    /* zero: destination halves keep their unpacked values */

	    /* Second round */
	    opaque = is_opaque (xmm_src);
	    zero = is_zero (xmm_src);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

	    if (opaque)
	    {
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst2, &xmm_dst3);
	    }
	    else if (!zero)
	    {
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst2, &xmm_dst3);
	    }

	    /* Repack all 8 results to 565 in one aligned store */
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    src += 8;
	    dst += 8;
	}

	/* Scalar tail for the final w < 8 pixels */
	while (w)
	{
	    s = *src++;
	    d = *dst;

	    ms = unpack_32_1x128 (s);

	    *dst++ = pack_565_32_16 (
		pack_1x128_32 (
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
	    w--;
	}
    }

}
   3817 
/* OVER-composite a non-premultiplied ("pixbuf") a8r8g8b8 source onto
 * an a8r8g8b8 destination.  Per 4-pixel group: fully-opaque source
 * needs only the R<->B channel swap (destination not even loaded),
 * fully-transparent source is skipped entirely, and everything else
 * goes through the reverse-non-premultiplied OVER.
 */
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && (uintptr_t)dst & 15)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = pack_1x128_32 (
		over_rev_non_pre_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));

	    w--;
	}

	/* 4 pixels per iteration */
	while (w >= 4)
	{
	    xmm_src_hi = load_128_unaligned ((__m128i*)src);

	    opaque = is_opaque (xmm_src_hi);
	    zero = is_zero (xmm_src_hi);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	    if (opaque)
	    {
		/* Alpha is 0xff everywhere: swap R/B and store, no blend */
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    else if (!zero)
	    {
		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);

		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    /* zero: all four source pixels transparent, dst untouched */

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	/* Scalar tail for the final w < 4 pixels */
	while (w)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = pack_1x128_32 (
		over_rev_non_pre_1x128 (
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));

	    w--;
	}
    }

}
   3906 
/* OVER-composite a solid color through a component-alpha a8r8g8b8
 * mask onto an r5g6b5 destination.  The main loop handles 8 pixels
 * per iteration; _mm_movemask_epi8 over a compare-to-zero gives
 * 0xffff when all four mask pixels of a half are zero, letting that
 * half skip the blend.
 */
static void
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    uint32_t    *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int w;
    uint32_t pack_cmp;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* Expand the solid source and its alpha once, outside the loops */
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
	w = width;
	mask = mask_line;
	dst = dst_line;
	mask_line += mask_stride;
	dst_line += dst_stride;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && ((uintptr_t)dst & 15))
	{
	    m = *(uint32_t *) mask;

	    if (m)
	    {
		d = *dst;
		mmx_mask = unpack_32_1x128 (m);
		mmx_dest = expand565_16_1x128 (d);

		*dst = pack_565_32_16 (
		    pack_1x128_32 (
			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	    mask++;
	}

	/* 8 pixels per iteration, as two 4-pixel rounds */
	while (w >= 8)
	{
	    /* First round */
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    /* 0xffff <=> all 16 bytes of the mask compare equal to zero */
	    pack_cmp = _mm_movemask_epi8 (
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

	    /* preload next round */
	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));

	    /* Skip the blend when this half's mask is entirely zero */
	    if (pack_cmp != 0xffff)
	    {
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst0, &xmm_dst1);
	    }

	    /* Second round */
	    pack_cmp = _mm_movemask_epi8 (
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

	    if (pack_cmp != 0xffff)
	    {
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst2, &xmm_dst3);
	    }

	    /* Repack all 8 results to 565 in one aligned store */
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	    mask += 8;
	}

	/* Scalar tail for the final w < 8 pixels */
	while (w)
	{
	    m = *(uint32_t *) mask;

	    if (m)
	    {
		d = *dst;
		mmx_mask = unpack_32_1x128 (m);
		mmx_dest = expand565_16_1x128 (d);

		*dst = pack_565_32_16 (
		    pack_1x128_32 (
			in_over_1x128 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	    mask++;
	}
    }

}
   4040 
/* IN-composite: dst = src_alpha * mask * dst, for a solid source, an
 * a8 mask and an a8 destination.  The source's alpha is expanded once;
 * the main loop multiplies 16 mask/destination bytes per iteration.
 */
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t d;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Only the source's alpha channel participates in IN on a8 */
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && ((uintptr_t)dst & 15))
	{
	    uint8_t m = *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    pix_multiply_1x128 (xmm_alpha,
					unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));
	    w--;
	}

	/* 16 a8 pixels per iteration */
	while (w >= 16)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    /* mask := alpha * mask, then dst := mask * dst */
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_mask_lo, &xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    mask += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail for the final w < 16 pixels */
	while (w)
	{
	    uint8_t m = *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    pix_multiply_1x128 (
			xmm_alpha, unpack_32_1x128 (m)),
		    unpack_32_1x128 (d)));
	    w--;
	}
    }

}
   4126 
/* Unmasked IN-composite onto an a8 destination: dst = src_alpha * dst.
 * Source alpha 0xff is an identity (nothing to do); alpha 0x00 clears
 * the rectangle via pixman_fill; otherwise every destination byte is
 * scaled by the alpha, 16 bytes per main-loop iteration.
 */
static void
sse2_composite_in_n_8 (pixman_implementation_t *imp,
		       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    int dst_stride;
    uint32_t d;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    /* From here on, src holds only the alpha byte */
    src = src >> 24;

    if (src == 0xff)
	return;

    if (src == 0x00)
    {
	/* src is 0 here, so this fills the destination with zero */
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
		     8, dest_x, dest_y, width, height, src);

	return;
    }

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Scalar head loop until dst is 16-byte aligned */
	while (w && ((uintptr_t)dst & 15))
	{
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    xmm_alpha,
		    unpack_32_1x128 (d)));
	    w--;
	}

	/* 16 a8 pixels per iteration */
	while (w >= 16)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail for the final w < 16 pixels */
	while (w)
	{
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x128_32 (
		pix_multiply_1x128 (
		    xmm_alpha,
		    unpack_32_1x128 (d)));
	    w--;
	}
    }

}
   4208 
/* IN operator, a8 source over an a8 destination:
 * dst = src * dst (per-pixel multiply of two 8-bit alpha values).
 * Source loads are unaligned; destination stores require 16-byte
 * alignment, established by the head loop.
 */
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint8_t     *dst_line, *dst;
   uint8_t     *src_line, *src;
   int src_stride, dst_stride;
   int32_t w;
   uint32_t s, d;

   __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

   while (height--)
   {
dst = dst_line;
dst_line += dst_stride;
src = src_line;
src_line += src_stride;
w = width;

/* Head: single pixels until dst is 16-byte aligned. */
while (w && ((uintptr_t)dst & 15))
{
    s = (uint32_t) *src++;
    d = (uint32_t) *dst;

    *dst++ = (uint8_t) pack_1x128_32 (
	pix_multiply_1x128 (
	    unpack_32_1x128 (s), unpack_32_1x128 (d)));
    w--;
}

/* Main loop: 16 a8 pixels per iteration. */
while (w >= 16)
{
    xmm_src = load_128_unaligned ((__m128i*)src);
    xmm_dst = load_128_aligned ((__m128i*)dst);

    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_dst_lo, &xmm_dst_hi,
			&xmm_dst_lo, &xmm_dst_hi);

    save_128_aligned (
	(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

    src += 16;
    dst += 16;
    w -= 16;
}

/* Tail: remaining pixels one at a time. */
while (w)
{
    s = (uint32_t) *src++;
    d = (uint32_t) *dst;

    *dst++ = (uint8_t) pack_1x128_32 (
	pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
    w--;
}
   }

}
   4279 
   4280 static void
   4281 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
   4282 		  pixman_composite_info_t *info)
   4283 {
   4284    PIXMAN_COMPOSITE_ARGS (info);
   4285    uint8_t     *dst_line, *dst;
   4286    uint8_t     *mask_line, *mask;
   4287    int dst_stride, mask_stride;
   4288    int32_t w;
   4289    uint32_t src;
   4290    uint32_t d;
   4291 
   4292    __m128i xmm_alpha;
   4293    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
   4294    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   4295 
   4296    PIXMAN_IMAGE_GET_LINE (
   4297 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
   4298    PIXMAN_IMAGE_GET_LINE (
   4299 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   4300 
   4301    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   4302 
   4303    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
   4304 
   4305    while (height--)
   4306    {
   4307 dst = dst_line;
   4308 dst_line += dst_stride;
   4309 mask = mask_line;
   4310 mask_line += mask_stride;
   4311 w = width;
   4312 
   4313 while (w && ((uintptr_t)dst & 15))
   4314 {
   4315     uint8_t m = *mask++;
   4316     d = (uint32_t) *dst;
   4317 
   4318     *dst++ = (uint8_t) pack_1x128_32 (
   4319 	_mm_adds_epu16 (
   4320 	    pix_multiply_1x128 (
   4321 		xmm_alpha, unpack_32_1x128 (m)),
   4322 	    unpack_32_1x128 (d)));
   4323     w--;
   4324 }
   4325 
   4326 while (w >= 16)
   4327 {
   4328     xmm_mask = load_128_unaligned ((__m128i*)mask);
   4329     xmm_dst = load_128_aligned ((__m128i*)dst);
   4330 
   4331     unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
   4332     unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   4333 
   4334     pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
   4335 			&xmm_mask_lo, &xmm_mask_hi,
   4336 			&xmm_mask_lo, &xmm_mask_hi);
   4337 
   4338     xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
   4339     xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
   4340 
   4341     save_128_aligned (
   4342 	(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   4343 
   4344     mask += 16;
   4345     dst += 16;
   4346     w -= 16;
   4347 }
   4348 
   4349 while (w)
   4350 {
   4351     uint8_t m = (uint32_t) *mask++;
   4352     d = (uint32_t) *dst;
   4353 
   4354     *dst++ = (uint8_t) pack_1x128_32 (
   4355 	_mm_adds_epu16 (
   4356 	    pix_multiply_1x128 (
   4357 		xmm_alpha, unpack_32_1x128 (m)),
   4358 	    unpack_32_1x128 (d)));
   4359 
   4360     w--;
   4361 }
   4362    }
   4363 
   4364 }
   4365 
/* ADD operator, solid source onto an a8 destination:
 * dst = clamp (src.alpha + dst), using byte-wise saturating adds.
 * Fast paths: src.alpha == 0 is a no-op; src.alpha == 0xff saturates
 * every pixel, so the rectangle is simply filled with 0xff.
 */
static void
sse2_composite_add_n_8 (pixman_implementation_t *imp,
		pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint8_t     *dst_line, *dst;
   int dst_stride;
   int32_t w;
   uint32_t src;

   __m128i xmm_src;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

   src >>= 24;

   /* Adding zero changes nothing. */
   if (src == 0x00)
return;

   /* Adding 0xff saturates every byte to 0xff. */
   if (src == 0xff)
   {
pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
	     8, dest_x, dest_y, width, height, 0xff);

return;
   }

   /* Replicate the alpha byte across all 16 byte lanes. */
   src = (src << 24) | (src << 16) | (src << 8) | src;
   xmm_src = _mm_set_epi32 (src, src, src, src);

   while (height--)
   {
dst = dst_line;
dst_line += dst_stride;
w = width;

/* Head: single pixels until dst is 16-byte aligned. */
while (w && ((uintptr_t)dst & 15))
{
    *dst = (uint8_t)_mm_cvtsi128_si32 (
	_mm_adds_epu8 (
	    xmm_src,
	    _mm_cvtsi32_si128 (*dst)));

    w--;
    dst++;
}

/* Main loop: 16 bytes per iteration with a saturating byte add. */
while (w >= 16)
{
    save_128_aligned (
	(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));

    dst += 16;
    w -= 16;
}

/* Tail: remaining pixels one at a time. */
while (w)
{
    *dst = (uint8_t)_mm_cvtsi128_si32 (
	_mm_adds_epu8 (
	    xmm_src,
	    _mm_cvtsi32_si128 (*dst)));

    w--;
    dst++;
}
   }

}
   4438 
/* ADD operator, a8 source onto an a8 destination:
 * dst = clamp (src + dst).  The bulk of each scanline is handed to
 * sse2_combine_add_u four bytes (one uint32_t) at a time; head/tail
 * bytes use a branch-free scalar saturation trick.
 */
static void
sse2_composite_add_8_8 (pixman_implementation_t *imp,
		pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint8_t     *dst_line, *dst;
   uint8_t     *src_line, *src;
   int dst_stride, src_stride;
   int32_t w;
   uint16_t t;

   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

   while (height--)
   {
dst = dst_line;
src = src_line;

dst_line += dst_stride;
src_line += src_stride;
w = width;

/* Small head */
while (w && (uintptr_t)dst & 3)
{
    /* t | (0 - (t >> 8)) == 0xff whenever the 16-bit sum
     * overflowed a byte, i.e. a branch-free clamp to 255. */
    t = (*dst) + (*src++);
    *dst++ = t | (0 - (t >> 8));
    w--;
}

/* Combine w/4 whole uint32_t words with the SSE2 ADD combiner. */
sse2_combine_add_u (imp, op,
		    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

/* Small tail */
/* NOTE(review): the 0xfffc mask only keeps the low 16 bits of w;
 * assumes scanline widths stay below 0x10000 — appears to hold for
 * pixman image sizes, but confirm before reusing elsewhere. */
dst += w & 0xfffc;
src += w & 0xfffc;

w &= 3;

while (w)
{
    t = (*dst) + (*src++);
    *dst++ = t | (0 - (t >> 8));
    w--;
}
   }

}
   4490 
   4491 static void
   4492 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
   4493                              pixman_composite_info_t *info)
   4494 {
   4495    PIXMAN_COMPOSITE_ARGS (info);
   4496    uint32_t    *dst_line, *dst;
   4497    uint32_t    *src_line, *src;
   4498    int dst_stride, src_stride;
   4499 
   4500    PIXMAN_IMAGE_GET_LINE (
   4501 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
   4502    PIXMAN_IMAGE_GET_LINE (
   4503 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   4504 
   4505    while (height--)
   4506    {
   4507 dst = dst_line;
   4508 dst_line += dst_stride;
   4509 src = src_line;
   4510 src_line += src_stride;
   4511 
   4512 sse2_combine_add_u (imp, op, dst, src, NULL, width);
   4513    }
   4514 }
   4515 
/* ADD operator, solid source onto an a8r8g8b8 destination:
 * dst = clamp (src + dst) per channel, via saturating byte adds.
 * Fast paths: src == 0 is a no-op; src == ~0 saturates everything,
 * so the rectangle is filled with ~0 directly.
 */
static void
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
		   pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t *dst_line, *dst, src;
   int dst_stride;

   __m128i xmm_src;

   PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   if (src == 0)
return;

   if (src == ~0)
   {
pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
	     dest_x, dest_y, width, height, ~0);

return;
   }

   /* Broadcast the solid pixel into all four 32-bit lanes. */
   xmm_src = _mm_set_epi32 (src, src, src, src);
   while (height--)
   {
int w = width;
uint32_t d;

dst = dst_line;
dst_line += dst_stride;

/* Head: single pixels until dst is 16-byte aligned. */
while (w && (uintptr_t)dst & 15)
{
    d = *dst;
    *dst++ =
	_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
    w--;
}

/* Main loop: four pixels per iteration. */
while (w >= 4)
{
    save_128_aligned
	((__m128i*)dst,
	 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

    dst += 4;
    w -= 4;
}

/* Tail: remaining pixels one at a time. */
while (w--)
{
    d = *dst;
    *dst++ =
	_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
					  _mm_cvtsi32_si128 (d)));
}
   }
}
   4576 
/* ADD operator, solid source with an a8 mask onto an a8r8g8b8
 * destination: dst = clamp (src * mask + dst).  Four mask bytes are
 * fetched at once so wholly-zero mask groups can be skipped.
 */
static void
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
		     pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t     *dst_line, *dst;
   uint8_t     *mask_line, *mask;
   int dst_stride, mask_stride;
   int32_t w;
   uint32_t src;

   __m128i xmm_src;

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   /* Adding zero changes nothing. */
   if (src == 0)
return;
   xmm_src = expand_pixel_32_1x128 (src);

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

   while (height--)
   {
dst = dst_line;
dst_line += dst_stride;
mask = mask_line;
mask_line += mask_stride;
w = width;

/* Head: single pixels until dst is 16-byte aligned. */
while (w && ((uintptr_t)dst & 15))
{
    uint8_t m = *mask++;
    if (m)
    {
	*dst = pack_1x128_32
	    (_mm_adds_epu16
	     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
	      unpack_32_1x128 (*dst)));
    }
    dst++;
    w--;
}

/* Main loop: four pixels per iteration; m == 0 groups skipped. */
while (w >= 4)
{
    uint32_t m;
           memcpy(&m, mask, sizeof(uint32_t));

    if (m)
    {
	__m128i xmm_mask_lo, xmm_mask_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
	__m128i xmm_mask =
	    _mm_unpacklo_epi8 (unpack_32_1x128(m),
			       _mm_setzero_si128 ());

	unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	/* Spread each mask byte across all four channels. */
	expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_src, &xmm_src,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	/* Saturating adds keep each channel clamped at 0xff. */
	xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
	xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
    }

    w -= 4;
    dst += 4;
    mask += 4;
}

/* Tail: remaining pixels one at a time. */
while (w)
{
    uint8_t m = *mask++;
    if (m)
    {
	*dst = pack_1x128_32
	    (_mm_adds_epu16
	     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
	      unpack_32_1x128 (*dst)));
    }
    dst++;
    w--;
}
   }
}
   4674 
/* Rectangle blit between two images of equal bpp (16 or 32 only).
 * Strides come in as uint32_t units and are converted to pixel units,
 * then to bytes.  Each scanline is copied with widening alignment
 * steps (2-byte, 4-byte) up to 64-byte SSE2 chunks, then narrowing
 * tail steps.  Returns FALSE for unsupported bpp combinations.
 * NOTE(review): rows are copied front-to-back with unaligned loads /
 * aligned stores; presumably callers never pass overlapping rects
 * where that order would corrupt data — confirm against call sites.
 */
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
   uint8_t *   src_bytes;
   uint8_t *   dst_bytes;
   int byte_width;

   if (src_bpp != dst_bpp)
return FALSE;

   if (src_bpp == 16)
   {
/* Convert strides from uint32_t units to uint16_t pixel units. */
src_stride = src_stride * (int) sizeof (uint32_t) / 2;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
byte_width = 2 * width;
src_stride *= 2;
dst_stride *= 2;
   }
   else if (src_bpp == 32)
   {
/* Convert strides from uint32_t units to uint32_t pixel units. */
src_stride = src_stride * (int) sizeof (uint32_t) / 4;
dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
byte_width = 4 * width;
src_stride *= 4;
dst_stride *= 4;
   }
   else
   {
return FALSE;
   }

   while (height--)
   {
int w;
uint8_t *s = src_bytes;
uint8_t *d = dst_bytes;
src_bytes += src_stride;
dst_bytes += dst_stride;
w = byte_width;

/* Align destination to 4 bytes, two bytes at a time. */
while (w >= 2 && ((uintptr_t)d & 3))
{
           memmove(d, s, 2);
    w -= 2;
    s += 2;
    d += 2;
}

/* Align destination to 16 bytes, four bytes at a time. */
while (w >= 4 && ((uintptr_t)d & 15))
{
           memmove(d, s, 4);

    w -= 4;
    s += 4;
    d += 4;
}

/* Main loop: 64 bytes (four XMM registers) per iteration. */
while (w >= 64)
{
    __m128i xmm0, xmm1, xmm2, xmm3;

    xmm0 = load_128_unaligned ((__m128i*)(s));
    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
    xmm3 = load_128_unaligned ((__m128i*)(s + 48));

    save_128_aligned ((__m128i*)(d),    xmm0);
    save_128_aligned ((__m128i*)(d + 16), xmm1);
    save_128_aligned ((__m128i*)(d + 32), xmm2);
    save_128_aligned ((__m128i*)(d + 48), xmm3);

    s += 64;
    d += 64;
    w -= 64;
}

/* Remaining 16-byte chunks. */
while (w >= 16)
{
    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );

    w -= 16;
    d += 16;
    s += 16;
}

/* Tail: 4-byte then 2-byte leftovers. */
while (w >= 4)
{
           memmove(d, s, 4);

    w -= 4;
    s += 4;
    d += 4;
}

if (w >= 2)
{
           memmove(d, s, 2);
    w -= 2;
    s += 2;
    d += 2;
}
   }

   return TRUE;
}
   4796 
   4797 static void
   4798 sse2_composite_copy_area (pixman_implementation_t *imp,
   4799                          pixman_composite_info_t *info)
   4800 {
   4801    PIXMAN_COMPOSITE_ARGS (info);
   4802    sse2_blt (imp, src_image->bits.bits,
   4803       dest_image->bits.bits,
   4804       src_image->bits.rowstride,
   4805       dest_image->bits.rowstride,
   4806       PIXMAN_FORMAT_BPP (src_image->bits.format),
   4807       PIXMAN_FORMAT_BPP (dest_image->bits.format),
   4808       src_x, src_y, dest_x, dest_y, width, height);
   4809 }
   4810 
/* OVER operator, x8r8g8b8 source with an a8 mask onto an a8r8g8b8
 * destination.  The source alpha byte is forced to 0xff (x888 has no
 * alpha), so 'over' degenerates to a mask-weighted blend: a 0xff mask
 * stores the source directly, otherwise in_over is applied.
 */
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t    *src, *src_line, s;
   uint32_t    *dst, *dst_line, d;
   uint8_t         *mask, *mask_line;
   int src_stride, mask_stride, dst_stride;
   int32_t w;
   __m128i ms;

   __m128i xmm_src, xmm_src_lo, xmm_src_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

   while (height--)
   {
       src = src_line;
       src_line += src_stride;
       dst = dst_line;
       dst_line += dst_stride;
       mask = mask_line;
       mask_line += mask_stride;

       w = width;

       /* Head: single pixels until dst is 16-byte aligned. */
       while (w && (uintptr_t)dst & 15)
       {
           uint8_t m = *mask++;
           /* Force the x888 pixel opaque. */
           s = 0xff000000 | *src++;
           d = *dst;
           ms = unpack_32_1x128 (s);

           if (m != 0xff)
           {
	__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
	__m128i md = unpack_32_1x128 (d);

               ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
           }

           *dst++ = pack_1x128_32 (ms);
           w--;
       }

       /* Main loop: four pixels per iteration. */
       while (w >= 4)
       {
           uint32_t m;
           memcpy(&m, mask, sizeof(uint32_t));
           xmm_src = _mm_or_si128 (
	load_128_unaligned ((__m128i*)src), mask_ff000000);

           /* Fully-opaque mask group: plain copy. */
           if (m == 0xffffffff)
           {
               save_128_aligned ((__m128i*)dst, xmm_src);
           }
           else
           {
               xmm_dst = load_128_aligned ((__m128i*)dst);

               xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

               expand_alpha_rev_2x128 (
	    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

               in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
		       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
		       &xmm_dst_lo, &xmm_dst_hi);

               save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
           }

           src += 4;
           dst += 4;
           mask += 4;
           w -= 4;
       }

       /* Tail: remaining pixels one at a time. */
       while (w)
       {
           uint8_t m = *mask++;

           if (m)
           {
               s = 0xff000000 | *src;

               if (m == 0xff)
               {
                   *dst = s;
               }
               else
               {
	    __m128i ma, md, ms;

                   d = *dst;

	    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
	    md = unpack_32_1x128 (d);
	    ms = unpack_32_1x128 (s);

                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
               }

           }

           src++;
           dst++;
           w--;
       }
   }

}
   4935 
/* OVER operator, a8r8g8b8 source with an a8 mask onto an a8r8g8b8
 * destination: dst = in_over (src, src.alpha, mask, dst).  Skips
 * zero-mask pixels entirely, and stores the source directly when both
 * the mask and the source pixel (group) are fully opaque.
 */
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t    *src, *src_line, s;
   uint32_t    *dst, *dst_line, d;
   uint8_t         *mask, *mask_line;
   int src_stride, mask_stride, dst_stride;
   int32_t w;

   __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

   while (height--)
   {
       src = src_line;
       src_line += src_stride;
       dst = dst_line;
       dst_line += dst_stride;
       mask = mask_line;
       mask_line += mask_stride;

       w = width;

       /* Head: single pixels until dst is 16-byte aligned. */
       while (w && (uintptr_t)dst & 15)
       {
    uint32_t sa;
           uint8_t m = *mask++;

           s = *src++;
           d = *dst;

    sa = s >> 24;

    if (m)
    {
	/* Opaque source and opaque mask: straight copy. */
	if (sa == 0xff && m == 0xff)
	{
	    *dst = s;
	}
	else
	{
	    __m128i ms, md, ma, msa;

	    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
	    ms = unpack_32_1x128 (s);
	    md = unpack_32_1x128 (d);

	    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

	    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	}
    }

    dst++;
           w--;
       }

       /* Main loop: four pixels per iteration; zero-mask groups skipped. */
       while (w >= 4)
       {
           uint32_t m;
           memcpy(&m, mask, sizeof(uint32_t));

    if (m)
    {
	xmm_src = load_128_unaligned ((__m128i*)src);

	if (m == 0xffffffff && is_opaque (xmm_src))
	{
	    save_128_aligned ((__m128i *)dst, xmm_src);
	}
	else
	{
	    xmm_dst = load_128_aligned ((__m128i *)dst);

	    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
	    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
			   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}
    }

           src += 4;
           dst += 4;
           mask += 4;
           w -= 4;
       }

       /* Tail: remaining pixels one at a time. */
       while (w)
       {
    uint32_t sa;
           uint8_t m = *mask++;

           s = *src++;
           d = *dst;

    sa = s >> 24;

    if (m)
    {
	if (sa == 0xff && m == 0xff)
	{
	    *dst = s;
	}
	else
	{
	    __m128i ms, md, ma, msa;

	    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
	    ms = unpack_32_1x128 (s);
	    md = unpack_32_1x128 (d);

	    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

	    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	}
    }

    dst++;
           w--;
       }
   }

}
   5078 
/* OVER_REVERSE operator with a solid source on an a8r8g8b8
 * destination: dst = dst + (1 - dst.alpha) * src, i.e. the existing
 * destination is composited over the solid color.  src == 0 is a
 * no-op.
 */
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
			    pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t src;
   uint32_t    *dst_line, *dst;
   __m128i xmm_src;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_dsta_hi, xmm_dsta_lo;
   int dst_stride;
   int32_t w;

   src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

   /* Compositing dst over a transparent color leaves dst unchanged. */
   if (src == 0)
return;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

   xmm_src = expand_pixel_32_1x128 (src);

   while (height--)
   {
dst = dst_line;

dst_line += dst_stride;
w = width;

/* Head: single pixels until dst is 16-byte aligned.  Note the
 * operand order: dst goes over the solid source. */
while (w && (uintptr_t)dst & 15)
{
    __m128i vd;

    vd = unpack_32_1x128 (*dst);

    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
				      xmm_src));
    w--;
    dst++;
}

/* Main loop: four pixels per iteration. */
while (w >= 4)
{
    __m128i tmp_lo, tmp_hi;

    xmm_dst = load_128_aligned ((__m128i*)dst);

    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

    /* over_2x128 writes its result into its last operands, so the
     * solid source is copied into scratch registers first. */
    tmp_lo = xmm_src;
    tmp_hi = xmm_src;

    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		&xmm_dsta_lo, &xmm_dsta_hi,
		&tmp_lo, &tmp_hi);

    save_128_aligned (
	(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

    w -= 4;
    dst += 4;
}

/* Tail: remaining pixels one at a time. */
while (w)
{
    __m128i vd;

    vd = unpack_32_1x128 (*dst);

    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
				      xmm_src));
    w--;
    dst++;
}

   }

}
   5159 
/* OVER operator, a8r8g8b8 source with an a8r8g8b8 mask (only the
 * mask's alpha channel is used) onto an a8r8g8b8 destination:
 * dst = in_over (src, src.alpha, mask.alpha, dst).  Groups of four
 * fully-transparent mask pixels are skipped; fully-opaque mask plus
 * fully-opaque source stores the source directly.
 */
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
			    pixman_composite_info_t *info)
{
   PIXMAN_COMPOSITE_ARGS (info);
   uint32_t    *src, *src_line, s;
   uint32_t    *dst, *dst_line, d;
   uint32_t    *mask, *mask_line;
   uint32_t    m;
   int src_stride, mask_stride, dst_stride;
   int32_t w;

   __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
   __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
   __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

   PIXMAN_IMAGE_GET_LINE (
dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
   PIXMAN_IMAGE_GET_LINE (
mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
   PIXMAN_IMAGE_GET_LINE (
src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

   while (height--)
   {
       src = src_line;
       src_line += src_stride;
       dst = dst_line;
       dst_line += dst_stride;
       mask = mask_line;
       mask_line += mask_stride;

       w = width;

       /* Head: single pixels until dst is 16-byte aligned.  Only the
        * mask's alpha byte participates. */
       while (w && (uintptr_t)dst & 15)
       {
    uint32_t sa;

           s = *src++;
           m = (*mask++) >> 24;
           d = *dst;

    sa = s >> 24;

    if (m)
    {
	/* Opaque source and opaque mask: straight copy. */
	if (sa == 0xff && m == 0xff)
	{
	    *dst = s;
	}
	else
	{
	    __m128i ms, md, ma, msa;

	    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
	    ms = unpack_32_1x128 (s);
	    md = unpack_32_1x128 (d);

	    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

	    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	}
    }

    dst++;
           w--;
       }

       /* Main loop: four pixels per iteration. */
       while (w >= 4)
       {
    xmm_mask = load_128_unaligned ((__m128i*)mask);

    if (!is_transparent (xmm_mask))
    {
	xmm_src = load_128_unaligned ((__m128i*)src);

	if (is_opaque (xmm_mask) && is_opaque (xmm_src))
	{
	    save_128_aligned ((__m128i *)dst, xmm_src);
	}
	else
	{
	    xmm_dst = load_128_aligned ((__m128i *)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
	    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
			   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}
    }

           src += 4;
           dst += 4;
           mask += 4;
           w -= 4;
       }

       /* Tail: remaining pixels one at a time. */
       while (w)
       {
    uint32_t sa;

           s = *src++;
           m = (*mask++) >> 24;
           d = *dst;

    sa = s >> 24;

    if (m)
    {
	if (sa == 0xff && m == 0xff)
	{
	    *dst = s;
	}
	else
	{
	    __m128i ms, md, ma, msa;

	    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
	    ms = unpack_32_1x128 (s);
	    md = unpack_32_1x128 (d);

	    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

	    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	}
    }

    dst++;
           w--;
       }
   }

}
   5300 
/* A variant of 'sse2_combine_over_u' with minor tweaks */
/*
 * OVER-composite one scanline of nearest-neighbour scaled a8r8g8b8 source
 * onto an a8r8g8b8 destination.
 *
 * pd                    - destination pixels
 * ps                    - source scanline, sampled at pixman_fixed_to_int (vx)
 * w                     - number of destination pixels to produce
 * vx, unit_x            - fixed-point source x position and per-pixel step
 * src_width_fixed       - source width in fixed point, used to wrap vx
 * fully_transparent_src - if set, OVER is a no-op: return immediately
 */
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  src_width_fixed,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    /* No mask in this path: combine1()/combine4() receive pm == NULL. */
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* OVER with a fully transparent source leaves the destination unchanged. */
    if (fully_transparent_src)
	return;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
	vx += unit_x;
	/* keep vx wrapped below zero (NORMAL-repeat stepping --
	 * NOTE(review): inferred from the FAST_NEAREST_MAINLOOP
	 * instantiations below; for non-repeating modes this loop
	 * should never trigger -- confirm) */
	while (vx >= 0)
	    vx -= src_width_fixed;

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	if (pm)
	    pm++;
	w--;
    }

    /* Main loop: gather four source pixels, then composite 4-wide. */
    while (w >= 4)
    {
	__m128i tmp;
	uint32_t tmp1, tmp2, tmp3, tmp4;

	tmp1 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp2 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp3 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp4 = *(ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

	if (is_opaque (xmm_src_hi))
	{
	    /* All four source pixels are opaque: plain copy. */
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
	}
	else if (!is_zero (xmm_src_hi))
	{
	    /* Mixed alpha: full OVER math on unpacked 16-bit channels. */
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned ((__m128i*)pd,
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}
	/* (all-zero source: destination already holds the OVER result) */

	w -= 4;
	pd += 4;
	if (pm)
	    pm += 4;
    }

    /* Tail: remaining 0..3 pixels, one at a time. */
    while (w)
    {
	d = *pd;
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	if (pm)
	    pm++;

	w--;
    }
}
   5406 
/* Instantiate the nearest-neighbour main loops (one per repeat mode)
 * around the scanline function above. */
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
	       scaled_nearest_scanline_sse2_8888_8888_OVER,
	       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
	       scaled_nearest_scanline_sse2_8888_8888_OVER,
	       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
	       scaled_nearest_scanline_sse2_8888_8888_OVER,
	       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
	       scaled_nearest_scanline_sse2_8888_8888_OVER,
	       uint32_t, uint32_t, NORMAL)
   5419 
/*
 * Nearest-neighbour scaled OVER with a solid mask: the single mask pixel's
 * alpha (*mask >> 24) is expanded once into xmm_mask and applied to every
 * source pixel via in_over.
 */
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
				       uint32_t *       dst,
				       const uint32_t * src,
				       int32_t          w,
				       pixman_fixed_t   vx,
				       pixman_fixed_t   unit_x,
				       pixman_fixed_t   src_width_fixed,
				       pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Zero source or zero mask alpha: OVER leaves the destination as-is. */
    if (zero_src || (*mask >> 24) == 0)
	return;

    /* Replicate the 8-bit mask alpha into every 16-bit channel slot. */
    xmm_mask = create_mask_16_128 (*mask >> 24);

    /* Head: single pixels until dst is 16-byte aligned. */
    while (w && (uintptr_t)dst & 15)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	/* keep vx wrapped (NORMAL-repeat stepping) */
	while (vx >= 0)
	    vx -= src_width_fixed;

	if (s)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (s);
	    __m128i alpha     = expand_alpha_1x128 (ms);
	    __m128i dest      = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32 (
		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
	dst++;
	w--;
    }

    /* Main loop: gather four source pixels and composite 4-wide. */
    while (w >= 4)
    {
	uint32_t tmp1, tmp2, tmp3, tmp4;

	tmp1 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp2 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp3 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;
	tmp4 = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

	if (!is_zero (xmm_src))
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			        &xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}
	/* (all-zero source quad: destination unchanged) */

	dst += 4;
	w -= 4;
    }

    /* Tail: remaining 0..3 pixels. */
    while (w)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	if (s)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (s);
	    __m128i alpha = expand_alpha_1x128 (ms);
	    __m128i mask  = xmm_mask;
	    __m128i dest  = unpack_32_1x128 (d);

	    *dst = pack_1x128_32 (
		in_over_1x128 (&ms, &alpha, &mask, &dest));
	}

	dst++;
	w--;
    }

}
   5533 
/* Instantiate the solid-mask nearest-neighbour main loops; the trailing
 * TRUE, TRUE flags request the mask-taking scanline prototype. */
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
		      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
		      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
		      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
		      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
		      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
		      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
		      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
		      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
   5546 
#if PSHUFD_IS_FAST

/***********************************************************************************/

/*
 * PSHUFD variant of the bilinear helpers: horizontal weights for four
 * consecutive pixels are computed at once (xmm_wh_state) and the per-pixel
 * weight pair is re-broadcast with _mm_shuffle_epi32, selected by 'phase'.
 */
# define BILINEAR_DECLARE_VARIABLES						\
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
					   unit_x, -unit_x, unit_x, -unit_x);	\
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4,		\
					   unit_x * 4, -unit_x * 4);		\
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
    __m128i xmm_wh_state;

/*
 * Produce one bilinear-filtered pixel (4x 32-bit channels) in 'pix'.
 * phase_ == -1: recompute weights and step xmm_x by one pixel (scalar use);
 * phase_ 0..3: used by the four-pixel variant, stepping xmm_x by four.
 */
#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
do {										\
    int phase = phase_;								\
    __m128i xmm_wh, xmm_a, xmm_b;						\
    /* fetch 2x2 pixel block into sse2 registers */				\
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
    vx += unit_x;								\
    /* vertical interpolation */						\
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
    /* calculate horizontal weights */						\
    if (phase <= 0)								\
    {										\
	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
					16 - BILINEAR_INTERPOLATION_BITS));	\
	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
	phase = 0;								\
    }										\
    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
							   phase, phase));	\
    /* horizontal interpolation */						\
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
	   xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);			\
    /* shift the result */							\
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
} while (0)

#else /************************************************************************/

/*
 * Generic SSE2 variant: horizontal weights are recomputed for every pixel,
 * avoiding PSHUFD (slow on many older CPUs, see PSHUFD_IS_FAST above).
 */
# define BILINEAR_DECLARE_VARIABLES						\
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
				  unit_x, -unit_x, unit_x, -unit_x);	\
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
				   unit_x * 4, -unit_x * 4,		\
				   unit_x * 4, -unit_x * 4,		\
				   unit_x * 4, -unit_x * 4);		\
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
			   vx, -(vx + 1), vx, -(vx + 1))

/* Produce one bilinear-filtered pixel in 'pix'; 'phase' is unused here. */
#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
do {										\
    __m128i xmm_wh, xmm_a, xmm_b;						\
    /* fetch 2x2 pixel block into sse2 registers */				\
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
    vx += unit_x;								\
    /* vertical interpolation */						\
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
    /* calculate horizontal weights */						\
    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
				16 - BILINEAR_INTERPOLATION_BITS));	\
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
    /* horizontal interpolation */						\
    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
    /* shift the result */							\
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
} while (0)

/***********************************************************************************/

#endif
   5639 
   5640 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\
   5641 do {										\
   5642 __m128i xmm_pix;							\
   5643 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
   5644 xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
   5645 xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
   5646 pix = _mm_cvtsi128_si32 (xmm_pix);					\
   5647 } while(0)
   5648 
   5649 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\
   5650 do {										\
   5651 __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
   5652 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
   5653 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
   5654 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
   5655 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
   5656 xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
   5657 xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
   5658 pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
   5659 } while(0)
   5660 
/* Advance the fixed-point x position by one pixel without sampling. */
#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x;								\
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
} while(0)
   5666 
/* Advance the fixed-point x position by four pixels without sampling. */
#define BILINEAR_SKIP_FOUR_PIXELS()						\
do {										\
    vx += unit_x * 4;								\
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\
} while(0)
   5672 
   5673 /***********************************************************************************/
   5674 
   5675 static force_inline void
   5676 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
   5677 				     const uint32_t * mask,
   5678 				     const uint32_t * src_top,
   5679 				     const uint32_t * src_bottom,
   5680 				     int32_t          w,
   5681 				     int              wt,
   5682 				     int              wb,
   5683 				     pixman_fixed_t   vx_,
   5684 				     pixman_fixed_t   unit_x_,
   5685 				     pixman_fixed_t   max_vx,
   5686 				     pixman_bool_t    zero_src)
   5687 {
   5688    intptr_t vx = vx_;
   5689    intptr_t unit_x = unit_x_;
   5690    BILINEAR_DECLARE_VARIABLES;
   5691    uint32_t pix1, pix2;
   5692 
   5693    while (w && ((uintptr_t)dst & 15))
   5694    {
   5695 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5696 *dst++ = pix1;
   5697 w--;
   5698    }
   5699 
   5700    while ((w -= 4) >= 0) {
   5701 __m128i xmm_src;
   5702 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
   5703 _mm_store_si128 ((__m128i *)dst, xmm_src);
   5704 dst += 4;
   5705    }
   5706 
   5707    if (w & 2)
   5708    {
   5709 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5710 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5711 *dst++ = pix1;
   5712 *dst++ = pix2;
   5713    }
   5714 
   5715    if (w & 1)
   5716    {
   5717 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5718 *dst = pix1;
   5719    }
   5720 
   5721 }
   5722 
/* Instantiate the bilinear SRC main loops, one per repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
		       scaled_bilinear_scanline_sse2_8888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
		       scaled_bilinear_scanline_sse2_8888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
		       scaled_bilinear_scanline_sse2_8888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
		       scaled_bilinear_scanline_sse2_8888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       NORMAL, FLAG_NONE)
   5739 
   5740 static force_inline void
   5741 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
   5742 				     const uint32_t * mask,
   5743 				     const uint32_t * src_top,
   5744 				     const uint32_t * src_bottom,
   5745 				     int32_t          w,
   5746 				     int              wt,
   5747 				     int              wb,
   5748 				     pixman_fixed_t   vx_,
   5749 				     pixman_fixed_t   unit_x_,
   5750 				     pixman_fixed_t   max_vx,
   5751 				     pixman_bool_t    zero_src)
   5752 {
   5753    intptr_t vx = vx_;
   5754    intptr_t unit_x = unit_x_;
   5755    BILINEAR_DECLARE_VARIABLES;
   5756    uint32_t pix1, pix2;
   5757 
   5758    while (w && ((uintptr_t)dst & 15))
   5759    {
   5760 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5761 *dst++ = pix1 | 0xFF000000;
   5762 w--;
   5763    }
   5764 
   5765    while ((w -= 4) >= 0) {
   5766 __m128i xmm_src;
   5767 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
   5768 _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
   5769 dst += 4;
   5770    }
   5771 
   5772    if (w & 2)
   5773    {
   5774 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5775 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
   5776 *dst++ = pix1 | 0xFF000000;
   5777 *dst++ = pix2 | 0xFF000000;
   5778    }
   5779 
   5780    if (w & 1)
   5781    {
   5782 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   5783 *dst = pix1 | 0xFF000000;
   5784    }
   5785 }
   5786 
/* Instantiate the x888 bilinear SRC main loops.  Note: only COVER, PAD and
 * NORMAL are provided here; no NONE variant is instantiated for x888. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
		       scaled_bilinear_scanline_sse2_x888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
		       scaled_bilinear_scanline_sse2_x888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
		       scaled_bilinear_scanline_sse2_x888_8888_SRC,
		       uint32_t, uint32_t, uint32_t,
		       NORMAL, FLAG_NONE)
   5799 
/*
 * Bilinear-scaled OVER: dst = interpolated_src OVER dst (a8r8g8b8).
 * The mask and zero_src arguments are unused in this variant.
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
				      const uint32_t * mask,
				      const uint32_t * src_top,
				      const uint32_t * src_bottom,
				      int32_t          w,
				      int              wt,
				      int              wb,
				      pixman_fixed_t   vx_,
				      pixman_fixed_t   unit_x_,
				      pixman_fixed_t   max_vx,
				      pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    /* Head: single pixels until dst is 16-byte aligned. */
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	/* an all-zero source pixel leaves dst untouched under OVER */
	if (pix1)
	{
	    pix2 = *dst;
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
	}

	w--;
	dst++;
    }

    /* Main loop: four interpolated pixels per iteration. */
    while (w  >= 4)
    {
	__m128i xmm_src;
	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
	__m128i xmm_alpha_hi, xmm_alpha_lo;

	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	if (!is_zero (xmm_src))
	{
	    if (is_opaque (xmm_src))
	    {
		/* all four source pixels are opaque: plain copy */
		save_128_aligned ((__m128i *)dst, xmm_src);
	    }
	    else
	    {
		/* mixed alpha: full OVER on unpacked 16-bit channels */
		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	}

	w -= 4;
	dst += 4;
    }

    /* Tail: remaining 0..3 pixels. */
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (pix1)
	{
	    pix2 = *dst;
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
	}

	w--;
	dst++;
    }
}
   5879 
/* Instantiate the bilinear OVER main loops, one per repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
		       scaled_bilinear_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
		       scaled_bilinear_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
		       scaled_bilinear_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
		       scaled_bilinear_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       NORMAL, FLAG_NONE)
   5896 
/*
 * Bilinear-scaled OVER with an a8 mask: dst = (src IN mask) OVER dst.
 * One mask byte is consumed per destination pixel.
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
					const uint8_t  * mask,
					const uint32_t * src_top,
					const uint32_t * src_bottom,
					int32_t          w,
					int              wt,
					int              wb,
					pixman_fixed_t   vx_,
					pixman_fixed_t   unit_x_,
					pixman_fixed_t   max_vx,
					pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    /* Head: single pixels until dst is 16-byte aligned. */
    while (w && ((uintptr_t)dst & 15))
    {
	uint32_t sa;
	uint8_t m = *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    sa = pix1 >> 24;

	    /* opaque source pixel under a full mask byte: plain copy */
	    if (sa == 0xff && m == 0xff)
	    {
		*dst = pix1;
	    }
	    else
	    {
		__m128i ms, md, ma, msa;

		pix2 = *dst;
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		ms = unpack_32_1x128 (pix1);
		md = unpack_32_1x128 (pix2);

		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
	}
	else
	{
	    /* zero mask byte: skip the interpolation entirely */
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    /* Main loop: four pixels / four mask bytes per iteration. */
    while (w >= 4)
    {
	uint32_t m;

	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

	/* load 4 mask bytes at once; memcpy avoids misaligned/aliasing UB */
	memcpy(&m, mask, sizeof(uint32_t));

	if (m)
	{
	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	    if (m == 0xffffffff && is_opaque (xmm_src))
	    {
		/* full mask and opaque source quad: plain copy */
		save_128_aligned ((__m128i *)dst, xmm_src);
	    }
	    else
	    {
		xmm_dst = load_128_aligned ((__m128i *)dst);

		/* spread the 4 mask bytes into 16-bit lanes */
		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	}
	else
	{
	    /* all four mask bytes are zero */
	    BILINEAR_SKIP_FOUR_PIXELS ();
	}

	w -= 4;
	dst += 4;
	mask += 4;
    }

    /* Tail: remaining 0..3 pixels. */
    while (w)
    {
	uint32_t sa;
	uint8_t m = *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	    sa = pix1 >> 24;

	    if (sa == 0xff && m == 0xff)
	    {
		*dst = pix1;
	    }
	    else
	    {
		__m128i ms, md, ma, msa;

		pix2 = *dst;
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
		ms = unpack_32_1x128 (pix1);
		md = unpack_32_1x128 (pix2);

		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }
}
   6036 
/* Instantiate the a8-masked bilinear OVER main loops. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
		       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
		       uint32_t, uint8_t, uint32_t,
		       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
		       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
		       uint32_t, uint8_t, uint32_t,
		       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
		       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
		       uint32_t, uint8_t, uint32_t,
		       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
		       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
		       uint32_t, uint8_t, uint32_t,
		       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
   6053 
/*
 * Bilinear-scaled OVER with a solid mask: the single mask pixel's alpha
 * (*mask >> 24) is expanded once into xmm_mask and applied to every
 * interpolated source pixel via in_over.
 */
static force_inline void
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
					const uint32_t * mask,
					const uint32_t * src_top,
					const uint32_t * src_bottom,
					int32_t          w,
					int              wt,
					int              wb,
					pixman_fixed_t   vx_,
					pixman_fixed_t   unit_x_,
					pixman_fixed_t   max_vx,
					pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1;
    __m128i xmm_mask;

    /* Zero source or zero mask alpha: nothing to composite. */
    if (zero_src || (*mask >> 24) == 0)
	return;

    /* Replicate the 8-bit mask alpha into every 16-bit channel slot. */
    xmm_mask = create_mask_16_128 (*mask >> 24);

    /* Head: single pixels until dst is 16-byte aligned. */
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	/* an all-zero source pixel leaves dst untouched under OVER */
	if (pix1)
	{
		uint32_t d = *dst;

		__m128i ms = unpack_32_1x128 (pix1);
		__m128i alpha     = expand_alpha_1x128 (ms);
		__m128i dest      = xmm_mask;
		__m128i alpha_dst = unpack_32_1x128 (d);

		*dst = pack_1x128_32
			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}

	dst++;
	w--;
    }

    /* Main loop: four interpolated pixels per iteration. */
    while (w >= 4)
    {
	__m128i xmm_src;
	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	if (!is_zero (xmm_src))
	{
	    __m128i xmm_src_lo, xmm_src_hi;
	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	    __m128i xmm_alpha_lo, xmm_alpha_hi;

	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned
		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	dst += 4;
	w -= 4;
    }

    /* Tail: remaining 0..3 pixels. */
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
		uint32_t d = *dst;

		__m128i ms = unpack_32_1x128 (pix1);
		__m128i alpha     = expand_alpha_1x128 (ms);
		__m128i dest      = xmm_mask;
		__m128i alpha_dst = unpack_32_1x128 (d);

		*dst = pack_1x128_32
			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}

	dst++;
	w--;
    }
}
   6149 
/* Instantiate the solid-mask bilinear OVER main loops. */
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
		       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
		       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
		       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
		       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
		       uint32_t, uint32_t, uint32_t,
		       NORMAL, FLAG_HAVE_SOLID_MASK)
   6166 
/* Fast-path dispatch table for the SSE2 implementation.
 *
 * Each entry maps (operator, source format, mask format, destination
 * format) to a specialized composite routine.  The table is scanned in
 * order by the pixman core, so earlier entries take priority over later
 * ones; do not reorder entries casually.  The table is terminated by
 * the PIXMAN_OP_NONE sentinel entry. */
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    /* OVER onto an x888 destination from an x888 source is a plain copy:
     * the source has no alpha, so compositing reduces to SRC. */
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    
    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    /* Nearest-neighbour scaled paths */
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    /* Bilinear-scaled paths */
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

    /* Sentinel: marks the end of the table. */
    { PIXMAN_OP_NONE },
};
   6304 
   6305 static uint32_t *
   6306 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
   6307 {
   6308    int w = iter->width;
   6309    __m128i ff000000 = mask_ff000000;
   6310    uint32_t *dst = iter->buffer;
   6311    uint32_t *src = (uint32_t *)iter->bits;
   6312 
   6313    iter->bits += iter->stride;
   6314 
   6315    while (w && ((uintptr_t)dst) & 0x0f)
   6316    {
   6317 *dst++ = (*src++) | 0xff000000;
   6318 w--;
   6319    }
   6320 
   6321    while (w >= 4)
   6322    {
   6323 save_128_aligned (
   6324     (__m128i *)dst, _mm_or_si128 (
   6325 	load_128_unaligned ((__m128i *)src), ff000000));
   6326 
   6327 dst += 4;
   6328 src += 4;
   6329 w -= 4;
   6330    }
   6331 
   6332    while (w)
   6333    {
   6334 *dst++ = (*src++) | 0xff000000;
   6335 w--;
   6336    }
   6337 
   6338    return iter->buffer;
   6339 }
   6340 
   6341 static uint32_t *
   6342 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
   6343 {
   6344    int w = iter->width;
   6345    uint32_t *dst = iter->buffer;
   6346    uint16_t *src = (uint16_t *)iter->bits;
   6347    __m128i ff000000 = mask_ff000000;
   6348 
   6349    iter->bits += iter->stride;
   6350 
   6351    while (w && ((uintptr_t)dst) & 0x0f)
   6352    {
   6353 uint16_t s = *src++;
   6354 
   6355 *dst++ = convert_0565_to_8888 (s);
   6356 w--;
   6357    }
   6358 
   6359    while (w >= 8)
   6360    {
   6361 __m128i lo, hi, s;
   6362 
   6363 s = _mm_loadu_si128 ((__m128i *)src);
   6364 
   6365 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
   6366 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
   6367 
   6368 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
   6369 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
   6370 
   6371 dst += 8;
   6372 src += 8;
   6373 w -= 8;
   6374    }
   6375 
   6376    while (w)
   6377    {
   6378 uint16_t s = *src++;
   6379 
   6380 *dst++ = convert_0565_to_8888 (s);
   6381 w--;
   6382    }
   6383 
   6384    return iter->buffer;
   6385 }
   6386 
   6387 static uint32_t *
   6388 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
   6389 {
   6390    int w = iter->width;
   6391    uint32_t *dst = iter->buffer;
   6392    uint8_t *src = iter->bits;
   6393    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
   6394 
   6395    iter->bits += iter->stride;
   6396 
   6397    while (w && (((uintptr_t)dst) & 15))
   6398    {
   6399        *dst++ = (uint32_t)(*(src++)) << 24;
   6400        w--;
   6401    }
   6402 
   6403    while (w >= 16)
   6404    {
   6405 xmm0 = _mm_loadu_si128((__m128i *)src);
   6406 
   6407 xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
   6408 xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
   6409 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
   6410 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
   6411 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
   6412 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
   6413 
   6414 _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
   6415 _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
   6416 _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
   6417 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
   6418 
   6419 dst += 16;
   6420 src += 16;
   6421 w -= 16;
   6422    }
   6423 
   6424    while (w)
   6425    {
   6426 *dst++ = (uint32_t)(*(src++)) << 24;
   6427 w--;
   6428    }
   6429 
   6430    return iter->buffer;
   6431 }
   6432 
/* Image-flag requirements shared by the SSE2 fetch iterators below:
 * a standard bits image with the identity transform whose nearest
 * samples fully cover the clip region. */
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
   6436 
/* Narrow (32 bpp) source iterators provided by the SSE2 backend.
 * Each entry maps a pixel format plus the IMAGE_FLAGS requirements to
 * a scanline fetch routine; the list is terminated by PIXMAN_null. */
static const pixman_iter_info_t sse2_iters[] = 
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
    },
    { PIXMAN_null },
};
   6450 
   6451 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
   6452 __attribute__((__force_align_arg_pointer__))
   6453 #endif
   6454 pixman_implementation_t *
   6455 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
   6456 {
   6457    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
   6458 
   6459    /* SSE2 constants */
   6460    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
   6461    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
   6462    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
   6463    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
   6464    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
   6465    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
   6466    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
   6467    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
   6468    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
   6469    mask_0080 = create_mask_16_128 (0x0080);
   6470    mask_00ff = create_mask_16_128 (0x00ff);
   6471    mask_0101 = create_mask_16_128 (0x0101);
   6472    mask_ffff = create_mask_16_128 (0xffff);
   6473    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
   6474    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
   6475    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
   6476    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
   6477 
   6478    /* Set up function pointers */
   6479    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
   6480    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
   6481    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
   6482    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
   6483    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
   6484    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
   6485    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
   6486    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
   6487    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
   6488    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
   6489 
   6490    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
   6491 
   6492    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
   6493    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
   6494    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
   6495    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
   6496    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
   6497    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
   6498    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
   6499    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
   6500    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
   6501    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
   6502    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
   6503 
   6504    imp->blt = sse2_blt;
   6505    imp->fill = sse2_fill;
   6506 
   6507    imp->iter_info = sse2_iters;
   6508 
   6509    return imp;
   6510 }