tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

pixman-vmx.c (64570B)


/*
 * Copyright © 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Luca Barbato (lu_zero@gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */

#ifdef HAVE_CONFIG_H
#include <pixman-config.h>
#endif
#include "pixman-combine32.h"
#include "pixman-inlines.h"
#include "pixman-private.h"
#include <altivec.h>

static const vector unsigned char vzero = (const vector unsigned char){0};
static vector unsigned char       mask_ff000000;

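/* Replicate the 8-bit alpha of each 32-bit pixel across all four bytes of
 * that pixel.  The selector picks the alpha byte of each lane: byte 0 on
 * big-endian, byte 3 on little-endian. */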
static force_inline vector unsigned char
splat_alpha (vector unsigned char pix)
{
    const vector unsigned char sel = (vector unsigned char){
#ifdef WORDS_BIGENDIAN
        0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
        0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C,
#else
        0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
        0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F,
#endif
    };

    return vec_perm (pix, pix, sel);
}

static force_inline vector unsigned char
splat_pixel (vector unsigned char pix)
{
    const vector unsigned char sel = (vector unsigned char){
        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
        0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
    };

    return vec_perm (pix, pix, sel);
}

static force_inline vector unsigned short
create_mask_16_128 (uint32_t mask)
{
    return (vector unsigned short){mask, mask, mask, mask,
                                   mask, mask, mask, mask};
}

static force_inline vector unsigned int
create_mask_32_128 (uint32_t mask)
{
    return (vector unsigned int){mask, mask, mask, mask};
}

static force_inline vector unsigned char
unpacklo_128_16x8 (vector unsigned char data1, vector unsigned char data2)
{
#ifdef WORDS_BIGENDIAN
    return vec_mergel (data2, data1);
#else
    return vec_mergel (data1, data2);
#endif
}

static force_inline vector unsigned char
unpackhi_128_16x8 (vector unsigned char data1, vector unsigned char data2)
{
#ifdef WORDS_BIGENDIAN
    return vec_mergeh (data2, data1);
#else
    return vec_mergeh (data1, data2);
#endif
}

static force_inline void
unpack_128_2x128 (vector unsigned char  data1,
                  vector unsigned char  data2,
                  vector unsigned char *data_lo,
                  vector unsigned char *data_hi)
{
    *data_lo = unpacklo_128_16x8 (data1, data2);
    *data_hi = unpackhi_128_16x8 (data1, data2);
}

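/* Per-channel multiply of two 8-bit values with exact rounded division by
 * 255: for t = a * b + 128, the result byte is (t + (t >> 8)) >> 8.
 * vec_mule/vec_mulo form the 16-bit products of the even and odd byte
 * lanes, and the final vec_perm re-interleaves the high (rounded) byte of
 * each product back into a single vector. */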
static force_inline vector unsigned char
pix_multiply (vector unsigned char a, vector unsigned char b)
{
    const vector unsigned char sel = (vector unsigned char){
#ifdef WORDS_BIGENDIAN
        0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0a, 0x1a, 0x0c, 0x1c, 0x0e, 0x1e,
#else
        0x01, 0x11, 0x03, 0x13, 0x05, 0x15, 0x07, 0x17,
        0x09, 0x19, 0x0b, 0x1b, 0x0d, 0x1d, 0x0f, 0x1f,
#endif
    };
    vector unsigned short e = vec_mule (a, b);
    vector unsigned short o = vec_mulo (a, b);

    e = vec_adds (e, create_mask_16_128 (128));
    o = vec_adds (o, create_mask_16_128 (128));

    e = vec_adds (e, vec_sr (e, vec_splat_u16 (8)));
    o = vec_adds (o, vec_sr (o, vec_splat_u16 (8)));

    return (vector unsigned char)vec_perm (e, o, sel);
}

static force_inline vector unsigned char
pix_add (vector unsigned char a, vector unsigned char b)
{
    return vec_adds (a, b);
}

static force_inline vector unsigned char
pix_add_mul (vector unsigned char x,
             vector unsigned char a,
             vector unsigned char y,
             vector unsigned char b)
{
    vector unsigned char t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}

static force_inline vector unsigned char
negate (vector unsigned char src)
{
    return vec_nor (src, src);
}

/* dest*~srca + src */
static force_inline vector unsigned char
over (vector unsigned char src,
      vector unsigned char srca,
      vector unsigned char dest)
{
    return vec_adds (src, pix_multiply (dest, negate (srca)));
}

/* in == pix_multiply */
static force_inline vector unsigned char
in_over (vector unsigned char src,
         vector unsigned char srca,
         vector unsigned char mask,
         vector unsigned char dest)
{
    return over (pix_multiply (src, mask), pix_multiply (srca, mask), dest);
}

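/* Big-endian AltiVec has no native unaligned loads: vec_ld ignores the low
 * four address bits, so an unaligned vector is assembled from two aligned
 * loads combined by vec_perm with a shift mask obtained from vec_lvsl. */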
#ifdef WORDS_BIGENDIAN

#define COMPUTE_SHIFT_MASK(source) source##_mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source) source##_mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKC(dest, source, mask)                                \
    mask##_mask   = vec_lvsl (0, mask);                                        \
    source##_mask = vec_lvsl (0, source);

#define LOAD_VECTOR(source)                                                    \
    do                                                                         \
    {                                                                          \
        vector unsigned char tmp1, tmp2;                                       \
        tmp1      = (typeof (tmp1))vec_ld (0, source);                         \
        tmp2      = (typeof (tmp2))vec_ld (15, source);                        \
        v##source = (typeof (v##source))vec_perm (tmp1, tmp2, source##_mask);  \
    } while (0)

#define LOAD_VECTORS(dest, source)                                             \
    do                                                                         \
    {                                                                          \
        LOAD_VECTOR (source);                                                  \
        v##dest = (typeof (v##dest))vec_ld (0, dest);                          \
    } while (0)

#define LOAD_VECTORSC(dest, source, mask)                                      \
    do                                                                         \
    {                                                                          \
        LOAD_VECTORS (dest, source);                                           \
        LOAD_VECTOR (mask);                                                    \
    } while (0)

#define DECLARE_SRC_MASK_VAR  vector unsigned char src_mask
#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask

#else

/* The COMPUTE_SHIFT_{MASK, MASKS, MASKC} macros below are no-ops.
 * They are defined this way because little-endian AltiVec can do unaligned
 * reads natively and has no need to construct the permutation pattern
 * variables.
 */
#define COMPUTE_SHIFT_MASK(source)

#define COMPUTE_SHIFT_MASKS(dest, source)

#define COMPUTE_SHIFT_MASKC(dest, source, mask)

#define LOAD_VECTOR(source) v##source = (typeof (v##source))vec_xl (0, source);

#define LOAD_VECTORS(dest, source)                                             \
    LOAD_VECTOR (source);                                                      \
    LOAD_VECTOR (dest);

#define LOAD_VECTORSC(dest, source, mask)                                      \
    LOAD_VECTORS (dest, source);                                               \
    LOAD_VECTOR (mask);

#define DECLARE_SRC_MASK_VAR
#define DECLARE_MASK_MASK_VAR

#endif /* WORDS_BIGENDIAN */

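/* Like LOAD_VECTORSC, but additionally scales the loaded source vector by
 * the alpha channel of the loaded mask, which is what the unified-alpha
 * (_u) combiners below need when a mask is present. */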
#define LOAD_VECTORSM(dest, source, mask)                                      \
    LOAD_VECTORSC (dest, source, mask);                                        \
    v##source = pix_multiply (v##source, splat_alpha (v##mask));

#define STORE_VECTOR(dest) vec_st ((vector unsigned int)v##dest, 0, dest);

/* load 4 pixels from a 16-byte-aligned address */
static force_inline vector unsigned char
load_128_aligned (const uint32_t *src)
{
    return *((vector unsigned char *)src);
}

/* load 4 pixels from an unaligned address */
static force_inline vector unsigned char
load_128_unaligned (const uint32_t *src)
{
    vector unsigned char vsrc;
    DECLARE_SRC_MASK_VAR;

    COMPUTE_SHIFT_MASK (src);
    LOAD_VECTOR (src);

    return vsrc;
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (uint32_t *data, vector unsigned char vdata)
{
    STORE_VECTOR (data)
}

static force_inline int
is_opaque (vector unsigned char x)
{
    return vec_all_eq (vec_and (x, mask_ff000000), mask_ff000000);
}

static force_inline int
is_zero (vector unsigned char x)
{
    return vec_all_eq (x, vzero);
}

static force_inline int
is_transparent (vector unsigned char x)
{
    return vec_all_eq (vec_and (x, mask_ff000000), vzero);
}

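/* Scalar OVER for a single pixel: dest = src + dest * (1 - src.alpha),
 * with shortcuts for a fully opaque source (result is src) and a fully
 * zero source (dest is unchanged). */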
static force_inline uint32_t
core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
{
    uint32_t a;

    a = ALPHA_8 (src);

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        UN8x4_MUL_UN8_ADD_UN8x4 (dst, (~a & MASK), src);
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
        UN8x4_MUL_UN8 (s, ALPHA_8 (*pm));

    return s;
}

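/* Load four source pixels, scaled by the alpha of four mask pixels when a
 * mask is supplied; returns zero early if the mask is fully transparent. */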
static force_inline vector unsigned char
combine4 (const uint32_t *ps, const uint32_t *pm)
{
    vector unsigned char src, msk;

    if (pm)
    {
        msk = load_128_unaligned (pm);

        if (is_transparent (msk))
            return vzero;
    }

    src = load_128_unaligned (ps);

    if (pm)
        src = pix_multiply (src, msk);

    return src;
}

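/* Every combiner below follows the same pattern: a scalar head loop runs
 * until dest reaches a 16-byte boundary, the vector loop then handles four
 * pixels per iteration, and a scalar tail finishes the width % 4 leftover
 * pixels. */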
static void
vmx_combine_over_u_no_mask (uint32_t *dest, const uint32_t *src, int width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s  = *src++;
        uint32_t d  = *dest;
        uint32_t ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {

        LOAD_VECTORS (dest, src);

        vdest = over (vsrc, splat_alpha (vsrc), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s  = src[i];
        uint32_t d  = dest[i];
        uint32_t ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

        dest[i] = d;
    }
}

static void
vmx_combine_over_u_mask (uint32_t       *dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t d = *dest;
        uint32_t ia;

        UN8x4_MUL_UN8 (s, m);

        ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = over (vsrc, splat_alpha (vsrc), vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ia;

        UN8x4_MUL_UN8 (s, m);

        ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
        dest[i] = d;
    }
}

static void
vmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    if (mask)
        vmx_combine_over_u_mask (dest, src, mask, width);
    else
        vmx_combine_over_u_no_mask (dest, src, width);
}

static void
vmx_combine_over_reverse_u_no_mask (uint32_t       *dest,
                                    const uint32_t *src,
                                    int             width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s  = *src++;
        uint32_t d  = *dest;
        uint32_t ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {

        LOAD_VECTORS (dest, src);

        vdest = over (vdest, splat_alpha (vdest), vsrc);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s  = src[i];
        uint32_t d  = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
        dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u_mask (uint32_t       *dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m  = ALPHA_8 (*mask++);
        uint32_t s  = *src++;
        uint32_t d  = *dest;
        uint32_t ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {

        LOAD_VECTORSM (dest, src, mask);

        vdest = over (vdest, splat_alpha (vdest), vsrc);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m  = ALPHA_8 (mask[i]);
        uint32_t s  = src[i];
        uint32_t d  = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, m);

        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
        dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t                *dest,
                            const uint32_t          *src,
                            const uint32_t          *mask,
                            int                      width)
{
    if (mask)
        vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_over_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_in_u_no_mask (uint32_t *dest, const uint32_t *src, int width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s = *src++;
        uint32_t a = ALPHA_8 (*dest);

        UN8x4_MUL_UN8 (s, a);
        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vsrc, splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8 (s, a);
        dest[i] = s;
    }
}

static void
vmx_combine_in_u_mask (uint32_t       *dest,
                       const uint32_t *src,
                       const uint32_t *mask,
                       int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t a = ALPHA_8 (*dest);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vsrc, splat_alpha (vdest));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (dest[i]);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t                *dest,
                  const uint32_t          *src,
                  const uint32_t          *mask,
                  int                      width)
{
    if (mask)
        vmx_combine_in_u_mask (dest, src, mask, width);
    else
        vmx_combine_in_u_no_mask (dest, src, width);
}

static void
vmx_combine_in_reverse_u_no_mask (uint32_t       *dest,
                                  const uint32_t *src,
                                  int             width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t d = *dest;
        uint32_t a = ALPHA_8 (*src++);

        UN8x4_MUL_UN8 (d, a);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vdest, splat_alpha (vsrc));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (src[i]);

        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u_mask (uint32_t       *dest,
                               const uint32_t *src,
                               const uint32_t *mask,
                               int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t d = *dest;
        uint32_t a = *src++;

        UN8x4_MUL_UN8 (a, m);
        a = ALPHA_8 (a);
        UN8x4_MUL_UN8 (d, a);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vdest, splat_alpha (vsrc));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];

        UN8x4_MUL_UN8 (a, m);
        a = ALPHA_8 (a);
        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t                *dest,
                          const uint32_t          *src,
                          const uint32_t          *mask,
                          int                      width)
{
    if (mask)
        vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_in_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_out_u_no_mask (uint32_t *dest, const uint32_t *src, int width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s = *src++;
        uint32_t a = ALPHA_8 (~(*dest));

        UN8x4_MUL_UN8 (s, a);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_out_u_mask (uint32_t       *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t a = ALPHA_8 (~(*dest));

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);

        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

        dest[i] = s;
    }
}

static void
vmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    if (mask)
        vmx_combine_out_u_mask (dest, src, mask, width);
    else
        vmx_combine_out_u_no_mask (dest, src, width);
}

static void
vmx_combine_out_reverse_u_no_mask (uint32_t       *dest,
                                   const uint32_t *src,
                                   int             width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t d = *dest;
        uint32_t a = ALPHA_8 (~(*src++));

        UN8x4_MUL_UN8 (d, a);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {

        LOAD_VECTORS (dest, src);

        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (~src[i]);

        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u_mask (uint32_t       *dest,
                                const uint32_t *src,
                                const uint32_t *mask,
                                int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t d = *dest;
        uint32_t a = *src++;

        UN8x4_MUL_UN8 (a, m);
        a = ALPHA_8 (~a);
        UN8x4_MUL_UN8 (d, a);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];

        UN8x4_MUL_UN8 (a, m);
        a = ALPHA_8 (~a);
        UN8x4_MUL_UN8 (d, a);

        dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t                *dest,
                           const uint32_t          *src,
                           const uint32_t          *mask,
                           int                      width)
{
    if (mask)
        vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_out_reverse_u_no_mask (dest, src, width);
}

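/* ATOP: result = src * dest.alpha + dest * (1 - src.alpha).  The reverse,
 * XOR and ADD variants below differ only in which (possibly negated) alpha
 * factors are fed to pix_add_mul, or in skipping the multiply entirely. */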
static void
vmx_combine_atop_u_no_mask (uint32_t *dest, const uint32_t *src, int width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s      = *src++;
        uint32_t d      = *dest;
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest), vdest,
                             splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s      = src[i];
        uint32_t d      = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_u_mask (uint32_t       *dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m      = ALPHA_8 (*mask++);
        uint32_t s      = *src++;
        uint32_t d      = *dest;
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia;

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vsrc, splat_alpha (vdest), vdest,
                             splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m      = ALPHA_8 (mask[i]);
        uint32_t s      = src[i];
        uint32_t d      = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia;

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t                *dest,
                    const uint32_t          *src,
                    const uint32_t          *mask,
                    int                      width)
{
    if (mask)
        vmx_combine_atop_u_mask (dest, src, mask, width);
    else
        vmx_combine_atop_u_no_mask (dest, src, width);
}

static void
vmx_combine_atop_reverse_u_no_mask (uint32_t       *dest,
                                    const uint32_t *src,
                                    int             width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s       = *src++;
        uint32_t d       = *dest;
        uint32_t src_a   = ALPHA_8 (s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vdest, splat_alpha (vsrc), vsrc,
                             splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s       = src[i];
        uint32_t d       = dest[i];
        uint32_t src_a   = ALPHA_8 (s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u_mask (uint32_t       *dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t d = *dest;
        uint32_t src_a;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_a = ALPHA_8 (s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vdest, splat_alpha (vsrc), vsrc,
                             splat_alpha (negate (vdest)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_a;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_a = ALPHA_8 (s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

        dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t                *dest,
                            const uint32_t          *src,
                            const uint32_t          *mask,
                            int                      width)
{
    if (mask)
        vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    else
        vmx_combine_atop_reverse_u_no_mask (dest, src, width);
}

static void
vmx_combine_xor_u_no_mask (uint32_t *dest, const uint32_t *src, int width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s       = *src++;
        uint32_t d       = *dest;
        uint32_t src_ia  = ALPHA_8 (~s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), vdest,
                             splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s       = src[i];
        uint32_t d       = dest[i];
        uint32_t src_ia  = ALPHA_8 (~s);
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_xor_u_mask (uint32_t       *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t d = *dest;
        uint32_t src_ia;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        *dest++ = s;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), vdest,
                             splat_alpha (negate (vsrc)));

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_ia;
        uint32_t dest_ia = ALPHA_8 (~d);

        UN8x4_MUL_UN8 (s, m);

        src_ia = ALPHA_8 (~s);

        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

        dest[i] = s;
    }
}

static void
vmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    if (mask)
        vmx_combine_xor_u_mask (dest, src, mask, width);
    else
        vmx_combine_xor_u_no_mask (dest, src, width);
}

static void
vmx_combine_add_u_no_mask (uint32_t *dest, const uint32_t *src, int width)
{
    vector unsigned char vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t s = *src++;
        uint32_t d = *dest;

        UN8x4_ADD_UN8x4 (d, s);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKS (dest, src);
    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORS (dest, src);

        vdest = pix_add (vsrc, vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_ADD_UN8x4 (d, s);

        dest[i] = d;
    }
}

static void
vmx_combine_add_u_mask (uint32_t       *dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    vector unsigned char vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
    {
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t d = *dest;

        UN8x4_MUL_UN8 (s, m);
        UN8x4_ADD_UN8x4 (d, s);

        *dest++ = d;
        width--;
    }

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (int i = width / 4; i > 0; i--)
    {
        LOAD_VECTORSM (dest, src, mask);

        vdest = pix_add (vsrc, vdest);

        STORE_VECTOR (dest);

        src += 4;
        dest += 4;
        mask += 4;
    }

    for (int i = width % 4; --i >= 0;)
    {
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];

        UN8x4_MUL_UN8 (s, m);
        UN8x4_ADD_UN8x4 (d, s);

        dest[i] = d;
    }
}

static void
vmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t                *dest,
                   const uint32_t          *src,
                   const uint32_t          *mask,
                   int                      width)
{
    if (mask)
        vmx_combine_add_u_mask (dest, src, mask, width);
    else
        vmx_combine_add_u_no_mask (dest, src, width);
}

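/* Component-alpha (_ca) combiners: the mask carries a separate 8-bit
 * weight per color channel, so the source is multiplied channel-wise by
 * the whole mask pixel rather than by its alpha byte alone. */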
   1505 static void
   1506 vmx_combine_src_ca (pixman_implementation_t *imp,
   1507 	    pixman_op_t              op,
   1508 	    uint32_t                *dest,
   1509 	    const uint32_t          *src,
   1510 	    const uint32_t          *mask,
   1511 	    int                      width)
   1512 {
   1513    vector unsigned char vdest, vsrc, vmask;
   1514    DECLARE_SRC_MASK_VAR;
   1515    DECLARE_MASK_MASK_VAR;
   1516 
   1517    while (width && ((uintptr_t)dest & 15))
   1518    {
   1519 uint32_t a = *mask++;
   1520 uint32_t s = *src++;
   1521 
   1522 UN8x4_MUL_UN8x4 (s, a);
   1523 
   1524 *dest++ = s;
   1525 width--;
   1526    }
   1527 
   1528    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1529 
   1530    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1531    for (int i = width / 4; i > 0; i--)
   1532    {
   1533 LOAD_VECTORSC (dest, src, mask);
   1534 
   1535 vdest = pix_multiply (vsrc, vmask);
   1536 
   1537 STORE_VECTOR (dest);
   1538 
   1539 mask += 4;
   1540 src += 4;
   1541 dest += 4;
   1542    }
   1543 
   1544    for (int i = width % 4; --i >= 0;)
   1545    {
   1546 uint32_t a = mask[i];
   1547 uint32_t s = src[i];
   1548 
   1549 UN8x4_MUL_UN8x4 (s, a);
   1550 
   1551 dest[i] = s;
   1552    }
   1553 }
   1554 
   1555 static void
   1556 vmx_combine_over_ca (pixman_implementation_t *imp,
   1557 	     pixman_op_t              op,
   1558 	     uint32_t                *dest,
   1559 	     const uint32_t          *src,
   1560 	     const uint32_t          *mask,
   1561 	     int                      width)
   1562 {
   1563    vector unsigned char vdest, vsrc, vmask;
   1564    DECLARE_SRC_MASK_VAR;
   1565    DECLARE_MASK_MASK_VAR;
   1566 
   1567    while (width && ((uintptr_t)dest & 15))
   1568    {
   1569 uint32_t a  = *mask++;
   1570 uint32_t s  = *src++;
   1571 uint32_t d  = *dest;
   1572 uint32_t sa = ALPHA_8 (s);
   1573 
   1574 UN8x4_MUL_UN8x4 (s, a);
   1575 UN8x4_MUL_UN8 (a, sa);
   1576 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
   1577 
   1578 *dest++ = d;
   1579 width--;
   1580    }
   1581 
   1582    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1583 
   1584    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1585    for (int i = width / 4; i > 0; i--)
   1586    {
   1587 LOAD_VECTORSC (dest, src, mask);
   1588 
   1589 vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
   1590 
   1591 STORE_VECTOR (dest);
   1592 
   1593 mask += 4;
   1594 src += 4;
   1595 dest += 4;
   1596    }
   1597 
   1598    for (int i = width % 4; --i >= 0;)
   1599    {
   1600 uint32_t a  = mask[i];
   1601 uint32_t s  = src[i];
   1602 uint32_t d  = dest[i];
   1603 uint32_t sa = ALPHA_8 (s);
   1604 
   1605 UN8x4_MUL_UN8x4 (s, a);
   1606 UN8x4_MUL_UN8 (a, sa);
   1607 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
   1608 
   1609 dest[i] = d;
   1610    }
   1611 }
   1612 
   1613 static void
   1614 vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
   1615 		     pixman_op_t              op,
   1616 		     uint32_t                *dest,
   1617 		     const uint32_t          *src,
   1618 		     const uint32_t          *mask,
   1619 		     int                      width)
   1620 {
   1621    vector unsigned char vdest, vsrc, vmask;
   1622    DECLARE_SRC_MASK_VAR;
   1623    DECLARE_MASK_MASK_VAR;
   1624 
   1625    while (width && ((uintptr_t)dest & 15))
   1626    {
   1627 uint32_t a   = *mask++;
   1628 uint32_t s   = *src++;
   1629 uint32_t d   = *dest;
   1630 uint32_t ida = ALPHA_8 (~d);
   1631 
   1632 UN8x4_MUL_UN8x4 (s, a);
   1633 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
   1634 
   1635 *dest++ = s;
   1636 width--;
   1637    }
   1638 
   1639    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1640 
   1641    /* printf("%s\n",__PRETTY_FUNCTION__); */
   1642    for (int i = width / 4; i > 0; i--)
   1643    {
   1644 LOAD_VECTORSC (dest, src, mask);
   1645 
   1646 vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
   1647 
   1648 STORE_VECTOR (dest);
   1649 
   1650 mask += 4;
   1651 src += 4;
   1652 dest += 4;
   1653    }
   1654 
   1655    for (int i = width % 4; --i >= 0;)
   1656    {
   1657 uint32_t a   = mask[i];
   1658 uint32_t s   = src[i];
   1659 uint32_t d   = dest[i];
   1660 uint32_t ida = ALPHA_8 (~d);
   1661 
   1662 UN8x4_MUL_UN8x4 (s, a);
   1663 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
   1664 
   1665 dest[i] = s;
   1666    }
   1667 }
   1668 
   1669 static void
   1670 vmx_combine_in_ca (pixman_implementation_t *imp,
   1671 	   pixman_op_t              op,
   1672 	   uint32_t                *dest,
   1673 	   const uint32_t          *src,
   1674 	   const uint32_t          *mask,
   1675 	   int                      width)
   1676 {
   1677    vector unsigned char vdest, vsrc, vmask;
   1678    DECLARE_SRC_MASK_VAR;
   1679    DECLARE_MASK_MASK_VAR;
   1680 
   1681    while (width && ((uintptr_t)dest & 15))
   1682    {
   1683 uint32_t a  = *mask++;
   1684 uint32_t s  = *src++;
   1685 uint32_t da = ALPHA_8 (*dest);
   1686 
   1687 UN8x4_MUL_UN8x4 (s, a);
   1688 UN8x4_MUL_UN8 (s, da);
   1689 
   1690 *dest++ = s;
   1691 width--;
   1692    }
   1693 
   1694    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1695 
   1696    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1697    for (int i = width / 4; i > 0; i--)
   1698    {
   1699 LOAD_VECTORSC (dest, src, mask);
   1700 
   1701 vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
   1702 
   1703 STORE_VECTOR (dest);
   1704 
   1705 src += 4;
   1706 dest += 4;
   1707 mask += 4;
   1708    }
   1709 
   1710    for (int i = width % 4; --i >= 0;)
   1711    {
   1712 uint32_t a  = mask[i];
   1713 uint32_t s  = src[i];
   1714 uint32_t da = ALPHA_8 (dest[i]);
   1715 
   1716 UN8x4_MUL_UN8x4 (s, a);
   1717 UN8x4_MUL_UN8 (s, da);
   1718 
   1719 dest[i] = s;
   1720    }
   1721 }
   1722 
   1723 static void
   1724 vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
   1725 		   pixman_op_t              op,
   1726 		   uint32_t                *dest,
   1727 		   const uint32_t          *src,
   1728 		   const uint32_t          *mask,
   1729 		   int                      width)
   1730 {
   1731    vector unsigned char vdest, vsrc, vmask;
   1732    DECLARE_SRC_MASK_VAR;
   1733    DECLARE_MASK_MASK_VAR;
   1734 
   1735    while (width && ((uintptr_t)dest & 15))
   1736    {
   1737 uint32_t a  = *mask++;
   1738 uint32_t d  = *dest;
   1739 uint32_t sa = ALPHA_8 (*src++);
   1740 
   1741 UN8x4_MUL_UN8 (a, sa);
   1742 UN8x4_MUL_UN8x4 (d, a);
   1743 
   1744 *dest++ = d;
   1745 width--;
   1746    }
   1747 
   1748    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1749 
   1750    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1751    for (int i = width / 4; i > 0; i--)
   1752    {
   1753 
   1754 LOAD_VECTORSC (dest, src, mask);
   1755 
   1756 vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
   1757 
   1758 STORE_VECTOR (dest);
   1759 
   1760 src += 4;
   1761 dest += 4;
   1762 mask += 4;
   1763    }
   1764 
   1765    for (int i = width % 4; --i >= 0;)
   1766    {
   1767 uint32_t a  = mask[i];
   1768 uint32_t d  = dest[i];
   1769 uint32_t sa = ALPHA_8 (src[i]);
   1770 
   1771 UN8x4_MUL_UN8 (a, sa);
   1772 UN8x4_MUL_UN8x4 (d, a);
   1773 
   1774 dest[i] = d;
   1775    }
   1776 }
   1777 
   1778 static void
   1779 vmx_combine_out_ca (pixman_implementation_t *imp,
   1780 	    pixman_op_t              op,
   1781 	    uint32_t                *dest,
   1782 	    const uint32_t          *src,
   1783 	    const uint32_t          *mask,
   1784 	    int                      width)
   1785 {
   1786    vector unsigned char vdest, vsrc, vmask;
   1787    DECLARE_SRC_MASK_VAR;
   1788    DECLARE_MASK_MASK_VAR;
   1789 
   1790    while (width && ((uintptr_t)dest & 15))
   1791    {
   1792 uint32_t a  = *mask++;
   1793 uint32_t s  = *src++;
   1794 uint32_t d  = *dest;
   1795 uint32_t da = ALPHA_8 (~d);
   1796 
   1797 UN8x4_MUL_UN8x4 (s, a);
   1798 UN8x4_MUL_UN8 (s, da);
   1799 
   1800 *dest++ = s;
   1801 width--;
   1802    }
   1803 
   1804    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1805 
   1806    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1807    for (int i = width / 4; i > 0; i--)
   1808    {
   1809 LOAD_VECTORSC (dest, src, mask);
   1810 
   1811 vdest = pix_multiply (pix_multiply (vsrc, vmask),
   1812 		      splat_alpha (negate (vdest)));
   1813 
   1814 STORE_VECTOR (dest);
   1815 
   1816 src += 4;
   1817 dest += 4;
   1818 mask += 4;
   1819    }
   1820 
   1821    for (int i = width % 4; --i >= 0;)
   1822    {
   1823 uint32_t a  = mask[i];
   1824 uint32_t s  = src[i];
   1825 uint32_t d  = dest[i];
   1826 uint32_t da = ALPHA_8 (~d);
   1827 
   1828 UN8x4_MUL_UN8x4 (s, a);
   1829 UN8x4_MUL_UN8 (s, da);
   1830 
   1831 dest[i] = s;
   1832    }
   1833 }
   1834 
   1835 static void
   1836 vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
   1837 		    pixman_op_t              op,
   1838 		    uint32_t                *dest,
   1839 		    const uint32_t          *src,
   1840 		    const uint32_t          *mask,
   1841 		    int                      width)
   1842 {
   1843    vector unsigned char vdest, vsrc, vmask;
   1844    DECLARE_SRC_MASK_VAR;
   1845    DECLARE_MASK_MASK_VAR;
   1846 
   1847    while (width && ((uintptr_t)dest & 15))
   1848    {
   1849 uint32_t a  = *mask++;
   1850 uint32_t s  = *src++;
   1851 uint32_t d  = *dest;
   1852 uint32_t sa = ALPHA_8 (s);
   1853 
   1854 UN8x4_MUL_UN8 (a, sa);
   1855 UN8x4_MUL_UN8x4 (d, ~a);
   1856 
   1857 *dest++ = d;
   1858 width--;
   1859    }
   1860 
   1861    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1862 
   1863    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1864    for (int i = width / 4; i > 0; i--)
   1865    {
   1866 LOAD_VECTORSC (dest, src, mask);
   1867 
   1868 vdest = pix_multiply (
   1869     vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
   1870 
   1871 STORE_VECTOR (dest);
   1872 
   1873 src += 4;
   1874 dest += 4;
   1875 mask += 4;
   1876    }
   1877 
   1878    for (int i = width % 4; --i >= 0;)
   1879    {
   1880 uint32_t a  = mask[i];
   1881 uint32_t s  = src[i];
   1882 uint32_t d  = dest[i];
   1883 uint32_t sa = ALPHA_8 (s);
   1884 
   1885 UN8x4_MUL_UN8 (a, sa);
   1886 UN8x4_MUL_UN8x4 (d, ~a);
   1887 
   1888 dest[i] = d;
   1889    }
   1890 }
   1891 
   1892 static void
   1893 vmx_combine_atop_ca (pixman_implementation_t *imp,
   1894 	     pixman_op_t              op,
   1895 	     uint32_t                *dest,
   1896 	     const uint32_t          *src,
   1897 	     const uint32_t          *mask,
   1898 	     int                      width)
   1899 {
   1900    vector unsigned char vdest, vsrc, vmask, vsrca;
   1901    DECLARE_SRC_MASK_VAR;
   1902    DECLARE_MASK_MASK_VAR;
   1903 
   1904    while (width && ((uintptr_t)dest & 15))
   1905    {
   1906 uint32_t a  = *mask++;
   1907 uint32_t s  = *src++;
   1908 uint32_t d  = *dest;
   1909 uint32_t sa = ALPHA_8 (s);
   1910 uint32_t da = ALPHA_8 (d);
   1911 
   1912 UN8x4_MUL_UN8x4 (s, a);
   1913 UN8x4_MUL_UN8 (a, sa);
   1914 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
   1915 
   1916 *dest++ = d;
   1917 width--;
   1918    }
   1919 
   1920    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1921 
   1922    /* printf ("%s\n",__PRETTY_FUNCTION__); */
   1923    for (int i = width / 4; i > 0; i--)
   1924    {
   1925 LOAD_VECTORSC (dest, src, mask);
   1926 
   1927 vsrca = splat_alpha (vsrc);
   1928 
   1929 vsrc  = pix_multiply (vsrc, vmask);
   1930 vmask = pix_multiply (vmask, vsrca);
   1931 
   1932 vdest = pix_add_mul (vsrc, splat_alpha (vdest), negate (vmask), vdest);
   1933 
   1934 STORE_VECTOR (dest);
   1935 
   1936 src += 4;
   1937 dest += 4;
   1938 mask += 4;
   1939    }
   1940 
   1941    for (int i = width % 4; --i >= 0;)
   1942    {
   1943 uint32_t a  = mask[i];
   1944 uint32_t s  = src[i];
   1945 uint32_t d  = dest[i];
   1946 uint32_t sa = ALPHA_8 (s);
   1947 uint32_t da = ALPHA_8 (d);
   1948 
   1949 UN8x4_MUL_UN8x4 (s, a);
   1950 UN8x4_MUL_UN8 (a, sa);
   1951 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
   1952 
   1953 dest[i] = d;
   1954    }
   1955 }
   1956 
   1957 static void
   1958 vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
   1959 		     pixman_op_t              op,
   1960 		     uint32_t                *dest,
   1961 		     const uint32_t          *src,
   1962 		     const uint32_t          *mask,
   1963 		     int                      width)
   1964 {
   1965    vector unsigned char vdest, vsrc, vmask;
   1966    DECLARE_SRC_MASK_VAR;
   1967    DECLARE_MASK_MASK_VAR;
   1968 
   1969    while (width && ((uintptr_t)dest & 15))
   1970    {
   1971 uint32_t a  = *mask++;
   1972 uint32_t s  = *src++;
   1973 uint32_t d  = *dest;
   1974 uint32_t sa = ALPHA_8 (s);
   1975 uint32_t da = ALPHA_8 (~d);
   1976 
   1977 UN8x4_MUL_UN8x4 (s, a);
   1978 UN8x4_MUL_UN8 (a, sa);
   1979 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
   1980 
   1981 *dest++ = d;
   1982 width--;
   1983    }
   1984 
   1985    COMPUTE_SHIFT_MASKC (dest, src, mask);
   1986 
   1987    /* Four-pixel vector loop. */
   1988    for (int i = width / 4; i > 0; i--)
   1989    {
   1990 LOAD_VECTORSC (dest, src, mask);
   1991 
   1992 vdest = pix_add_mul (vdest, pix_multiply (vmask, splat_alpha (vsrc)),
   1993 		     pix_multiply (vsrc, vmask),
   1994 		     negate (splat_alpha (vdest)));
   1995 
   1996 STORE_VECTOR (dest);
   1997 
   1998 src += 4;
   1999 dest += 4;
   2000 mask += 4;
   2001    }
   2002 
   2003    for (int i = width % 4; --i >= 0;)
   2004    {
   2005 uint32_t a  = mask[i];
   2006 uint32_t s  = src[i];
   2007 uint32_t d  = dest[i];
   2008 uint32_t sa = ALPHA_8 (s);
   2009 uint32_t da = ALPHA_8 (~d);
   2010 
   2011 UN8x4_MUL_UN8x4 (s, a);
   2012 UN8x4_MUL_UN8 (a, sa);
   2013 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
   2014 
   2015 dest[i] = d;
   2016    }
   2017 }
   2018 
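        /*
         * Component-alpha XOR:
         *   dest = (src * mask) * ~dest.a + dest * ~(mask * src.a)
         */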
   2019 static void
   2020 vmx_combine_xor_ca (pixman_implementation_t *imp,
   2021 	    pixman_op_t              op,
   2022 	    uint32_t                *dest,
   2023 	    const uint32_t          *src,
   2024 	    const uint32_t          *mask,
   2025 	    int                      width)
   2026 {
   2027    vector unsigned char vdest, vsrc, vmask;
   2028    DECLARE_SRC_MASK_VAR;
   2029    DECLARE_MASK_MASK_VAR;
   2030 
   2031    while (width && ((uintptr_t)dest & 15))
   2032    {
   2033 uint32_t a  = *mask++;
   2034 uint32_t s  = *src++;
   2035 uint32_t d  = *dest;
   2036 uint32_t sa = ALPHA_8 (s);
   2037 uint32_t da = ALPHA_8 (~d);
   2038 
   2039 UN8x4_MUL_UN8x4 (s, a);
   2040 UN8x4_MUL_UN8 (a, sa);
   2041 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
   2042 
   2043 *dest++ = d;
   2044 width--;
   2045    }
   2046 
   2047    COMPUTE_SHIFT_MASKC (dest, src, mask);
   2048 
   2049    /* Four-pixel vector loop. */
   2050    for (int i = width / 4; i > 0; i--)
   2051    {
   2052 LOAD_VECTORSC (dest, src, mask);
   2053 
   2054 vdest = pix_add_mul (
   2055     vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))),
   2056     pix_multiply (vsrc, vmask), negate (splat_alpha (vdest)));
   2057 
   2058 STORE_VECTOR (dest);
   2059 
   2060 src += 4;
   2061 dest += 4;
   2062 mask += 4;
   2063    }
   2064 
   2065    for (int i = width % 4; --i >= 0;)
   2066    {
   2067 uint32_t a  = mask[i];
   2068 uint32_t s  = src[i];
   2069 uint32_t d  = dest[i];
   2070 uint32_t sa = ALPHA_8 (s);
   2071 uint32_t da = ALPHA_8 (~d);
   2072 
   2073 UN8x4_MUL_UN8x4 (s, a);
   2074 UN8x4_MUL_UN8 (a, sa);
   2075 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
   2076 
   2077 dest[i] = d;
   2078    }
   2079 }
   2080 
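        /* Component-alpha ADD: dest = saturate ((src * mask) + dest). */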
   2081 static void
   2082 vmx_combine_add_ca (pixman_implementation_t *imp,
   2083 	    pixman_op_t              op,
   2084 	    uint32_t                *dest,
   2085 	    const uint32_t          *src,
   2086 	    const uint32_t          *mask,
   2087 	    int                      width)
   2088 {
   2089    vector unsigned char vdest, vsrc, vmask;
   2090    DECLARE_SRC_MASK_VAR;
   2091    DECLARE_MASK_MASK_VAR;
   2092 
   2093    while (width && ((uintptr_t)dest & 15))
   2094    {
   2095 uint32_t a = *mask++;
   2096 uint32_t s = *src++;
   2097 uint32_t d = *dest;
   2098 
   2099 UN8x4_MUL_UN8x4 (s, a);
   2100 UN8x4_ADD_UN8x4 (s, d);
   2101 
   2102 *dest++ = s;
   2103 width--;
   2104    }
   2105 
   2106    COMPUTE_SHIFT_MASKC (dest, src, mask);
   2107 
   2108    /* Four-pixel vector loop. */
   2109    for (int i = width / 4; i > 0; i--)
   2110    {
   2111 LOAD_VECTORSC (dest, src, mask);
   2112 
   2113 vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
   2114 
   2115 STORE_VECTOR (dest);
   2116 
   2117 src += 4;
   2118 dest += 4;
   2119 mask += 4;
   2120    }
   2121 
   2122    for (int i = width % 4; --i >= 0;)
   2123    {
   2124 uint32_t a = mask[i];
   2125 uint32_t s = src[i];
   2126 uint32_t d = dest[i];
   2127 
   2128 UN8x4_MUL_UN8x4 (s, a);
   2129 UN8x4_ADD_UN8x4 (s, d);
   2130 
   2131 dest[i] = s;
   2132    }
   2133 }
   2134 
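        /*
         * OVER of a solid source through an a8 mask onto an 8888
         * destination.  Scalar loops handle the unaligned head and tail;
         * the main loop expands four mask bytes at a time and composites
         * with in_over () on 16-byte-aligned destination stores, with a
         * shortcut plain store when the source is opaque and the mask
         * is fully set.
         */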
   2135 static void
   2136 vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
   2137 		     pixman_composite_info_t *info)
   2138 {
   2139    PIXMAN_COMPOSITE_ARGS (info);
   2140    uint32_t  src, srca;
   2141    uint32_t *dst_line, *dst;
   2142    uint8_t  *mask_line;
   2143    int       dst_stride, mask_stride;
   2144    int32_t   w;
   2145    uint32_t  m, d, s, ia;
   2146 
   2147    vector unsigned char vsrc, valpha, vmask, vdst;
   2148 
   2149    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2150 
   2151    srca = ALPHA_8 (src);
   2152    if (src == 0)
   2153 return;
   2154 
   2155    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2156 		   dst_line, 1);
   2157    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
   2158 		   mask_line, 1);
   2159 
   2160    vsrc   = (vector unsigned char)create_mask_32_128 (src);
   2161    valpha = splat_alpha (vsrc);
   2162 
   2163    while (height--)
   2164    {
   2165 const uint8_t *pm = mask_line;
   2166 dst               = dst_line;
   2167 dst_line += dst_stride;
   2168 mask_line += mask_stride;
   2169 w = width;
   2170 
   2171 while (w && (uintptr_t)dst & 15)
   2172 {
   2173     s = src;
   2174     m = *pm++;
   2175 
   2176     if (m)
   2177     {
   2178 	d = *dst;
   2179 	UN8x4_MUL_UN8 (s, m);
   2180 	ia = ALPHA_8 (~s);
   2181 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
   2182 	*dst = d;
   2183     }
   2184 
   2185     w--;
   2186     dst++;
   2187 }
   2188 
   2189 while (w >= 4)
   2190 {
   2191     m = *((uint32_t *)pm);
   2192 
   2193     if (srca == 0xff && m == 0xffffffff)
   2194     {
   2195 	save_128_aligned (dst, vsrc);
   2196     }
   2197     else if (m)
   2198     {
   2199 	vmask = splat_pixel (
   2200 	    (vector unsigned char)create_mask_32_128 (m));
   2201 
   2202 	/* dst is 16-byte aligned */
   2203 	vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));
   2204 
   2205 	save_128_aligned (dst, vdst);
   2206     }
   2207 
   2208     w -= 4;
   2209     dst += 4;
   2210     pm += 4;
   2211 }
   2212 
   2213 while (w)
   2214 {
   2215     s = src;
   2216     m = *pm++;
   2217 
   2218     if (m)
   2219     {
   2220 	d = *dst;
   2221 	UN8x4_MUL_UN8 (s, m);
   2222 	ia = ALPHA_8 (~s);
   2223 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
   2224 	*dst = d;
   2225     }
   2226 
   2227     w--;
   2228     dst++;
   2229 }
   2230    }
   2231 }
   2232 
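        /*
         * Solid fill for 8, 16 and 32 bpp surfaces.  The filler is
         * replicated into a 128-bit vector; each scanline is brought to
         * 16-byte alignment with scalar stores, written out with vector
         * stores in 128/64/32/16-byte blocks, and finished with a
         * scalar tail.
         */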
   2233 static pixman_bool_t
   2234 vmx_fill (pixman_implementation_t *imp,
   2235   uint32_t                *bits,
   2236   int                      stride,
   2237   int                      bpp,
   2238   int                      x,
   2239   int                      y,
   2240   int                      width,
   2241   int                      height,
   2242   uint32_t                 filler)
   2243 {
   2244    uint32_t byte_width;
   2245    uint8_t *byte_line;
   2246 
   2247    vector unsigned int vfiller;
   2248 
   2249    if (bpp == 8)
   2250    {
   2251 uint8_t  b;
   2252 uint16_t w;
   2253 
   2254 stride     = stride * (int)sizeof (uint32_t);
   2255 byte_line  = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
   2256 byte_width = width;
   2258 
   2259 b      = filler & 0xff;
   2260 w      = (b << 8) | b;
   2261 filler = (w << 16) | w;
   2262    }
   2263    else if (bpp == 16)
   2264    {
   2265 stride     = stride * (int)sizeof (uint32_t) / 2;
   2266 byte_line  = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
   2267 byte_width = 2 * width;
   2268 stride *= 2;
   2269 
   2270 filler = (filler & 0xffff) * 0x00010001;
   2271    }
   2272    else if (bpp == 32)
   2273    {
   2274 stride     = stride * (int)sizeof (uint32_t) / 4;
   2275 byte_line  = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
   2276 byte_width = 4 * width;
   2277 stride *= 4;
   2278    }
   2279    else
   2280    {
   2281 return FALSE;
   2282    }
   2283 
   2284    vfiller = create_mask_32_128 (filler);
   2285 
   2286    while (height--)
   2287    {
   2288 int      w;
   2289 uint8_t *d = byte_line;
   2290 byte_line += stride;
   2291 w = byte_width;
   2292 
   2293 if (w >= 1 && ((uintptr_t)d & 1))
   2294 {
   2295     *(uint8_t *)d = filler;
   2296     w -= 1;
   2297     d += 1;
   2298 }
   2299 
   2300 while (w >= 2 && ((uintptr_t)d & 3))
   2301 {
   2302     *(uint16_t *)d = filler;
   2303     w -= 2;
   2304     d += 2;
   2305 }
   2306 
   2307 while (w >= 4 && ((uintptr_t)d & 15))
   2308 {
   2309     *(uint32_t *)d = filler;
   2310 
   2311     w -= 4;
   2312     d += 4;
   2313 }
   2314 
   2315 while (w >= 128)
   2316 {
   2317     vec_st (vfiller, 0, (uint32_t *)d);
   2318     vec_st (vfiller, 0, (uint32_t *)d + 4);
   2319     vec_st (vfiller, 0, (uint32_t *)d + 8);
   2320     vec_st (vfiller, 0, (uint32_t *)d + 12);
   2321     vec_st (vfiller, 0, (uint32_t *)d + 16);
   2322     vec_st (vfiller, 0, (uint32_t *)d + 20);
   2323     vec_st (vfiller, 0, (uint32_t *)d + 24);
   2324     vec_st (vfiller, 0, (uint32_t *)d + 28);
   2325 
   2326     d += 128;
   2327     w -= 128;
   2328 }
   2329 
   2330 if (w >= 64)
   2331 {
   2332     vec_st (vfiller, 0, (uint32_t *)d);
   2333     vec_st (vfiller, 0, (uint32_t *)d + 4);
   2334     vec_st (vfiller, 0, (uint32_t *)d + 8);
   2335     vec_st (vfiller, 0, (uint32_t *)d + 12);
   2336 
   2337     d += 64;
   2338     w -= 64;
   2339 }
   2340 
   2341 if (w >= 32)
   2342 {
   2343     vec_st (vfiller, 0, (uint32_t *)d);
   2344     vec_st (vfiller, 0, (uint32_t *)d + 4);
   2345 
   2346     d += 32;
   2347     w -= 32;
   2348 }
   2349 
   2350 if (w >= 16)
   2351 {
   2352     vec_st (vfiller, 0, (uint32_t *)d);
   2353 
   2354     d += 16;
   2355     w -= 16;
   2356 }
   2357 
   2358 while (w >= 4)
   2359 {
   2360     *(uint32_t *)d = filler;
   2361 
   2362     w -= 4;
   2363     d += 4;
   2364 }
   2365 
   2366 if (w >= 2)
   2367 {
   2368     *(uint16_t *)d = filler;
   2369     w -= 2;
   2370     d += 2;
   2371 }
   2372 
   2373 if (w >= 1)
   2374 {
   2375     *(uint8_t *)d = filler;
   2376     w -= 1;
   2377     d += 1;
   2378 }
   2379    }
   2380 
   2381    return TRUE;
   2382 }
   2383 
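        /*
         * SRC from x8r8g8b8 to a8r8g8b8: copy the pixels while forcing
         * the alpha byte to 0xff, sixteen pixels per vector iteration.
         */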
   2384 static void
   2385 vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
   2386 		     pixman_composite_info_t *info)
   2387 {
   2388    PIXMAN_COMPOSITE_ARGS (info);
   2389    uint32_t *dst_line, *dst;
   2390    uint32_t *src_line, *src;
   2391    int32_t   w;
   2392    int       dst_stride, src_stride;
   2393 
   2394    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2395 		   dst_line, 1);
   2396    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
   2397 		   src_line, 1);
   2398 
   2399    while (height--)
   2400    {
   2401 dst = dst_line;
   2402 dst_line += dst_stride;
   2403 src = src_line;
   2404 src_line += src_stride;
   2405 w = width;
   2406 
   2407 while (w && (uintptr_t)dst & 15)
   2408 {
   2409     *dst++ = *src++ | 0xff000000;
   2410     w--;
   2411 }
   2412 
   2413 while (w >= 16)
   2414 {
   2415     vector unsigned char vmx_src1, vmx_src2, vmx_src3, vmx_src4;
   2416 
   2417     vmx_src1 = load_128_unaligned (src);
   2418     vmx_src2 = load_128_unaligned (src + 4);
   2419     vmx_src3 = load_128_unaligned (src + 8);
   2420     vmx_src4 = load_128_unaligned (src + 12);
   2421 
   2422     save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
   2423     save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
   2424     save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
   2425     save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
   2426 
   2427     dst += 16;
   2428     src += 16;
   2429     w -= 16;
   2430 }
   2431 
   2432 while (w)
   2433 {
   2434     *dst++ = *src++ | 0xff000000;
   2435     w--;
   2436 }
   2437    }
   2438 }
   2439 
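        /*
         * OVER of a solid source onto an 8888 destination:
         *   dest = src + dest * ~src.a
         * with the negated source alpha precomputed both as a vector
         * (via) and as a scalar (ia) for the unaligned head and tail.
         */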
   2440 static void
   2441 vmx_composite_over_n_8888 (pixman_implementation_t *imp,
   2442 		   pixman_composite_info_t *info)
   2443 {
   2444    PIXMAN_COMPOSITE_ARGS (info);
   2445    uint32_t *dst_line, *dst;
   2446    uint32_t  src, ia;
   2447    int       w, dst_stride;
   2448 
   2449    vector unsigned char vdst, vsrc, via;
   2450 
   2451    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2452 
   2453    if (src == 0)
   2454 return;
   2455 
   2456    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2457 		   dst_line, 1);
   2458 
   2459    vsrc = (vector unsigned char)create_mask_32_128 (src);
   2460    via  = negate (splat_alpha (vsrc));
   2461    ia   = ALPHA_8 (~src);
   2462 
   2463    while (height--)
   2464    {
   2465 dst = dst_line;
   2466 dst_line += dst_stride;
   2467 w = width;
   2468 
   2469 while (w && ((uintptr_t)dst & 15))
   2470 {
   2471     uint32_t d = *dst;
   2472     UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
   2473     *dst++ = d;
   2474     w--;
   2475 }
   2476 
   2477 for (int i = w / 4; i > 0; i--)
   2478 {
   2479     vdst = pix_multiply (load_128_aligned (dst), via);
   2480     save_128_aligned (dst, pix_add (vsrc, vdst));
   2481     dst += 4;
   2482 }
   2483 
   2484 for (int i = w % 4; --i >= 0;)
   2485 {
   2486     uint32_t d = dst[i];
   2487     UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
   2488     dst[i] = d;
   2489 }
   2490    }
   2491 }
   2492 
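        /* OVER between two 8888 images, delegated one scanline at a time
         * to vmx_combine_over_u (). */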
   2493 static void
   2494 vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
   2495 		      pixman_composite_info_t *info)
   2496 {
   2497    PIXMAN_COMPOSITE_ARGS (info);
   2498    int       dst_stride, src_stride;
   2499    uint32_t *dst_line, *dst;
   2500    uint32_t *src_line, *src;
   2501 
   2502    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2503 		   dst_line, 1);
   2504    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
   2505 		   src_line, 1);
   2506 
   2507    dst = dst_line;
   2508    src = src_line;
   2509 
   2510    while (height--)
   2511    {
   2512 vmx_combine_over_u (imp, op, dst, src, NULL, width);
   2513 
   2514 dst += dst_stride;
   2515 src += src_stride;
   2516    }
   2517 }
   2518 
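        /*
         * Component-alpha OVER of a solid source through an 8888 mask:
         *   dest = src * mask + dest * ~(mask * src.a)
         * Vector iterations whose mask block is entirely zero leave the
         * destination untouched.
         */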
   2519 static void
   2520 vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
   2521 			   pixman_composite_info_t *info)
   2522 {
   2523    PIXMAN_COMPOSITE_ARGS (info);
   2524    uint32_t  src, sa;
   2525    uint32_t *dst_line, d;
   2526    uint32_t *mask_line, m;
   2527    uint32_t  pack_cmp;
   2528    int       dst_stride, mask_stride;
   2529 
   2530    vector unsigned char vsrc, valpha, vmask, vdest;
   2531 
   2532    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2533 
   2534    if (src == 0)
   2535 return;
   2536 
   2537    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2538 		   dst_line, 1);
   2539    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
   2540 		   mask_line, 1);
   2541 
   2542    vsrc   = (vector unsigned char)create_mask_32_128 (src);
   2543    valpha = splat_alpha (vsrc);
   2544    sa     = ALPHA_8 (src);
   2545 
   2546    while (height--)
   2547    {
   2548 int             w  = width;
   2549 const uint32_t *pm = (uint32_t *)mask_line;
   2550 uint32_t       *pd = (uint32_t *)dst_line;
   2551 uint32_t        s;
   2552 
   2553 dst_line += dst_stride;
   2554 mask_line += mask_stride;
   2555 
   2556 while (w && (uintptr_t)pd & 15)
   2557 {
   2558     s = src;
   2559     m = *pm++;
   2560 
   2561     if (m)
   2562     {
   2563 	d = *pd;
   2564 	UN8x4_MUL_UN8x4 (s, m);
   2565 	UN8x4_MUL_UN8 (m, sa);
   2566 	m = ~m;
   2567 	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
   2568 	*pd = d;
   2569     }
   2570 
   2571     pd++;
   2572     w--;
   2573 }
   2574 
   2575 while (w >= 4)
   2576 {
   2577     /* pm is NOT necessarily 16-byte aligned */
   2578     vmask = load_128_unaligned (pm);
   2579 
   2580     pack_cmp = vec_all_eq (vmask, vzero);
   2581 
   2582 /* vec_all_eq () is non-zero only when the whole mask is zero; skip fully masked-out blocks */
   2583     if (pack_cmp == 0)
   2584     {
   2585 	/* pd is 16-byte aligned */
   2586 	vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));
   2587 
   2588 	save_128_aligned (pd, vdest);
   2589     }
   2590 
   2591     pd += 4;
   2592     pm += 4;
   2593     w -= 4;
   2594 }
   2595 
   2596 while (w)
   2597 {
   2598     s = src;
   2599     m = *pm++;
   2600 
   2601     if (m)
   2602     {
   2603 	d = *pd;
   2604 	UN8x4_MUL_UN8x4 (s, m);
   2605 	UN8x4_MUL_UN8 (m, sa);
   2606 	m = ~m;
   2607 	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
   2608 	*pd = d;
   2609     }
   2610 
   2611     pd++;
   2612     w--;
   2613 }
   2614    }
   2615 }
   2616 
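        /*
         * Saturating ADD between two a8 images.  The scalar head and
         * tail use the carry trick t | (0 - (t >> 8)), which yields 0xff
         * whenever the 16-bit sum overflows a byte; the aligned middle
         * is handed to vmx_combine_add_u (), which treats the bytes as
         * 32-bit words.
         */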
   2617 static void
   2618 vmx_composite_add_8_8 (pixman_implementation_t *imp,
   2619 	       pixman_composite_info_t *info)
   2620 {
   2621    PIXMAN_COMPOSITE_ARGS (info);
   2622    uint8_t *dst_line, *dst;
   2623    uint8_t *src_line, *src;
   2624    int      dst_stride, src_stride;
   2625    int32_t  w;
   2626    uint16_t t;
   2627 
   2628    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride,
   2629 		   src_line, 1);
   2630    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
   2631 		   dst_line, 1);
   2632 
   2633    while (height--)
   2634    {
   2635 dst = dst_line;
   2636 src = src_line;
   2637 
   2638 dst_line += dst_stride;
   2639 src_line += src_stride;
   2640 w = width;
   2641 
   2642 /* Small head */
   2643 while (w && (uintptr_t)dst & 3)
   2644 {
   2645     t      = (*dst) + (*src++);
   2646     *dst++ = t | (0 - (t >> 8));
   2647     w--;
   2648 }
   2649 
   2650 vmx_combine_add_u (imp, op, (uint32_t *)dst, (uint32_t *)src, NULL,
   2651 		   w >> 2);
   2652 
   2653 /* Small tail */
   2654 dst += w & ~3;
   2655 src += w & ~3;
   2656 
   2657 w &= 3;
   2658 
   2659 while (w)
   2660 {
   2661     t      = (*dst) + (*src++);
   2662     *dst++ = t | (0 - (t >> 8));
   2663     w--;
   2664 }
   2665    }
   2666 }
   2667 
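        /* ADD between two 8888 images, delegated one scanline at a time
         * to vmx_combine_add_u (). */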
   2668 static void
   2669 vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
   2670 		     pixman_composite_info_t *info)
   2671 {
   2672    PIXMAN_COMPOSITE_ARGS (info);
   2673    uint32_t *dst_line, *dst;
   2674    uint32_t *src_line, *src;
   2675    int       dst_stride, src_stride;
   2676 
   2677    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
   2678 		   src_line, 1);
   2679    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2680 		   dst_line, 1);
   2681 
   2682    while (height--)
   2683    {
   2684 dst = dst_line;
   2685 dst_line += dst_stride;
   2686 src = src_line;
   2687 src_line += src_stride;
   2688 
   2689 vmx_combine_add_u (imp, op, dst, src, NULL, width);
   2690    }
   2691 }
   2692 
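        /*
         * One scanline of nearest-neighbour scaled OVER.  vx steps
         * through the source in 16.16 fixed point, wrapping against
         * src_width_fixed; four source pixels are gathered per
         * iteration, with shortcuts when the gathered block is fully
         * opaque (plain store) or fully transparent (no-op).
         */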
   2693 static force_inline void
   2694 scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t       *pd,
   2695 				    const uint32_t *ps,
   2696 				    int32_t         w,
   2697 				    pixman_fixed_t  vx,
   2698 				    pixman_fixed_t  unit_x,
   2699 				    pixman_fixed_t  src_width_fixed,
   2700 				    pixman_bool_t fully_transparent_src)
   2701 {
   2702    uint32_t        s, d;
   2703    const uint32_t *pm = NULL;
   2704 
   2705    vector unsigned char vsrc, vdst;
   2706 
   2707    if (fully_transparent_src)
   2708 return;
   2709 
   2710    /* Align the destination pointer pd on a 16-byte boundary */
   2711    while (w && ((uintptr_t)pd & 15))
   2712    {
   2713 d = *pd;
   2714 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
   2715 vx += unit_x;
   2716 while (vx >= 0)
   2717     vx -= src_width_fixed;
   2718 
   2719 *pd++ = core_combine_over_u_pixel_vmx (s, d);
   2720 if (pm)
   2721     pm++;
   2722 w--;
   2723    }
   2724 
   2725    while (w >= 4)
   2726    {
   2727 uint32_t tmp[4];
   2728 
   2729 tmp[0] = *(ps + pixman_fixed_to_int (vx));
   2730 vx += unit_x;
   2731 while (vx >= 0)
   2732     vx -= src_width_fixed;
   2733 tmp[1] = *(ps + pixman_fixed_to_int (vx));
   2734 vx += unit_x;
   2735 while (vx >= 0)
   2736     vx -= src_width_fixed;
   2737 tmp[2] = *(ps + pixman_fixed_to_int (vx));
   2738 vx += unit_x;
   2739 while (vx >= 0)
   2740     vx -= src_width_fixed;
   2741 tmp[3] = *(ps + pixman_fixed_to_int (vx));
   2742 vx += unit_x;
   2743 while (vx >= 0)
   2744     vx -= src_width_fixed;
   2745 
   2746 vsrc = combine4 (tmp, pm);
   2747 
   2748 if (is_opaque (vsrc))
   2749 {
   2750     save_128_aligned (pd, vsrc);
   2751 }
   2752 else if (!is_zero (vsrc))
   2753 {
   2754     vdst = over (vsrc, splat_alpha (vsrc), load_128_aligned (pd));
   2755 
   2756     save_128_aligned (pd, vdst);
   2757 }
   2758 
   2759 w -= 4;
   2760 pd += 4;
   2761 if (pm)
   2762     pm += 4;
   2763    }
   2764 
   2765    while (w)
   2766    {
   2767 d = *pd;
   2768 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
   2769 vx += unit_x;
   2770 while (vx >= 0)
   2771     vx -= src_width_fixed;
   2772 
   2773 *pd++ = core_combine_over_u_pixel_vmx (s, d);
   2774 if (pm)
   2775     pm++;
   2776 
   2777 w--;
   2778    }
   2779 }
   2780 
   2781 /* clang-format off */
   2782 FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
   2783 	       scaled_nearest_scanline_vmx_8888_8888_OVER,
   2784 	       uint32_t, uint32_t, COVER)
   2785 FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
   2786 	       scaled_nearest_scanline_vmx_8888_8888_OVER,
   2787 	       uint32_t, uint32_t, NONE)
   2788 FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
   2789 	       scaled_nearest_scanline_vmx_8888_8888_OVER,
   2790 	       uint32_t, uint32_t, PAD)
   2791 FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
   2792 	       scaled_nearest_scanline_vmx_8888_8888_OVER,
   2793 	       uint32_t, uint32_t, NORMAL)
   2794 
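        /* Table of composite operations with dedicated VMX fast paths;
         * anything not listed here falls through to the fallback
         * implementation. */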
   2795 static const pixman_fast_path_t vmx_fast_paths[] =
   2796 {
   2797    PIXMAN_STD_FAST_PATH (OVER, solid,    null, a8r8g8b8, vmx_composite_over_n_8888),
   2798    PIXMAN_STD_FAST_PATH (OVER, solid,    null, x8r8g8b8, vmx_composite_over_n_8888),
   2799    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
   2800    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
   2801    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
   2802    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
   2803    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
   2804    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
   2805    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
   2806    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
   2807    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
   2808    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
   2809    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
   2810    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),
   2811 
   2812    /* PIXMAN_OP_ADD */
   2813    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
   2814    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
   2815    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),
   2816 
   2817    /* PIXMAN_OP_SRC */
   2818    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
   2819    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),
   2820 
   2821    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
   2822    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
   2823    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
   2824    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),
   2825 
   2826    {   PIXMAN_OP_NONE	},
   2827 };
   2828 /* clang-format on */
   2829 
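        /* Iterator scanline fetcher for x8r8g8b8: copy the scanline into
         * the iterator buffer while forcing the alpha byte to 0xff. */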
   2830 static uint32_t *
   2831 vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
   2832 {
   2833    int       w   = iter->width;
   2834    uint32_t *dst = iter->buffer;
   2835    uint32_t *src = (uint32_t *)iter->bits;
   2836 
   2837    iter->bits += iter->stride;
   2838 
   2839    while (w && ((uintptr_t)dst) & 0x0f)
   2840    {
   2841 *dst++ = (*src++) | 0xff000000;
   2842 w--;
   2843    }
   2844 
   2845    while (w >= 4)
   2846    {
   2847 save_128_aligned (dst,
   2848 		  vec_or (load_128_unaligned (src), mask_ff000000));
   2849 
   2850 dst += 4;
   2851 src += 4;
   2852 w -= 4;
   2853    }
   2854 
   2855    while (w)
   2856    {
   2857 *dst++ = (*src++) | 0xff000000;
   2858 w--;
   2859    }
   2860 
   2861    return iter->buffer;
   2862 }
   2863 
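        /* Iterator scanline fetcher for a8: widen each alpha byte into
         * the top byte of a 32-bit pixel, sixteen pixels per vector
         * iteration. */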
   2864 static uint32_t *
   2865 vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
   2866 {
   2867    int       w   = iter->width;
   2868    uint32_t *dst = iter->buffer;
   2869    uint8_t  *src = iter->bits;
   2870 
   2871    vector unsigned char vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;
   2872 
   2873    iter->bits += iter->stride;
   2874 
   2875    while (w && (((uintptr_t)dst) & 15))
   2876    {
   2877 *dst++ = *(src++) << 24;
   2878 w--;
   2879    }
   2880 
   2881    while (w >= 16)
   2882    {
   2883 vmx0 = load_128_unaligned ((uint32_t *)src);
   2884 
   2885 unpack_128_2x128 (vzero, vmx0, &vmx1, &vmx2);
   2886 unpack_128_2x128 (vzero, vmx1, &vmx3, &vmx4);
   2887 unpack_128_2x128 (vzero, vmx2, &vmx5, &vmx6);
   2888 
   2889 save_128_aligned (dst, vmx6);
   2890 save_128_aligned ((dst + 4), vmx5);
   2891 save_128_aligned ((dst + 8), vmx4);
   2892 save_128_aligned ((dst + 12), vmx3);
   2893 
   2894 dst += 16;
   2895 src += 16;
   2896 w -= 16;
   2897    }
   2898 
   2899    while (w)
   2900    {
   2901 *dst++ = *(src++) << 24;
   2902 w--;
   2903    }
   2904 
   2905    return iter->buffer;
   2906 }
   2907 
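        /* Flags a bits image must satisfy for the iterators below. */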
   2908 #define IMAGE_FLAGS                                                            \
   2909    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                       \
   2910     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
   2911 
   2912 /* clang-format off */
   2913 static const pixman_iter_info_t vmx_iters[] =
   2914 {
   2915    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
   2916      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
   2917    },
   2918    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
   2919      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
   2920    },
   2921    { PIXMAN_null },
   2922 };
   2923 /* clang-format on */
   2924 
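        /*
         * Create the VMX implementation on top of the given fallback:
         * initialize the vector constants used above and register the
         * unified and component-alpha combiners, the fill helper and
         * the scanline iterators.
         */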
   2925 pixman_implementation_t *
   2926 _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
   2927 {
   2928    pixman_implementation_t *imp = _pixman_implementation_create (
   2929 fallback, vmx_fast_paths);
   2930 
   2931    /* VMX constants */
   2932    mask_ff000000 = (vector unsigned char)create_mask_32_128 (0xff000000);
   2933 
   2934    /* Set up function pointers */
   2935 
   2936    imp->combine_32[PIXMAN_OP_OVER]         = vmx_combine_over_u;
   2937    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
   2938    imp->combine_32[PIXMAN_OP_IN]           = vmx_combine_in_u;
   2939    imp->combine_32[PIXMAN_OP_IN_REVERSE]   = vmx_combine_in_reverse_u;
   2940    imp->combine_32[PIXMAN_OP_OUT]          = vmx_combine_out_u;
   2941    imp->combine_32[PIXMAN_OP_OUT_REVERSE]  = vmx_combine_out_reverse_u;
   2942    imp->combine_32[PIXMAN_OP_ATOP]         = vmx_combine_atop_u;
   2943    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
   2944    imp->combine_32[PIXMAN_OP_XOR]          = vmx_combine_xor_u;
   2945 
   2946    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;
   2947 
   2948    imp->combine_32_ca[PIXMAN_OP_SRC]          = vmx_combine_src_ca;
   2949    imp->combine_32_ca[PIXMAN_OP_OVER]         = vmx_combine_over_ca;
   2950    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
   2951    imp->combine_32_ca[PIXMAN_OP_IN]           = vmx_combine_in_ca;
   2952    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE]   = vmx_combine_in_reverse_ca;
   2953    imp->combine_32_ca[PIXMAN_OP_OUT]          = vmx_combine_out_ca;
   2954    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE]  = vmx_combine_out_reverse_ca;
   2955    imp->combine_32_ca[PIXMAN_OP_ATOP]         = vmx_combine_atop_ca;
   2956    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
   2957    imp->combine_32_ca[PIXMAN_OP_XOR]          = vmx_combine_xor_ca;
   2958    imp->combine_32_ca[PIXMAN_OP_ADD]          = vmx_combine_add_ca;
   2959 
   2960    imp->fill = vmx_fill;
   2961 
   2962    imp->iter_info = vmx_iters;
   2963 
   2964    return imp;
   2965 }