tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-rvv.c (113605B)


      1 /*
      2 * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
      3 *             2005 Lars Knoll & Zack Rusin, Trolltech
      4 *             2024 Filip Wasil, Samsung Electronics
      5 *             2024 Bernard Gingold, Samsung Electronics
      6 *             2025 Marek Pikuła, Samsung Electronics
      7 * Permission to use, copy, modify, distribute, and sell this software and its
      8 * documentation for any purpose is hereby granted without fee, provided that
      9 * the above copyright notice appear in all copies and that both that
     10 * copyright notice and this permission notice appear in supporting
     11 * documentation, and that the name of Keith Packard not be used in
     12 * advertising or publicity pertaining to distribution of the software without
     13 * specific, written prior permission.  Keith Packard makes no
     14 * representations about the suitability of this software for any purpose.  It
     15 * is provided "as is" without express or implied warranty.
     16 *
     17 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     18 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     19 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     20 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     21 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     22 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     23 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     24 * SOFTWARE.
     25 */
     26 
     27 #ifdef HAVE_CONFIG_H
     28 #include <pixman-config.h>
     29 #endif
     30 
     31 #include "pixman-combine-float.h"
     32 #include "pixman-combine32.h"
     33 #include "pixman-inlines.h"
     34 #include "pixman-private.h"
     35 
     36 #include <riscv_vector.h>
     37 
     38 #include <float.h>
     39 #include <math.h>
     40 #include <stdbool.h>
     41 #include <stddef.h>
     42 #include <stdint.h>
     43 #include <stdio.h>
     44 #include <stdlib.h>
     45 #include <string.h>
     46 
// Convenience macros {

// Advance pointer p by vl elements; used as a comma-expression in the
// step clause of the strip-mining loops below.
// NOTE(review): identifiers starting with "__" are reserved for the
// implementation; a project-prefixed name would be safer.
#define __FE_PTR(p, vl) ((p) += (vl))

// Loop-header fragment for a strip-mined RVV loop: declares vn (elements
// remaining) and vl (active vector length for the element spec, e.g.
// e32m1), then supplies the continuation test "vn > 0".  Expands to the
// init and condition clauses of a for-statement.
#define _RVV_FE_PRE(total_len, vn, vl, vspec)                                  \
    size_t vn = total_len, vl = __riscv_vsetvl_##vspec (vn);                   \
    vn > 0

// Step fragment: consume vl elements and recompute vl for the next strip.
#define _RVV_FE_POST(vn, vl, vspec) vn -= (vl), vl = __riscv_vsetvl_##vspec (vn)

// Strip-mined foreach over total_len elements, advancing one pointer per
// iteration by the active vector length vl.
#define RVV_FOREACH_1(total_len, vl, vspec, p1)                                \
    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
	 __FE_PTR (p1, vl), _RVV_FE_POST (vn, vl, vspec))

// As RVV_FOREACH_1, but advances two pointers in lockstep.
#define RVV_FOREACH_2(total_len, vl, vspec, p1, p2)                            \
    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
	 __FE_PTR (p1, vl), __FE_PTR (p2, vl), _RVV_FE_POST (vn, vl, vspec))

// As RVV_FOREACH_1, but advances three pointers in lockstep.
#define RVV_FOREACH_3(total_len, vl, vspec, p1, p2, p3)                        \
    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
	 __FE_PTR (p1, vl), __FE_PTR (p2, vl), __FE_PTR (p3, vl),              \
	 _RVV_FE_POST (vn, vl, vspec))

// vuintXXmYY_t for use in macros (less token concatenation).
#define VUINT(ELEN, LMUL) vuint##ELEN##LMUL##_t
#define VUINT32(LMUL)     VUINT (32, LMUL)
#define VUINT16(LMUL)     VUINT (16, LMUL)
#define VUINT8(LMUL)      VUINT (8, LMUL)

// Short for vreinterpret commonly used for ARGB batch operations:
// view 4 packed u8 channels as one u32 pixel, and back.
#define RVV_U8x4_U32(LMUL, value)                                              \
    __riscv_vreinterpret_v_u8##LMUL##_u32##LMUL (value)
#define RVV_U8x4_U32_m2(value) RVV_U8x4_U32 (m2, value)
#define RVV_U8x4_U32_m4(value) RVV_U8x4_U32 (m4, value)

#define RVV_U32_U8x4(LMUL, value)                                              \
    __riscv_vreinterpret_v_u32##LMUL##_u8##LMUL (value)
#define RVV_U32_U8x4_m2(value) RVV_U32_U8x4 (m2, value)
#define RVV_U32_U8x4_m4(value) RVV_U32_U8x4 (m4, value)

// }
     88 
     89 // Float implementation
     90 
     91 /*
     92 * Screen
     93 *
     94 *      ad * as * B(d/ad, s/as)
     95 *    = ad * as * (d/ad + s/as - s/as * d/ad)
     96 *    = ad * s + as * d - s * d
     97 */
     98 
     99 static force_inline vfloat32m1_t
    100 rvv_blend_screen_float (const vfloat32m1_t sa,
    101 		const vfloat32m1_t s,
    102 		const vfloat32m1_t da,
    103 		const vfloat32m1_t d,
    104 		size_t             vl)
    105 {
    106    vfloat32m1_t t0, t1, t2;
    107    t0 = __riscv_vfmul_vv_f32m1 (s, da, vl);
    108    t1 = __riscv_vfmul_vv_f32m1 (d, sa, vl);
    109    t2 = __riscv_vfmul_vv_f32m1 (s, d, vl);
    110    return __riscv_vfsub_vv_f32m1 (__riscv_vfadd_vv_f32m1 (t0, t1, vl), t2, vl);
    111 }
    112 
    113 /*
    114 * Multiply
    115 *
    116 *      ad * as * B(d / ad, s / as)
    117 *    = ad * as * d/ad * s/as
    118 *    = d * s
    119 *
    120 */
    121 
    122 static force_inline vfloat32m1_t
    123 rvv_blend_multiply_float (const vfloat32m1_t sa,
    124 		  const vfloat32m1_t s,
    125 		  const vfloat32m1_t da,
    126 		  const vfloat32m1_t d,
    127 		  size_t             vl)
    128 {
    129    return __riscv_vfmul_vv_f32m1 (s, d, vl);
    130 }
    131 
    132 /*
    133 * Overlay
    134 *
    135 *     ad * as * B(d/ad, s/as)
    136 *   = ad * as * Hardlight (s, d)
    137 *   = if (d / ad < 0.5)
    138 *         as * ad * Multiply (s/as, 2 * d/ad)
    139 *     else
    140 *         as * ad * Screen (s/as, 2 * d / ad - 1)
    141 *   = if (d < 0.5 * ad)
    142 *         as * ad * s/as * 2 * d /ad
    143 *     else
    144 *         as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1))
    145 *   = if (2 * d < ad)
    146 *         2 * s * d
    147 *     else
    148 *         ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1)
    149 *   = if (2 * d < ad)
    150 *         2 * s * d
    151 *     else
    152 *         as * ad - 2 * (ad - d) * (as - s)
    153 */
    154 
    155 static force_inline vfloat32m1_t
    156 rvv_blend_overlay_float (const vfloat32m1_t sa,
    157 		 const vfloat32m1_t s,
    158 		 const vfloat32m1_t da,
    159 		 const vfloat32m1_t d,
    160 		 size_t             vl)
    161 {
    162    vfloat32m1_t t0, t1, t2, t3, t4, f0, f1, f2;
    163    vbool32_t    vb;
    164    t0 = __riscv_vfadd_vv_f32m1 (d, d, vl);
    165    t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl);
    166    vb = __riscv_vmflt_vv_f32m1_b32 (t0, da, vl);
    167    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl);
    168    f2 = __riscv_vfsub_vv_f32m1 (da, d, vl);
    169    t3 = __riscv_vfmul_vf_f32m1 (f2, 2.0f, vl);
    170    t4 = __riscv_vfsub_vv_f32m1 (sa, s, vl);
    171    f0 = __riscv_vfmul_vv_f32m1 (t3, t4, vl);
    172    f1 = __riscv_vfsub_vv_f32m1 (t2, f0, vl);
    173    return __riscv_vmerge_vvm_f32m1 (f1, t1, vb, vl);
    174 }
    175 
    176 /*
    177 * Darken
    178 *
    179 *     ad * as * B(d/ad, s/as)
    180 *   = ad * as * MIN(d/ad, s/as)
    181 *   = MIN (as * d, ad * s)
    182 */
    183 
    184 static force_inline vfloat32m1_t
    185 rvv_blend_darken_float (const vfloat32m1_t sa,
    186 		const vfloat32m1_t s,
    187 		const vfloat32m1_t da,
    188 		const vfloat32m1_t d,
    189 		size_t             vl)
    190 {
    191    vfloat32m1_t ss, dd;
    192    vbool32_t    vb;
    193    ss = __riscv_vfmul_vv_f32m1 (da, s, vl);
    194    dd = __riscv_vfmul_vv_f32m1 (sa, d, vl);
    195    vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl);
    196    return __riscv_vmerge_vvm_f32m1 (ss, dd, vb, vl);
    197 }
    198 
    199 /*
    200 * Lighten
    201 *
    202 *     ad * as * B(d/ad, s/as)
    203 *   = ad * as * MAX(d/ad, s/as)
    204 *   = MAX (as * d, ad * s)
    205 */
    206 
    207 static force_inline vfloat32m1_t
    208 rvv_blend_lighten_float (const vfloat32m1_t sa,
    209 		 const vfloat32m1_t s,
    210 		 const vfloat32m1_t da,
    211 		 const vfloat32m1_t d,
    212 		 size_t             vl)
    213 {
    214    vfloat32m1_t ss, dd;
    215    vbool32_t    vb;
    216    ss = __riscv_vfmul_vv_f32m1 (s, da, vl);
    217    dd = __riscv_vfmul_vv_f32m1 (d, sa, vl);
    218    vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl);
    219    return __riscv_vmerge_vvm_f32m1 (dd, ss, vb, vl);
    220 }
    221 
    222 /*
    223 * Color dodge
    224 *
    225 *     ad * as * B(d/ad, s/as)
    226 *   = if d/ad = 0
    227 *         ad * as * 0
     228 *     else if (d/ad >= (1 - s/as))
    229 *         ad * as * 1
    230 *     else
    231 *         ad * as * ((d/ad) / (1 - s/as))
    232 *   = if d = 0
    233 *         0
    234 *     elif as * d >= ad * (as - s)
    235 *         ad * as
    236 *     else
    237 *         as * (as * d / (as - s))
    238 *
    239 */
    240 
/* Lane selection (see derivation above):
 *   d == 0                  -> 0
 *   sa - s == 0             -> sa * da
 *   sa * d >= da * (sa - s) -> sa * da
 *   otherwise               -> sa * sa * d / (sa - s)
 * The division t4 may produce inf/NaN in excluded lanes; the merges
 * below discard those lanes, matching the scalar code's branches. */
static force_inline vfloat32m1_t
rvv_blend_color_dodge_float (const vfloat32m1_t sa,
			     const vfloat32m1_t s,
			     const vfloat32m1_t da,
			     const vfloat32m1_t d,
			     size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4;
    vbool32_t    is_d_zero, vb, is_t0_non_zero;

    is_d_zero = __riscv_vmfeq_vf_f32m1_b32 (d, 0.0f, vl);

    t0 = __riscv_vfsub_vv_f32m1 (sa, s, vl);  // sa - s
    t1 = __riscv_vfmul_vv_f32m1 (sa, d, vl);  // d * sa
    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl); // sa * da
    t3 = __riscv_vfsub_vv_f32m1 (t2, __riscv_vfmul_vv_f32m1 (s, da, vl),
				 vl); // sa * da - s * da

    is_t0_non_zero = __riscv_vmfne_vf_f32m1_b32 (t0, 0.0f, vl);
    vb             = __riscv_vmflt_vv_f32m1_b32 (t3, t1, vl);
    t4 = __riscv_vfdiv_vv_f32m1 (__riscv_vfmul_vv_f32m1 (sa, t1, vl), t0,
				 vl); // sa * sa * d / (sa - s);

    // Innermost merge: t4 where sa != s, else t2; then t2 where the
    // "saturated" test vb holds; finally force 0 where d == 0.
    return __riscv_vfmerge_vfm_f32m1 (
	__riscv_vmerge_vvm_f32m1 (
	    __riscv_vmerge_vvm_f32m1 (t2, t4, is_t0_non_zero, vl), t2, vb, vl),
	0.0f, is_d_zero, vl);
}
    269 
    270 /*
    271 * Color burn
    272 *
    273 * We modify the first clause "if d = 1" to "if d >= 1" since with
    274 * premultiplied colors d > 1 can actually happen.
    275 *
    276 *     ad * as * B(d/ad, s/as)
    277 *   = if d/ad >= 1
    278 *         ad * as * 1
    279 *     elif (1 - d/ad) >= s/as
    280 *         ad * as * 0
    281 *     else
    282 *         ad * as * (1 - ((1 - d/ad) / (s/as)))
    283 *   = if d >= ad
    284 *         ad * as
    285 *     elif as * ad - as * d >= ad * s
    286 *         0
    287 *     else
    288 *         ad * as  - as * as * (ad - d) / s
    289 */
    290 
/* Lane selection (see derivation above):
 *   d >= da                      -> sa * da
 *   sa * (da - d) >= s * da      -> 0
 *   s == 0                       -> 0      (guards the division in t1)
 *   otherwise                    -> sa * (da - sa * (da - d) / s)
 * The merges apply in reverse priority order so d >= da wins last. */
static force_inline vfloat32m1_t
rvv_blend_color_burn_float (const vfloat32m1_t sa,
			    const vfloat32m1_t s,
			    const vfloat32m1_t da,
			    const vfloat32m1_t d,
			    size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7;
    vbool32_t    is_d_ge_da, is_s_zero, vb;

    is_d_ge_da = __riscv_vmfge_vv_f32m1_b32 (d, da, vl);
    is_s_zero  = __riscv_vmfeq_vf_f32m1_b32 (s, 0.0f, vl);

    t0 = __riscv_vfmul_vv_f32m1 (sa, __riscv_vfsub_vv_f32m1 (da, d, vl),
				 vl); // sa * (da - d)
    t1 = __riscv_vfsub_vv_f32m1 (da, __riscv_vfdiv_vv_f32m1 (t0, s, vl),
				 vl);         // da - sa * (da - d) / s)
    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl); // sa * da
    t3 = __riscv_vfmul_vv_f32m1 (sa, t1, vl); // sa * (da - sa * (da - d) / s)
    t4 = __riscv_vfmul_vv_f32m1 (s, da, vl);  // s * da
    vb = __riscv_vmfge_vf_f32m1_b32 (__riscv_vfsub_vv_f32m1 (t0, t4, vl), 0.0f,
				     vl); // if (sa * (da - d) - s * da >= 0.0f)

    t6 = __riscv_vfmerge_vfm_f32m1 (t3, 0.0f, is_s_zero, vl);
    t5 = __riscv_vfmerge_vfm_f32m1 (t6, 0.0f, vb, vl);
    t7 = __riscv_vmerge_vvm_f32m1 (t5, t2, is_d_ge_da, vl);

    return t7;
}
    320 
    321 /*
    322 * Hard light
    323 *
    324 *     ad * as * B(d/ad, s/as)
    325 *   = if (s/as <= 0.5)
    326 *         ad * as * Multiply (d/ad, 2 * s/as)
    327 *     else
    328 *         ad * as * Screen (d/ad, 2 * s/as - 1)
    329 *   = if 2 * s <= as
    330 *         ad * as * d/ad * 2 * s / as
    331 *     else
    332 *         ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1))
    333 *   = if 2 * s <= as
    334 *         2 * s * d
    335 *     else
    336 *         as * ad - 2 * (ad - d) * (as - s)
    337 */
    338 
    339 static force_inline vfloat32m1_t
    340 rvv_blend_hard_light_float (const vfloat32m1_t sa,
    341 		    const vfloat32m1_t s,
    342 		    const vfloat32m1_t da,
    343 		    const vfloat32m1_t d,
    344 		    size_t             vl)
    345 {
    346    vfloat32m1_t t0, t1, t2, t3, t4;
    347    vbool32_t    vb;
    348    t0 = __riscv_vfadd_vv_f32m1 (s, s, vl);
    349    t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl);
    350    vb = __riscv_vmfgt_vv_f32m1_b32 (t0, sa, vl);
    351    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl);
    352    t3 = __riscv_vfmul_vf_f32m1 (__riscv_vfsub_vv_f32m1 (da, d, vl), 2.0f, vl);
    353    t4 = __riscv_vfsub_vv_f32m1 (sa, s, vl);
    354    return __riscv_vmerge_vvm_f32m1 (
    355 t1,
    356 __riscv_vfsub_vv_f32m1 (t2, __riscv_vfmul_vv_f32m1 (t3, t4, vl), vl),
    357 vb, vl);
    358 }
    359 
    360 /*
    361 * Soft light
    362 *
    363 *     ad * as * B(d/ad, s/as)
    364 *   = if (s/as <= 0.5)
    365 *         ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad))
    366 *     else if (d/ad <= 0.25)
    367 *         ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad))
    368 *     else
    369 *         ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad))
    370 *   = if (2 * s <= as)
    371 *         d * as - d * (ad - d) * (as - 2 * s) / ad;
    372 *     else if (4 * d <= ad)
    373 *         (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
    374 *     else
    375 *         d * as + (sqrt (d * ad) - d) * (2 * s - as);
    376 */
    377 
/* Lane selection (see derivation above):
 *   da == 0      -> d * sa
 *   2 * s <= sa  -> t13 : d * sa - d * (da - d) * (sa - 2*s) / da
 *   4 * d <= da  -> t5  : d*sa - (sa-2s)*d*((16*d/da - 12)*d/da + 3)
 *   otherwise    -> t12 : d*sa - (sqrtf (d*da) - d) * (sa - 2*s)
 * All four candidates are computed unconditionally (divisions by a
 * zero da are discarded by the final merge), as in the scalar code. */
static force_inline vfloat32m1_t
rvv_blend_soft_light_float (const vfloat32m1_t sa,
			    const vfloat32m1_t s,
			    const vfloat32m1_t da,
			    const vfloat32m1_t d,
			    size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13;
    vbool32_t    is_sa_lt_2s, is_da_ls_4d, is_da_non_zero;
    is_da_non_zero = __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl);
    t0             = __riscv_vfadd_vv_f32m1 (s, s, vl); // 2 * s
    is_sa_lt_2s    = __riscv_vmflt_vv_f32m1_b32 (sa, t0, vl);
    t1             = __riscv_vfmul_vv_f32m1 (sa, d, vl);  // d * sa
    t2             = __riscv_vfsub_vv_f32m1 (sa, t0, vl); // (sa - 2*s)
    t3             = __riscv_vfmul_vv_f32m1 (d, t2, vl);  // (sa - 2*s) * d
    t7 = __riscv_vfdiv_vv_f32m1 (__riscv_vfmul_vf_f32m1 (d, 16.0f, vl), da,
				 vl); // 16 * d / da
    t8 = __riscv_vfmul_vv_f32m1 (d, __riscv_vfsub_vf_f32m1 (t7, 12.0f, vl),
				 vl); // (16 * d / da - 12) * d
    t9 = __riscv_vfadd_vf_f32m1 (__riscv_vfdiv_vv_f32m1 (t8, da, vl), 3.0f,
				 vl); // (16 * d / da - 12) * d / da + 3)
    t4 = __riscv_vfmul_vv_f32m1 (
	t3, t9, vl); // (sa - 2*s) * d * ((16 * d / da - 12) * d / da + 3)
    t5 = __riscv_vfsub_vv_f32m1 (
	t1, t4,
	vl); // d * sa - (sa - 2*s) * d * ((16 * d / da - 12) * d / da + 3)
    // t6 = 4 * d, built from two exact doublings.
    t6          = __riscv_vfadd_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl),
					  __riscv_vfadd_vv_f32m1 (d, d, vl), vl);
    is_da_ls_4d = __riscv_vmflt_vv_f32m1_b32 (da, t6, vl);
    t10         = __riscv_vfsub_vv_f32m1 (
	__riscv_vfsqrt_v_f32m1 (__riscv_vfmul_vv_f32m1 (d, da, vl), vl), d,
	vl); // sqrtf (d * da) - d
    t11 = __riscv_vfmul_vv_f32m1 (t2, t10,
				  vl); // (sqrtf (d * da) - d) * (sa - 2 * s)
    t12 = __riscv_vfsub_vv_f32m1 (
	t1, t11, vl); // d * sa - (sqrtf (d * da) - d) * (sa - 2 * s)
    // d * sa - d * (da - d) * (sa - 2 * s) / da
    t13 = __riscv_vfsub_vv_f32m1 (
	t1,
	__riscv_vfdiv_vv_f32m1 (
	    __riscv_vfmul_vv_f32m1 (__riscv_vfmul_vv_f32m1 (d, t2, vl),
				    __riscv_vfsub_vv_f32m1 (da, d, vl), vl),
	    da, vl),
	vl);
    return __riscv_vmerge_vvm_f32m1 (
	t1, // if (!FLOAT_IS_ZERO (da))
	__riscv_vmerge_vvm_f32m1 (
	    t13, // if (4 * d > da)
	    __riscv_vmerge_vvm_f32m1 (t5, t12, is_da_ls_4d, vl), is_sa_lt_2s,
	    vl),
	is_da_non_zero, vl);
}
    430 
    431 /*
    432 * Difference
    433 *
    434 *     ad * as * B(s/as, d/ad)
    435 *   = ad * as * abs (s/as - d/ad)
    436 *   = if (s/as <= d/ad)
    437 *         ad * as * (d/ad - s/as)
    438 *     else
    439 *         ad * as * (s/as - d/ad)
    440 *   = if (ad * s <= as * d)
    441 *        as * d - ad * s
    442 *     else
    443 *        ad * s - as * d
    444 */
    445 
    446 static force_inline vfloat32m1_t
    447 rvv_blend_difference_float (const vfloat32m1_t sa,
    448 		    const vfloat32m1_t s,
    449 		    const vfloat32m1_t da,
    450 		    const vfloat32m1_t d,
    451 		    size_t             vl)
    452 {
    453    vfloat32m1_t dsa, sda;
    454    vbool32_t    vb;
    455    dsa = __riscv_vfmul_vv_f32m1 (d, sa, vl);
    456    sda = __riscv_vfmul_vv_f32m1 (s, da, vl);
    457    vb  = __riscv_vmflt_vv_f32m1_b32 (sda, dsa, vl);
    458    return __riscv_vmerge_vvm_f32m1 (__riscv_vfsub_vv_f32m1 (sda, dsa, vl),
    459 			     __riscv_vfsub_vv_f32m1 (dsa, sda, vl), vb,
    460 			     vl);
    461 }
    462 
    463 /*
    464 * Exclusion
    465 *
    466 *     ad * as * B(s/as, d/ad)
    467 *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as)
    468 *   = as * d + ad * s - 2 * s * d
    469 */
    470 
    471 static force_inline vfloat32m1_t
    472 rvv_blend_exclusion_float (const vfloat32m1_t sa,
    473 		   const vfloat32m1_t s,
    474 		   const vfloat32m1_t da,
    475 		   const vfloat32m1_t d,
    476 		   size_t             vl)
    477 {
    478    vfloat32m1_t t0, t1;
    479    t0 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl), s, vl);
    480    t1 = __riscv_vfadd_vv_f32m1 (__riscv_vfmul_vv_f32m1 (s, da, vl),
    481 			 __riscv_vfmul_vv_f32m1 (d, sa, vl), vl);
    482    return __riscv_vfsub_vv_f32m1 (t1, t0, vl);
    483 }
    484 
/* Per-channel combiner: given the (possibly mask-scaled) source alpha
 * and channel plus the destination alpha and channel, returns the
 * combined channel value for vl lanes. */
typedef vfloat32m1_t (*rvv_combine_channel_float_t) (const vfloat32m1_t sa,
						     const vfloat32m1_t s,
						     const vfloat32m1_t da,
						     const vfloat32m1_t d,
						     size_t             vl);
    490 
/* Shared driver for all float combiners.  Pixels are ARGB, four floats
 * each; the alpha channel is combined with combine_a and each color
 * channel with combine_c, writing the result back to dest in place.
 *
 *   mask == NULL       : source combined with dest unmasked.
 *   component == TRUE  : component-alpha path — the mask supplies a
 *                        full ARGB vector per pixel.
 *   component == FALSE : unified-alpha path — only the mask's alpha
 *                        channel is used (strided load).
 */
static force_inline void
rvv_combine_inner_float (pixman_bool_t               component,
			 float                      *dest,
			 const float                *src,
			 const float                *mask,
			 int                         n_pixels,
			 rvv_combine_channel_float_t combine_a,
			 rvv_combine_channel_float_t combine_c)
{
    float *__restrict__ pd       = dest;
    const float *__restrict__ ps = src;
    const float *__restrict__ pm = mask;

    const int component_count = 4;                          // a, r, g, b
    int       vn              = component_count * n_pixels; // floats remaining
    int       vl              = 0; // pixels processed this strip
    int       vl_step         = 0; // floats consumed this strip

    // Byte distance between consecutive alpha samples in the mask,
    // used by the strided load in the unified-alpha path.
    const ptrdiff_t stride = component_count * sizeof (float);

    vfloat32m1x4_t sa_sr_sg_sb, da_dr_dg_db, ma_mr_mg_mb;
    vfloat32m1_t   da2, dr2, dg2, db2, ma2, mr2, mg2, mb2, sr2, sg2, sb2, sa2;

    if (n_pixels == 0)
    {
	return;
    }

    if (!mask)
    {
	// Unmasked: feed src channels straight into the combiners.
	for (; vn > 0; vn -= vl_step, pd += vl_step, ps += vl_step)
	{
	    vl          = __riscv_vsetvl_e32m1 (vn / component_count);
	    sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
	    da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);

	    // Alpha channel: combine_a (sa, sa, da, da).
	    da2 = combine_a (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);

	    // Color channels: combine_c (sa, s.x, da, d.x).
	    dr2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);

	    dg2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);

	    db2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);

	    __riscv_vsseg4e32_v_f32m1x4 (
		pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);

	    vl_step = vl * component_count;
	}
    }
    else
    {
	if (component)
	{
	    // Component-alpha: scale src channels by matching mask
	    // channels, and build per-channel alphas m.x * s.a.
	    for (; vn > 0;
		 vn -= vl_step, pd += vl_step, ps += vl_step, pm += vl_step)
	    {
		vl = __riscv_vsetvl_e32m1 (vn / component_count);

		sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
		da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
		ma_mr_mg_mb = __riscv_vlseg4e32_v_f32m1x4 (pm, vl);

		// s.x * m.x for each color channel.
		sr2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1),
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 1), vl);

		sg2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2),
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 2), vl);

		sb2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3),
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 3), vl);

		// m.x * s.a for the per-channel alpha arguments.
		ma2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		mr2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 1),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		mg2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 2),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		mb2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 3),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		da2 = combine_a (
		    ma2, ma2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);

		dr2 = combine_c (
		    mr2, sr2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);

		dg2 = combine_c (
		    mg2, sg2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);

		db2 = combine_c (
		    mb2, sb2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);

		__riscv_vsseg4e32_v_f32m1x4 (
		    pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);

		vl_step = vl * component_count;
	    }
	}
	else
	{
	    // Unified alpha: load only the mask alpha (strided) and
	    // scale all four src channels by it.
	    for (; vn > 0;
		 vn -= vl_step, pd += vl_step, ps += vl_step, pm += vl_step)
	    {
		vl = __riscv_vsetvl_e32m1 (vn / component_count);

		sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
		da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
		ma2         = __riscv_vlse32_v_f32m1 (pm, stride, vl);

		sa2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
		sr2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1), vl);
		sg2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2), vl);
		sb2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3), vl);

		// The effective source alpha (m.a * s.a) is also the
		// alpha argument passed to the channel combiners.
		ma2 = sa2;

		dr2 = combine_c (
		    ma2, sr2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);

		dg2 = combine_c (
		    ma2, sg2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);

		db2 = combine_c (
		    ma2, sb2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);

		da2 = combine_a (
		    ma2, sa2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);

		__riscv_vsseg4e32_v_f32m1x4 (
		    pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);

		vl_step = vl * component_count;
	    }
	}
    }
}
    662 
/* Instantiate a pixman combiner entry point named
 * rvv_combine_<name>_float that forwards to rvv_combine_inner_float
 * with the given alpha/color channel combiners. */
#define RVV_MAKE_COMBINER(name, component, combine_a, combine_c)               \
    static void rvv_combine_##name##_float (                                   \
	pixman_implementation_t *imp, pixman_op_t op, float *dest,             \
	const float *src, const float *mask, int n_pixels)                     \
    {                                                                          \
	rvv_combine_inner_float (component, dest, src, mask, n_pixels,         \
				 combine_a, combine_c);                        \
    }

/* Emit both the component-alpha (_ca) and unified-alpha (_u) variants. */
#define RVV_MAKE_COMBINERS(name, combine_a, combine_c)                         \
    RVV_MAKE_COMBINER (name##_ca, TRUE, combine_a, combine_c)                  \
    RVV_MAKE_COMBINER (name##_u, FALSE, combine_a, combine_c)
    675 
    676 static force_inline vfloat32m1_t
    677 rvv_get_factor_float (combine_factor_t factor,
    678 	      vfloat32m1_t     sa,
    679 	      vfloat32m1_t     da,
    680 	      size_t           vl)
    681 {
    682    vfloat32m1_t vone  = __riscv_vfmv_v_f_f32m1 (1.0f, vl);
    683    vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1 (0.0f, vl);
    684 
    685    switch (factor)
    686    {
    687 case ZERO:
    688     return vzero;
    689 
    690 case ONE:
    691     return vone;
    692 
    693 case SRC_ALPHA:
    694     return sa;
    695 
    696 case DEST_ALPHA:
    697     return da;
    698 
    699 case INV_SA:
    700     return __riscv_vfsub_vv_f32m1 (vone, sa, vl);
    701 
    702 case INV_DA:
    703     return __riscv_vfsub_vv_f32m1 (vone, da, vl);
    704 
    705 case SA_OVER_DA:
    706     return __riscv_vmerge_vvm_f32m1 (
    707 	vone,
    708 	__riscv_vfmin_vv_f32m1 (
    709 	    vone,
    710 	    __riscv_vfmax_vv_f32m1 (
    711 		vzero, __riscv_vfdiv_vv_f32m1 (sa, da, vl), vl),
    712 	    vl),
    713 	__riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
    714 
    715 case DA_OVER_SA:
    716     return __riscv_vmerge_vvm_f32m1 (
    717 	__riscv_vfmin_vv_f32m1 (
    718 	    vone,
    719 	    __riscv_vfmax_vv_f32m1 (
    720 		vzero, __riscv_vfdiv_vv_f32m1 (da, sa, vl), vl),
    721 	    vl),
    722 	vone, __riscv_vmfeq_vf_f32m1_b32 (sa, 0.0f, vl), vl);
    723 
    724 case INV_SA_OVER_DA:
    725     {
    726 	vfloat32m1_t t0 = __riscv_vfdiv_vv_f32m1 (
    727 	    __riscv_vfsub_vv_f32m1 (vone, sa, vl), da, vl);
    728 	return __riscv_vmerge_vvm_f32m1 (
    729 	    vone,
    730 	    __riscv_vfmin_vv_f32m1 (
    731 		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
    732 	    __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
    733     }
    734 
    735 case INV_DA_OVER_SA:
    736     {
    737 	vfloat32m1_t t0 = __riscv_vfdiv_vv_f32m1 (
    738 	    __riscv_vfsub_vv_f32m1 (vone, da, vl), sa, vl);
    739 	return __riscv_vmerge_vvm_f32m1 (
    740 	    vone,
    741 	    __riscv_vfmin_vv_f32m1 (
    742 		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
    743 	    __riscv_vmfne_vf_f32m1_b32 (sa, 0.0f, vl), vl);
    744     }
    745 
    746 case ONE_MINUS_SA_OVER_DA:
    747     {
    748 	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
    749 	    vone, __riscv_vfdiv_vv_f32m1 (sa, da, vl), vl);
    750 	return __riscv_vmerge_vvm_f32m1 (
    751 	    vzero,
    752 	    __riscv_vfmin_vv_f32m1 (
    753 		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
    754 	    __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
    755     }
    756 
    757 case ONE_MINUS_DA_OVER_SA:
    758     {
    759 	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
    760 	    vone, __riscv_vfdiv_vv_f32m1 (da, sa, vl), vl);
    761 	return __riscv_vmerge_vvm_f32m1 (
    762 	    vzero,
    763 	    __riscv_vfmin_vv_f32m1 (
    764 		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
    765 	    __riscv_vmfne_vf_f32m1_b32 (sa, 0.0f, vl), vl);
    766     }
    767 
    768 case ONE_MINUS_INV_DA_OVER_SA:
    769     {
    770 	vbool32_t is_zero = __riscv_vmand_mm_b32 (
    771 	    __riscv_vmflt_vf_f32m1_b32 (sa, FLT_MIN, vl),
    772 	    __riscv_vmfgt_vf_f32m1_b32 (sa, -FLT_MAX, vl), vl);
    773 	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
    774 	    vone,
    775 	    __riscv_vfdiv_vv_f32m1 (
    776 		__riscv_vfsub_vv_f32m1 (vone, da, vl), sa, vl),
    777 	    vl);
    778 	return __riscv_vmerge_vvm_f32m1 (
    779 	    __riscv_vfmin_vv_f32m1 (
    780 		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
    781 	    vzero, is_zero, vl);
    782     }
    783 
    784 case ONE_MINUS_INV_SA_OVER_DA:
    785     {
    786 	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
    787 	    vone,
    788 	    __riscv_vfdiv_vv_f32m1 (
    789 		__riscv_vfsub_vv_f32m1 (vone, sa, vl), da, vl),
    790 	    vl);
    791 	return __riscv_vmerge_vvm_f32m1 (
    792 	    __riscv_vfmin_vv_f32m1 (
    793 		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
    794 	    vzero, __riscv_vmfeq_vf_f32m1_b32 (da, 0.0f, vl), vl);
    795     }
    796    }
    797 
    798    return __riscv_vfmv_v_f_f32m1 (-1.0f, vl);
    799 }
    800 
/* Porter-Duff combiner generator:
 *   result = min (1, s * factor (a) + d * factor (b))
 * where a and b are combine_factor_t selectors evaluated per lane by
 * rvv_get_factor_float.  The same function serves as both the alpha and
 * color channel combiner. */
#define RVV_MAKE_PD_COMBINERS(name, a, b)                                      \
    static vfloat32m1_t force_inline rvv_pd_combine_##name##_float (           \
	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
	size_t vl)                                                             \
    {                                                                          \
	const vfloat32m1_t fa = rvv_get_factor_float (a, sa, da, vl);          \
	const vfloat32m1_t fb = rvv_get_factor_float (b, sa, da, vl);          \
	vfloat32m1_t       t0 = __riscv_vfadd_vv_f32m1 (                       \
            __riscv_vfmul_vv_f32m1 (s, fa, vl),                                \
            __riscv_vfmul_vv_f32m1 (d, fb, vl), vl);                           \
	return __riscv_vfmin_vv_f32m1 (__riscv_vfmv_v_f_f32m1 (1.0f, vl), t0,  \
				       vl);                                    \
    }                                                                          \
                                                                               \
    RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name##_float,                   \
			rvv_pd_combine_##name##_float)
    817 
/* Plain Porter-Duff operators, as (Fa, Fb) factor pairs. */
RVV_MAKE_PD_COMBINERS (clear, ZERO, ZERO)
RVV_MAKE_PD_COMBINERS (src, ONE, ZERO)
RVV_MAKE_PD_COMBINERS (dst, ZERO, ONE)
RVV_MAKE_PD_COMBINERS (over, ONE, INV_SA)
RVV_MAKE_PD_COMBINERS (over_reverse, INV_DA, ONE)
RVV_MAKE_PD_COMBINERS (in, DEST_ALPHA, ZERO)
RVV_MAKE_PD_COMBINERS (in_reverse, ZERO, SRC_ALPHA)
RVV_MAKE_PD_COMBINERS (out, INV_DA, ZERO)
RVV_MAKE_PD_COMBINERS (out_reverse, ZERO, INV_SA)
RVV_MAKE_PD_COMBINERS (atop, DEST_ALPHA, INV_SA)
RVV_MAKE_PD_COMBINERS (atop_reverse, INV_DA, SRC_ALPHA)
RVV_MAKE_PD_COMBINERS (xor, INV_DA, INV_SA)
RVV_MAKE_PD_COMBINERS (add, ONE, ONE)

RVV_MAKE_PD_COMBINERS (saturate, INV_DA_OVER_SA, ONE)

/* Disjoint variants: factors divide by the source/destination alpha
 * (the *_OVER_* factors), clamped to [0, 1]. */
RVV_MAKE_PD_COMBINERS (disjoint_clear, ZERO, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_src, ONE, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_dst, ZERO, ONE)
RVV_MAKE_PD_COMBINERS (disjoint_over, ONE, INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_over_reverse, INV_DA_OVER_SA, ONE)
RVV_MAKE_PD_COMBINERS (disjoint_in, ONE_MINUS_INV_DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_in_reverse, ZERO, ONE_MINUS_INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_out, INV_DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_out_reverse, ZERO, INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_atop, ONE_MINUS_INV_DA_OVER_SA, INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_atop_reverse,
		       INV_DA_OVER_SA,
		       ONE_MINUS_INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_xor, INV_DA_OVER_SA, INV_SA_OVER_DA)

/* Conjoint variants: the complementary set of alpha-ratio factors. */
RVV_MAKE_PD_COMBINERS (conjoint_clear, ZERO, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_src, ONE, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_dst, ZERO, ONE)
RVV_MAKE_PD_COMBINERS (conjoint_over, ONE, ONE_MINUS_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_over_reverse, ONE_MINUS_DA_OVER_SA, ONE)
RVV_MAKE_PD_COMBINERS (conjoint_in, DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_in_reverse, ZERO, SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_out, ONE_MINUS_DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_out_reverse, ZERO, ONE_MINUS_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_atop, DA_OVER_SA, ONE_MINUS_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_atop_reverse, ONE_MINUS_DA_OVER_SA, SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_xor, ONE_MINUS_DA_OVER_SA, ONE_MINUS_SA_OVER_DA)
    861 
/*
 * Generate a separable PDF blend-mode combiner pair:
 *
 *   alpha:  ra = da + sa - da * sa
 *   color:  rc = (1 - sa) * d + (1 - da) * s + blend (sa, s, da, d)
 *
 * The color expression is computed as
 *   -((sa - 1) * d + (da - 1) * s) + rvv_blend_<name>_float (...),
 * i.e. the sign flip is folded into a single vfmul by -1.
 * rvv_blend_<name>_float () must be defined before the instantiation.
 */
#define RVV_MAKE_SEPARABLE_PDF_COMBINERS(name)                                 \
    static force_inline vfloat32m1_t rvv_combine_##name##_a (                  \
	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
	size_t vl)                                                             \
    {                                                                          \
	return __riscv_vfsub_vv_f32m1 (__riscv_vfadd_vv_f32m1 (da, sa, vl),    \
				       __riscv_vfmul_vv_f32m1 (da, sa, vl),    \
				       vl);                                    \
    }                                                                          \
                                                                               \
    static force_inline vfloat32m1_t rvv_combine_##name##_c (                  \
	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
	size_t vl)                                                             \
    {                                                                          \
	vfloat32m1_t f = __riscv_vfmul_vf_f32m1 (                              \
	    __riscv_vfadd_vv_f32m1 (                                           \
		__riscv_vfmul_vv_f32m1 (__riscv_vfsub_vf_f32m1 (sa, 1.0f, vl), \
					d, vl),                                \
		__riscv_vfmul_vv_f32m1 (__riscv_vfsub_vf_f32m1 (da, 1.0f, vl), \
					s, vl),                                \
		vl),                                                           \
	    -1.0f, vl);                                                        \
                                                                               \
	return __riscv_vfadd_vv_f32m1 (                                        \
	    f, rvv_blend_##name##_float (sa, s, da, d, vl), vl);               \
    }                                                                          \
                                                                               \
    RVV_MAKE_COMBINERS (name, rvv_combine_##name##_a, rvv_combine_##name##_c)
    890 
/* Separable PDF blend modes; each relies on a matching
 * rvv_blend_<name>_float () helper defined earlier in this file. */
RVV_MAKE_SEPARABLE_PDF_COMBINERS (multiply)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (screen)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (overlay)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (darken)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (lighten)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (color_dodge)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (color_burn)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (hard_light)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (soft_light)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (difference)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (exclusion)
    902 
    903 // int implementation
    904 
    905 // pixman-combine32.h RVV implementation plus some convenience functions {
    906 
    907 /*
    908 * x_c = min(x_c + y_c, 255)
    909 */
    910 
    911 #define rvv_UN8_ADD_UN8_vv(x, y, vl) __riscv_vsaddu (x, y, vl)
    912 
    913 #define rvv_UN8x4_ADD_UN8x4_vv_m4(x, y, vl)                                    \
    914    RVV_U8x4_U32_m4 (rvv_UN8_ADD_UN8_vv (RVV_U32_U8x4_m4 (x),                  \
    915 				 RVV_U32_U8x4_m4 (y), (vl) * 4))
    916 
    917 /*
    918 * x_c = (x_c * a_c) / 255
    919 */
    920 
    921 #define __rvv_UN8_MUL_UN8_vv(LMUL, LMUL16)                                     \
    922    static force_inline VUINT8 (LMUL) rvv_UN8_MUL_UN8_vv_##LMUL (              \
    923 const VUINT8 (LMUL) x, const VUINT8 (LMUL) a, size_t vl)               \
    924    {                                                                          \
    925 VUINT16 (LMUL16)                                                       \
    926 mul_higher = __riscv_vwmaccu (                                         \
    927     __riscv_vmv_v_x_u16##LMUL16 (ONE_HALF, vl), x, a, vl);             \
    928                                                                               \
    929 VUINT16 (LMUL16)                                                       \
    930 mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);                    \
    931                                                                               \
    932 return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl),        \
    933 		      G_SHIFT, vl);                                    \
    934    }
    935 __rvv_UN8_MUL_UN8_vv (m1, m2);
    936 __rvv_UN8_MUL_UN8_vv (m2, m4);
    937 __rvv_UN8_MUL_UN8_vv (m4, m8);
    938 
/* Vector-scalar form of rvv_UN8_MUL_UN8_vv_m4 (): multiplies every lane
 * of x by the scalar a with the same exact divide-by-255 rounding. */
static force_inline vuint8m4_t
rvv_UN8_MUL_UN8_vx_m4 (const vuint8m4_t x, const uint8_t a, size_t vl)
{
    vuint16m8_t mul_higher = __riscv_vwmaccu (
	__riscv_vmv_v_x_u16m8 (ONE_HALF, vl), a, x, vl);
    vuint16m8_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);

    return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT,
			  vl);
}
    949 
/* Per-component multiply of two packed UN8x4 pixels: reinterpret each
 * u32 vector as u8 lanes, multiply byte-wise, reinterpret back. */
#define __rvv_UN8x4_MUL_UN8x4_vv(LMUL, x, a, vl)                               \
    RVV_U8x4_U32 (LMUL, rvv_UN8_MUL_UN8_vv_##LMUL (RVV_U32_U8x4 (LMUL, x),     \
						   RVV_U32_U8x4 (LMUL, a),     \
						   (vl) * 4))
#define rvv_UN8x4_MUL_UN8x4_vv_m2(x, a, vl)                                    \
    __rvv_UN8x4_MUL_UN8x4_vv (m2, x, a, vl)
#define rvv_UN8x4_MUL_UN8x4_vv_m4(x, a, vl)                                    \
    __rvv_UN8x4_MUL_UN8x4_vv (m4, x, a, vl)

/*
 * a_c = a (broadcast to all components)
 *
 * Replicates an 8-bit value (held in a u16 lane) into all four bytes of
 * a u32 lane:  a32 = widen (a + (a << 8));  a32 + (a32 << 16).
 */

#define __rvv_UN16_bcast_UN8x4_v(LMUL, LMUL16)                                 \
    static force_inline VUINT32 (LMUL)                                         \
    rvv_UN16_bcast_UN8x4_v_##LMUL (const VUINT16 (LMUL16) a, size_t vl)        \
    {                                                                          \
	VUINT32 (LMUL)                                                         \
	a32 = __riscv_vwcvtu_x (__riscv_vmadd (a, 1 << 8, a, vl), vl);         \
                                                                               \
	return __riscv_vmadd (a32, 1 << 16, a32, vl);                          \
    }
__rvv_UN16_bcast_UN8x4_v (m2, m1);
__rvv_UN16_bcast_UN8x4_v (m4, m2);

/* Same broadcast starting from u8 lanes (widen to u16 first). */
#define rvv_UN8_bcast_UN8x4_v_m4(a, vl)                                        \
    rvv_UN16_bcast_UN8x4_v_m4 (__riscv_vwcvtu_x (a, vl), vl)

/*
 * x_c = (x_c * a) / 255
 *
 * Packed pixel scaled by one 8-bit factor: broadcast the factor to all
 * four components, then do the per-component multiply.
 */

#define rvv_UN8x4_MUL_UN8_vv_m4(x, a, vl)                                      \
    rvv_UN8x4_MUL_UN8x4_vv_m4 (x, rvv_UN8_bcast_UN8x4_v_m4 (a, vl), vl)

#define __rvv_UN8x4_MUL_UN16_vv(LMUL, x, a, vl)                                \
    rvv_UN8x4_MUL_UN8x4_vv_##LMUL (x, rvv_UN16_bcast_UN8x4_v_##LMUL (a, vl), vl)
#define rvv_UN8x4_MUL_UN16_vv_m2(x, a, vl)                                     \
    __rvv_UN8x4_MUL_UN16_vv (m2, x, a, vl)
#define rvv_UN8x4_MUL_UN16_vv_m4(x, a, vl)                                     \
    __rvv_UN8x4_MUL_UN16_vv (m4, x, a, vl)

/* Packed pixel scaled by one scalar factor. */
#define rvv_UN8x4_MUL_UN8_vx_m4(x, a, vl)                                      \
    RVV_U8x4_U32_m4 (rvv_UN8_MUL_UN8_vx_m4 (RVV_U32_U8x4_m4 (x), a, (vl) * 4))
    994 
/* Rounding division by 255 of a 32-bit intermediate that already holds
 * a raw product:  t = x + ONE_HALF;  (t + (t >> G_SHIFT)) >> G_SHIFT. */
static force_inline vuint32m2_t
rvv_DIV_ONE_UN32m2_UN32m2_v (const vuint32m2_t x, size_t vl)
{
    vuint32m2_t mul_higher = __riscv_vadd (x, ONE_HALF, vl);
    vuint32m2_t mul_lower  = __riscv_vsrl (mul_higher, G_SHIFT, vl);

    return __riscv_vsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl);
}
   1003 
/* Same rounding division by 255, but the u32m8 result is narrowed down
 * to u8m2 in two steps (vnsrl to u16, then vncvt to u8). */
static force_inline vuint8m2_t
rvv_DIV_ONE_UN32m8_UN8m2_v (const vuint32m8_t x, size_t vl)
{
    vuint32m8_t mul_higher = __riscv_vadd (x, ONE_HALF, vl);
    vuint32m8_t mul_lower  = __riscv_vsrl (mul_higher, G_SHIFT, vl);

    return __riscv_vncvt_x (
	__riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl),
	vl);
}
   1014 
   1015 /*
   1016 * x_c = (x_c * a) / 255 + y_c
   1017 */
   1018 
   1019 #define rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4(x, a, y, vl)                       \
   1020    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl), y, vl)
   1021 
   1022 /*
   1023 * x_c = (x_c * a + y_c * b) / 255
   1024 */
   1025 
   1026 #define rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl)          \
   1027    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl),            \
   1028 		       rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl)
   1029 
   1030 /*
   1031 * x_c = (x_c * a_c) / 255 + y_c
   1032 */
   1033 
   1034 #define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4(x, a, y, vl)                      \
   1035    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl), y, vl)
   1036 
   1037 /*
   1038 * x_c = (x_c * a_c + y_c * b) / 255
   1039 */
   1040 
   1041 #define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl)         \
   1042    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl),           \
   1043 		       rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl)
   1044 
   1045 // } pixman-combine32.h
   1046 
// Additional functions.

/* Alpha byte (bits 31:24) of each packed pixel, narrowed to u16 lanes. */
#define rvv_shift_alpha_u16(x, vl) __riscv_vnsrl (x, 24, vl)

/* 255 - alpha: complement the pixel first, then extract the top byte. */
#define rvv_shift_not_alpha_u16(x, vl)                                         \
    rvv_shift_alpha_u16 (__riscv_vnot (x, vl), vl)

/* Strided load of every 4th byte starting at offset 3, i.e. the alpha
 * bytes of a buffer of 32-bit pixels (byte 3 of each pixel in memory). */
#define rvv_load_alpha_u8m1(src, vl)                                           \
    __riscv_vlse8_v_u8m1 ((uint8_t *)src + 3, 4, vl)

/* Complement (255 - a) of the strided alpha load above. */
#define rvv_load_not_alpha_u8m1(src, vl)                                       \
    __riscv_vnot (rvv_load_alpha_u8m1 (src, vl), vl)

/* Zero-extend u8 lanes to 16 bits and reinterpret them as signed i16. */
#define rvv_u8m2_to_i16m4(in, vl)                                              \
    __riscv_vreinterpret_i16m4 (__riscv_vwcvtu_x (in, vl))

/* Premultiplied OVER:  dest * (255 - alpha (src)) / 255 + src. */
#define rvv_over_m4(src, dest, vl)                                             \
    rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 (                                      \
	dest, rvv_shift_not_alpha_u16 (src, vl), src, vl)

/* IN: scale packed pixel x by the 8-bit per-pixel factor y. */
#define rvv_in_m4(x, y, vl) rvv_UN8x4_MUL_UN8_vv_m4 (x, y, vl)

/* Load src pixels and multiply them by mask's alpha bytes. */
#define rvv_in_load_s_m_m4(src, mask, vl)                                      \
    rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl),                                \
	       rvv_load_alpha_u8m1 (mask, vl), vl)

/* Load src pixels and multiply them by (255 - mask's alpha). */
#define rvv_in_load_s_nm_m4(src, mask, vl)                                     \
    rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl),                                \
	       rvv_load_not_alpha_u8m1 (mask, vl), vl)
   1076 
/* Pack x8r8g8b8 -> r5g6b5 by truncating each channel to its top bits.
 * 0xF800F8 masks the top 5 bits of red and blue in one pass; the two
 * vnsrl shifts drop them into place, and green's top 6 bits land at
 * bits 10:5 via (s >> 5) & 0x7E0.  Alpha is discarded. */
static force_inline vuint16m2_t
rvv_convert_8888_to_0565_m2 (const vuint32m4_t s, size_t vl)
{
    vuint32m4_t rb = __riscv_vand (s, 0xF800F8, vl);

    return __riscv_vor (
	__riscv_vor (__riscv_vnsrl (rb, 3, vl), __riscv_vnsrl (rb, 8, vl), vl),
	__riscv_vand (__riscv_vnsrl (s, 5, vl), 0x7E0, vl), vl);
}
   1086 
/* Expand r5g6b5 -> x8r8g8b8 (alpha left zero).  Each channel is widened
 * to 8 bits by replicating its high bits into the low positions:
 * red/blue together via the (rb | rb >> 5) & 0xFF00FF step, green via
 * g1 (6 bits shifted to 7:2) plus g2 (its top 2 bits). */
static force_inline vuint32m4_t
rvv_convert_0565_to_0888_m4 (const vuint16m2_t s, size_t vl)
{
    vuint8m1_t  g1, g2;
    vuint16m2_t r, g_w, b;
    vuint32m4_t r_w, rb_w;

    r    = __riscv_vand (s, 0xF800, vl);
    b    = __riscv_vand (s, 0x001F, vl);
    r_w  = __riscv_vwmulu (r, 1 << 8, vl);      /* red to bits 23:19 */
    rb_w = __riscv_vwmaccu (r_w, 1 << 3, b, vl); /* blue to bits 7:3 */
    rb_w = __riscv_vand (__riscv_vor (rb_w, __riscv_vsrl (rb_w, 5, vl), vl),
			 0xFF00FF, vl);

    g1  = __riscv_vsll (__riscv_vnsrl (s, 5, vl), 2, vl);
    g2  = __riscv_vsrl (g1, 6, vl);
    g_w = __riscv_vwaddu_vv (g1, g2, vl);

    return __riscv_vwmaccu (rb_w, 1 << 8, g_w, vl); /* green to bits 15:8 */
}
   1107 
/* r5g6b5 -> a8r8g8b8 with alpha forced to 0xFF. */
#define rvv_convert_0565_to_8888_m4(s, vl)                                     \
    __riscv_vor (rvv_convert_0565_to_0888_m4 (s, vl), 0xff000000, vl)

/* Component-alpha (CA) mask, value part:  src_c * mask_c per component. */
#define __rvv_combine_mask_value_ca(LMUL, src, mask, vl)                       \
    rvv_UN8x4_MUL_UN8x4_vv_##LMUL (src, mask, vl)
#define rvv_combine_mask_value_ca_m2(src, mask, vl)                            \
    __rvv_combine_mask_value_ca (m2, src, mask, vl)
#define rvv_combine_mask_value_ca_m4(src, mask, vl)                            \
    __rvv_combine_mask_value_ca (m4, src, mask, vl)

/* CA mask, alpha part:  mask_c * alpha (src) per component. */
#define __rvv_combine_mask_alpha_ca(LMUL, src, mask, vl)                       \
    rvv_UN8x4_MUL_UN16_vv_##LMUL (mask, rvv_shift_alpha_u16 (src, vl), vl)
#define rvv_combine_mask_alpha_ca_m2(src, mask, vl)                            \
    __rvv_combine_mask_alpha_ca (m2, src, mask, vl)
#define rvv_combine_mask_alpha_ca_m4(src, mask, vl)                            \
    __rvv_combine_mask_alpha_ca (m4, src, mask, vl)

/* Unified mask:  src_c * alpha (mask). */
#define __rvv_combine_mask(LMUL, src, mask, vl)                                \
    rvv_UN8x4_MUL_UN16_vv_##LMUL (src, rvv_shift_alpha_u16 (mask, vl), vl)
#define rvv_combine_mask_m2(src, mask, vl)                                     \
    __rvv_combine_mask (m2, src, mask, vl)
#define rvv_combine_mask_m4(src, mask, vl)                                     \
    __rvv_combine_mask (m4, src, mask, vl)

/* In-place CA combine: *src becomes src*mask, *mask becomes
 * mask*alpha(src).  The original src is copied first since both outputs
 * read it. */
#define __rvv_combine_mask_ca(LMUL)                                            \
    static force_inline void rvv_combine_mask_ca_##LMUL (                      \
	VUINT32 (LMUL) *__restrict__ src, VUINT32 (LMUL) *__restrict__ mask,   \
	size_t vl)                                                             \
    {                                                                          \
	VUINT32 (LMUL) src_cpy = *src;                                         \
	*(src)  = rvv_combine_mask_value_ca_##LMUL (*(src), *(mask), vl);      \
	*(mask) = rvv_combine_mask_alpha_ca_##LMUL (src_cpy, *(mask), vl);     \
    }
__rvv_combine_mask_ca (m2);
__rvv_combine_mask_ca (m4);
   1143 
   1144 static void
   1145 rvv_combine_clear (pixman_implementation_t *__restrict__ imp,
   1146 	   pixman_op_t op,
   1147 	   uint32_t *__restrict__ dest,
   1148 	   const uint32_t *__restrict__ src,
   1149 	   const uint32_t *__restrict__ mask,
   1150 	   int width)
   1151 {
   1152    uint32_t *pd = dest;
   1153 
   1154    vuint32m8_t v = __riscv_vmv_v_x_u32m8 (0, __riscv_vsetvlmax_e32m8 ());
   1155    RVV_FOREACH_1 (width, vl, e32m8, pd) { __riscv_vse32 (pd, v, vl); }
   1156 }
   1157 
   1158 static void
   1159 rvv_combine_src_u (pixman_implementation_t *__restrict__ imp,
   1160 	   pixman_op_t op,
   1161 	   uint32_t *__restrict__ dest,
   1162 	   const uint32_t *__restrict__ src,
   1163 	   const uint32_t *__restrict__ mask,
   1164 	   int width)
   1165 {
   1166    uint32_t *__restrict__ pd       = dest;
   1167    const uint32_t *__restrict__ ps = src;
   1168    const uint32_t *__restrict__ pm = mask;
   1169 
   1170    if (mask)
   1171    {
   1172 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
   1173 {
   1174     __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pm, vl), vl);
   1175 }
   1176    }
   1177    else
   1178    {
   1179 RVV_FOREACH_2 (width, vl, e32m8, ps, pd)
   1180 {
   1181     __riscv_vse32 (pd, __riscv_vle32_v_u32m8 (ps, vl), vl);
   1182 }
   1183    }
   1184 }
   1185 
/* OVER operator (unified alpha):
 *   dest = s + dest * (255 - alpha (s)) / 255,
 * where s is src, optionally pre-scaled by the mask's alpha. */
static void
rvv_combine_over_u (pixman_implementation_t *__restrict__ imp,
		    pixman_op_t op,
		    uint32_t *__restrict__ dest,
		    const uint32_t *__restrict__ src,
		    const uint32_t *__restrict__ mask,
		    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_over_m4 (rvv_in_load_s_m_m4 (ps, pm, vl),
					__riscv_vle32_v_u32m4 (pd, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_over_m4 (__riscv_vle32_v_u32m4 (ps, vl),
					__riscv_vle32_v_u32m4 (pd, vl), vl),
			   vl);
	}
    }
}
   1219 
/* OVER_REVERSE operator (unified alpha): dest stays on top —
 *   dest = dest + s * (255 - alpha (dest)) / 255
 * (rvv_over_m4's arguments are swapped relative to rvv_combine_over_u). */
static void
rvv_combine_over_reverse_u (pixman_implementation_t *__restrict__ imp,
			    pixman_op_t op,
			    uint32_t *__restrict__ dest,
			    const uint32_t *__restrict__ src,
			    const uint32_t *__restrict__ mask,
			    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl),
					rvv_in_load_s_m_m4 (ps, pm, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl),
					__riscv_vle32_v_u32m4 (ps, vl), vl),
			   vl);
	}
    }
}
   1253 
/* IN operator (unified alpha):
 *   dest = s * alpha (dest) / 255,
 * where s is src, optionally pre-scaled by the mask's alpha.  The
 * unmasked branch reuses rvv_in_load_s_m_m4 with pd as the "mask"
 * argument, since only its alpha bytes are read. */
static void
rvv_combine_in_u (pixman_implementation_t *__restrict__ imp,
		  pixman_op_t op,
		  uint32_t *__restrict__ dest,
		  const uint32_t *__restrict__ src,
		  const uint32_t *__restrict__ mask,
		  int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl),
				      rvv_load_alpha_u8m1 (pd, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pd, vl), vl);
	}
    }
}
   1284 
/* IN_REVERSE operator (unified alpha):
 *   dest = dest * alpha (s) / 255;
 * with a mask, alpha (s) is alpha (src) * alpha (mask) / 255, computed
 * on the strided-loaded alpha bytes only. */
static void
rvv_combine_in_reverse_u (pixman_implementation_t *__restrict__ imp,
			  pixman_op_t op,
			  uint32_t *__restrict__ dest,
			  const uint32_t *__restrict__ src,
			  const uint32_t *__restrict__ mask,
			  int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl),
				      rvv_UN8_MUL_UN8_vv_m1 (
					  rvv_load_alpha_u8m1 (ps, vl),
					  rvv_load_alpha_u8m1 (pm, vl), vl),
				      vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (pd, rvv_in_load_s_m_m4 (pd, ps, vl), vl);
	}
    }
}
   1318 
/* OUT operator (unified alpha):
 *   dest = s * (255 - alpha (dest)) / 255,
 * where s is src, optionally pre-scaled by the mask's alpha. */
static void
rvv_combine_out_u (pixman_implementation_t *__restrict__ imp,
		   pixman_op_t op,
		   uint32_t *__restrict__ dest,
		   const uint32_t *__restrict__ src,
		   const uint32_t *__restrict__ mask,
		   int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (pd,
			   rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl),
				      rvv_load_not_alpha_u8m1 (pd, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (ps, pd, vl), vl);
	}
    }
}
   1349 
/* OUT_REVERSE operator (unified alpha):
 *   dest = dest * (255 - alpha (s)) / 255;
 * with a mask, alpha (s) is alpha (src) * alpha (mask) / 255 and its
 * complement is taken with vnot before the scale. */
static void
rvv_combine_out_reverse_u (pixman_implementation_t *__restrict__ imp,
			   pixman_op_t op,
			   uint32_t *__restrict__ dest,
			   const uint32_t *__restrict__ src,
			   const uint32_t *__restrict__ mask,
			   int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (
		pd,
		rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl),
			   __riscv_vnot (rvv_UN8_MUL_UN8_vv_m1 (
					     rvv_load_alpha_u8m1 (ps, vl),
					     rvv_load_alpha_u8m1 (pm, vl), vl),
					 vl),
			   vl),
		vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (pd, ps, vl), vl);
	}
    }
}
   1385 
/* ATOP operator (unified alpha):
 *   dest = (s * alpha (dest) + dest * (255 - alpha (s))) / 255,
 * where s is src, optionally pre-scaled by the mask's alpha. */
static void
rvv_combine_atop_u (pixman_implementation_t *__restrict__ imp,
		    pixman_op_t op,
		    uint32_t *__restrict__ dest,
		    const uint32_t *__restrict__ src,
		    const uint32_t *__restrict__ mask,
		    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;
    vuint32m4_t s, d;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
	    d = __riscv_vle32_v_u32m4 (pd, vl);
	    __riscv_vse32 (pd,
			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       s, rvv_shift_alpha_u16 (d, vl), d,
			       rvv_shift_not_alpha_u16 (s, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    s = __riscv_vle32_v_u32m4 (ps, vl);
	    d = __riscv_vle32_v_u32m4 (pd, vl);
	    __riscv_vse32 (pd,
			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       s, rvv_shift_alpha_u16 (d, vl), d,
			       rvv_shift_not_alpha_u16 (s, vl), vl),
			   vl);
	}
    }
}
   1426 
/* ATOP_REVERSE operator (unified alpha):
 *   dest = (s * (255 - alpha (dest)) + dest * alpha (s)) / 255 —
 * the alpha factors of ATOP with the complements swapped. */
static void
rvv_combine_atop_reverse_u (pixman_implementation_t *__restrict__ imp,
			    pixman_op_t op,
			    uint32_t *__restrict__ dest,
			    const uint32_t *__restrict__ src,
			    const uint32_t *__restrict__ mask,
			    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;
    vuint32m4_t s, d;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
	    d = __riscv_vle32_v_u32m4 (pd, vl);
	    __riscv_vse32 (pd,
			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       s, rvv_shift_not_alpha_u16 (d, vl), d,
			       rvv_shift_alpha_u16 (s, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    s = __riscv_vle32_v_u32m4 (ps, vl);
	    d = __riscv_vle32_v_u32m4 (pd, vl);
	    __riscv_vse32 (pd,
			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       s, rvv_shift_not_alpha_u16 (d, vl), d,
			       rvv_shift_alpha_u16 (s, vl), vl),
			   vl);
	}
    }
}
   1467 
/* XOR operator (unified alpha):
 *   dest = (s * (255 - alpha (dest)) + dest * (255 - alpha (s))) / 255. */
static void
rvv_combine_xor_u (pixman_implementation_t *__restrict__ imp,
		   pixman_op_t op,
		   uint32_t *__restrict__ dest,
		   const uint32_t *__restrict__ src,
		   const uint32_t *__restrict__ mask,
		   int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;
    vuint32m4_t s, d;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
	    d = __riscv_vle32_v_u32m4 (pd, vl);
	    __riscv_vse32 (pd,
			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       s, rvv_shift_not_alpha_u16 (d, vl), d,
			       rvv_shift_not_alpha_u16 (s, vl), vl),
			   vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    s = __riscv_vle32_v_u32m4 (ps, vl);
	    d = __riscv_vle32_v_u32m4 (pd, vl);
	    __riscv_vse32 (pd,
			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       s, rvv_shift_not_alpha_u16 (d, vl), d,
			       rvv_shift_not_alpha_u16 (s, vl), vl),
			   vl);
	}
    }
}
   1508 
/* ADD operator (unified alpha): per-component saturating add,
 *   dest = min (dest + s, 255),
 * where s is src, optionally pre-scaled by the mask's alpha. */
static void
rvv_combine_add_u (pixman_implementation_t *__restrict__ imp,
		   pixman_op_t op,
		   uint32_t *__restrict__ dest,
		   const uint32_t *__restrict__ src,
		   const uint32_t *__restrict__ mask,
		   int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    if (mask)
    {
	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
	{
	    __riscv_vse32 (
		pd,
		rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
					   rvv_in_load_s_m_m4 (ps, pm, vl), vl),
		vl);
	}
    }
    else
    {
	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
	{
	    __riscv_vse32 (
		pd,
		rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
					   __riscv_vle32_v_u32m4 (ps, vl), vl),
		vl);
	}
    }
}
   1544 
   1545 /*
   1546 * Multiply
   1547 *
   1548 *      ad * as * B(d / ad, s / as)
   1549 *    = ad * as * d/ad * s/as
   1550 *    = d * s
   1551 *
   1552 */
   1553 static void
   1554 rvv_combine_multiply_u (pixman_implementation_t *imp,
   1555 		pixman_op_t              op,
   1556 		uint32_t *__restrict__ dest,
   1557 		const uint32_t *__restrict__ src,
   1558 		const uint32_t *__restrict__ mask,
   1559 		int width)
   1560 {
   1561    uint32_t *__restrict__ pd       = dest;
   1562    const uint32_t *__restrict__ ps = src;
   1563    const uint32_t *__restrict__ pm = mask;
   1564 
   1565    vuint32m4_t s, d;
   1566    if (mask)
   1567    {
   1568 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
   1569 {
   1570     s = rvv_in_load_s_m_m4 (ps, pm, vl);
   1571     d = __riscv_vle32_v_u32m4 (pd, vl);
   1572 
   1573     __riscv_vse32 (pd,
   1574 		   rvv_UN8x4_ADD_UN8x4_vv_m4 (
   1575 		       rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl),
   1576 		       rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
   1577 			   s, rvv_shift_not_alpha_u16 (d, vl), d,
   1578 			   rvv_shift_not_alpha_u16 (s, vl), vl),
   1579 		       vl),
   1580 		   vl);
   1581 }
   1582    }
   1583    else
   1584    {
   1585 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
   1586 {
   1587     s = __riscv_vle32_v_u32m4 (ps, vl);
   1588     d = __riscv_vle32_v_u32m4 (pd, vl);
   1589 
   1590     __riscv_vse32 (pd,
   1591 		   rvv_UN8x4_ADD_UN8x4_vv_m4 (
   1592 		       rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl),
   1593 		       rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
   1594 			   s, rvv_shift_not_alpha_u16 (d, vl), d,
   1595 			   rvv_shift_not_alpha_u16 (s, vl), vl),
   1596 		       vl),
   1597 		   vl);
   1598 }
   1599    }
   1600 }
   1601 
/*
 * Component-alpha multiply combiner:
 * dest = (d * ~m + s * ~alpha(d)) + d * s per packed 8-bit channel,
 * following the UN8x4 helper naming (a*b + c*d forms).
 */
static void
rvv_combine_multiply_ca (pixman_implementation_t *__restrict__ imp,
			 pixman_op_t op,
			 uint32_t *__restrict__ dest,
			 const uint32_t *__restrict__ src,
			 const uint32_t *__restrict__ mask,
			 int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t s, m, d;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	s = __riscv_vle32_v_u32m4 (ps, vl);
	m = __riscv_vle32_v_u32m4 (pm, vl);
	/* Combines s with the component mask in place; see
	 * rvv_combine_mask_ca_m4 for the exact s/m transformation. */
	rvv_combine_mask_ca_m4 (&s, &m, vl);

	d = __riscv_vle32_v_u32m4 (pd, vl);

	__riscv_vse32 (pd,
		       rvv_UN8x4_ADD_UN8x4_vv_m4 (
			   rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			       d, __riscv_vnot (m, vl), s,
			       rvv_shift_not_alpha_u16 (d, vl), vl),
			   rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), vl),
		       vl);
    }
}
   1633 
/*
 * Generates the unified (_u) and component-alpha (_ca) combiners for one
 * PDF separable blend mode.  rvv_blend_<name>_int must return, per 8-bit
 * channel, the blend term at 255*255 scale; the wrappers add the
 * (1 - sa) * d and (1 - da) * s terms, clamp to 255*255 (the vminu calls
 * below) and reduce with the DIV_ONE helpers.  Alpha is computed
 * separately as da*255 + sa*255 - sa*da and merged back in the top byte.
 *
 * NOTE(review): the _u variant uses RVV_FOREACH_3 and steps pm even when
 * mask is NULL (the load is skipped but the pointer is advanced) --
 * pointer arithmetic on NULL; confirm against RVV_FOREACH_3's expansion.
 */
#define PDF_SEPARABLE_BLEND_MODE(name)                                         \
    static void rvv_combine_##name##_u (                                       \
	pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest,          \
	const uint32_t *src, const uint32_t *mask, int width)                  \
    {                                                                          \
	uint32_t *__restrict__ pd       = dest;                                \
	const uint32_t *__restrict__ ps = src;                                 \
	const uint32_t *__restrict__ pm = mask;                                \
                                                                               \
	vuint32m2_t s, d, ra, rx;                                              \
	vuint16m1_t da, sa;                                                    \
	size_t      vl4;                                                       \
	vuint8m2_t  s4, d4, sa4, isa4, da4, ida4;                              \
	vuint32m8_t rx4;                                                       \
                                                                               \
	RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd)                           \
	{                                                                      \
	    /* vl4 covers the same pixels viewed as 4x as many bytes. */       \
	    vl4 = vl * 4;                                                      \
                                                                               \
	    s = __riscv_vle32_v_u32m2 (ps, vl);                                \
	    if (mask)                                                          \
		s = rvv_combine_mask_m2 (s, __riscv_vle32_v_u32m2 (pm, vl),    \
					 vl);                                  \
	    sa = rvv_shift_alpha_u16 (s, vl);                                  \
                                                                               \
	    d  = __riscv_vle32_v_u32m2 (pd, vl);                               \
	    da = rvv_shift_alpha_u16 (d, vl);                                  \
                                                                               \
	    /* Result alpha at 255*255 scale: da*255 + sa*255 - sa*da. */      \
	    ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \
						  __riscv_vmul (sa, 0xFF, vl), \
						  vl),                         \
			       __riscv_vwmulu (sa, da, vl), vl);               \
                                                                               \
	    /* Reinterpret pixels as byte lanes; broadcast alpha per pixel. */ \
	    s4   = RVV_U32_U8x4_m2 (s);                                        \
	    sa4  = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (sa, vl));       \
	    isa4 = __riscv_vnot (sa4, vl4);                                    \
	    d4   = RVV_U32_U8x4_m2 (d);                                        \
	    da4  = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl));       \
	    ida4 = __riscv_vnot (da4, vl4);                                    \
                                                                               \
	    /* (1-sa)*d + (1-da)*s + blend term, all at 255*255 scale. */      \
	    rx4 = __riscv_vadd (                                               \
		__riscv_vwaddu_vv (__riscv_vwmulu (isa4, d4, vl4),             \
				   __riscv_vwmulu (ida4, s4, vl4), vl4),       \
		rvv_blend_##name##_int (d4, da4, s4, sa4, vl4), vl4);          \
                                                                               \
	    ra  = __riscv_vminu (ra, 255 * 255, vl);                          \
	    rx4 = __riscv_vminu (rx4, 255 * 255, vl4);                        \
                                                                               \
	    ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl);                         \
	    rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4));      \
                                                                               \
	    /* Recombine: computed alpha in the top byte, color below. */      \
	    __riscv_vse32 (pd,                                                 \
			   __riscv_vor (__riscv_vsll (ra, 24, vl),             \
					__riscv_vand (rx, 0x00FFFFFF, vl),     \
					vl),                                   \
			   vl);                                                \
	}                                                                      \
    }                                                                          \
                                                                               \
    static void rvv_combine_##name##_ca (                                      \
	pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest,          \
	const uint32_t *src, const uint32_t *mask, int width)                  \
    {                                                                          \
	uint32_t *__restrict__ pd       = dest;                                \
	const uint32_t *__restrict__ ps = src;                                 \
	const uint32_t *__restrict__ pm = mask;                                \
                                                                               \
	vuint32m2_t s, m, d, ra, rx;                                           \
	vuint16m1_t da, sa;                                                    \
	size_t      vl4;                                                       \
	vuint8m2_t  s4, m4, d4, ixa4, da4, ida4;                               \
	vuint32m8_t rx4;                                                       \
                                                                               \
	RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd)                           \
	{                                                                      \
	    m = __riscv_vle32_v_u32m2 (pm, vl);                                \
	    s = __riscv_vle32_v_u32m2 (ps, vl);                                \
	    rvv_combine_mask_ca_m2 (&s, &m, vl);                               \
	    sa = rvv_shift_alpha_u16 (s, vl);                                  \
                                                                               \
	    d  = __riscv_vle32_v_u32m2 (pd, vl);                               \
	    da = rvv_shift_alpha_u16 (d, vl);                                  \
                                                                               \
	    /* Result alpha at 255*255 scale: da*255 + sa*255 - sa*da. */      \
	    ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \
						  __riscv_vmul (sa, 0xFF, vl), \
						  vl),                         \
			       __riscv_vwmulu (sa, da, vl), vl);               \
                                                                               \
	    /* Per-channel mask complement replaces the scalar ~sa here. */    \
	    ixa4 = RVV_U32_U8x4_m2 (__riscv_vnot (m, vl));                     \
	    d4   = RVV_U32_U8x4_m2 (d);                                        \
	    ida4 = RVV_U32_U8x4_m2 (                                           \
		__riscv_vnot (rvv_UN16_bcast_UN8x4_v_m2 (da, vl), vl));        \
	    s4  = RVV_U32_U8x4_m2 (s);                                         \
	    da4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl));        \
	    m4  = RVV_U32_U8x4_m2 (m);                                         \
                                                                               \
	    vl4 = vl * 4;                                                      \
	    rx4 = __riscv_vadd (                                               \
		__riscv_vwaddu_vv (__riscv_vwmulu (ixa4, d4, vl4),             \
				   __riscv_vwmulu (ida4, s4, vl4), vl4),       \
		rvv_blend_##name##_int (d4, da4, s4, m4, vl4), vl4);           \
                                                                               \
	    ra  = __riscv_vminu (ra, 255 * 255, vl);                          \
	    rx4 = __riscv_vminu (rx4, 255 * 255, vl4);                        \
                                                                               \
	    ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl);                         \
	    rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4));      \
                                                                               \
	    __riscv_vse32 (pd,                                                 \
			   __riscv_vor (__riscv_vsll (ra, 24, vl),             \
					__riscv_vand (rx, 0x00FFFFFF, vl),     \
					vl),                                   \
			   vl);                                                \
	}                                                                      \
    }
   1749 
   1750 static force_inline vuint32m8_t
   1751 rvv_blend_screen_int (const vuint8m2_t d,
   1752 	      const vuint8m2_t ad,
   1753 	      const vuint8m2_t s,
   1754 	      const vuint8m2_t as,
   1755 	      size_t           vl)
   1756 {
   1757    return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl),
   1758 				    __riscv_vwmulu (d, as, vl), vl),
   1759 		 __riscv_vwcvtu_x (__riscv_vwmulu (s, d, vl), vl), vl);
   1760 }
   1761 
   1762 PDF_SEPARABLE_BLEND_MODE (screen)
   1763 
/*
 * Shared core of the overlay and hard_light blend terms.  Where SELECTOR
 * is set the result is 2*s*d; elsewhere it is as*ad - 2*(ad - d)*(as - s),
 * evaluated in signed 16/32-bit arithmetic and reinterpreted back to
 * unsigned.  NOTE(review): the unsigned reinterpret assumes the signed
 * expression is non-negative for the lanes the selector routes here --
 * confirm against the callers' selector conditions.
 */
static force_inline vuint32m8_t
_rvv_blend_overlay_hard_light (const vuint8m2_t d,
			       const vuint8m2_t ad,
			       const vuint8m2_t s,
			       const vuint8m2_t as,
			       const vbool4_t   selector,
			       size_t           vl)
{
    /* Selector-true lanes: 2 * s * d. */
    vuint32m8_t out_true = __riscv_vwmulu (__riscv_vwmulu (s, d, vl), 2, vl);

    /* Sign-extend inputs so the subtractions below cannot wrap. */
    vint16m4_t d_i  = rvv_u8m2_to_i16m4 (d, vl);
    vint16m4_t ad_i = rvv_u8m2_to_i16m4 (ad, vl);
    vint16m4_t s_i  = rvv_u8m2_to_i16m4 (s, vl);
    vint16m4_t as_i = rvv_u8m2_to_i16m4 (as, vl);

    /* Selector-false lanes: as*ad - ((ad - d) * (as - s) << 1). */
    vuint32m8_t out_false = __riscv_vreinterpret_v_i32m8_u32m8 (__riscv_vsub (
	__riscv_vwmul (as_i, ad_i, vl),
	__riscv_vsll (__riscv_vwmul (__riscv_vsub (ad_i, d_i, vl),
				     __riscv_vsub (as_i, s_i, vl), vl),
		      1, vl),
	vl));

    /* vmerge picks out_true where the mask bit is set. */
    return __riscv_vmerge (out_false, out_true, selector, vl);
}
   1788 
   1789 static force_inline vuint32m8_t
   1790 rvv_blend_overlay_int (const vuint8m2_t d,
   1791 	       const vuint8m2_t ad,
   1792 	       const vuint8m2_t s,
   1793 	       const vuint8m2_t as,
   1794 	       size_t           vl)
   1795 {
   1796    return _rvv_blend_overlay_hard_light (
   1797 d, ad, s, as,
   1798 __riscv_vmsltu (__riscv_vwmulu (d, 2, vl), __riscv_vwcvtu_x (ad, vl),
   1799 		vl),
   1800 vl);
   1801 }
   1802 
   1803 PDF_SEPARABLE_BLEND_MODE (overlay)
   1804 
   1805 static force_inline vuint32m8_t
   1806 rvv_blend_darken_int (const vuint8m2_t d,
   1807 	      const vuint8m2_t ad,
   1808 	      const vuint8m2_t s,
   1809 	      const vuint8m2_t as,
   1810 	      size_t           vl)
   1811 {
   1812    return __riscv_vwcvtu_x (__riscv_vminu (__riscv_vwmulu (ad, s, vl),
   1813 				    __riscv_vwmulu (as, d, vl), vl),
   1814 		     vl);
   1815 }
   1816 
   1817 PDF_SEPARABLE_BLEND_MODE (darken)
   1818 
   1819 static force_inline vuint32m8_t
   1820 rvv_blend_lighten_int (const vuint8m2_t d,
   1821 	       const vuint8m2_t ad,
   1822 	       const vuint8m2_t s,
   1823 	       const vuint8m2_t as,
   1824 	       size_t           vl)
   1825 {
   1826    return __riscv_vwcvtu_x (__riscv_vmaxu (__riscv_vwmulu (as, d, vl),
   1827 				    __riscv_vwmulu (ad, s, vl), vl),
   1828 		     vl);
   1829 }
   1830 
   1831 PDF_SEPARABLE_BLEND_MODE (lighten)
   1832 
   1833 static force_inline vuint32m8_t
   1834 rvv_blend_hard_light_int (const vuint8m2_t d,
   1835 		  const vuint8m2_t ad,
   1836 		  const vuint8m2_t s,
   1837 		  const vuint8m2_t as,
   1838 		  size_t           vl)
   1839 {
   1840    return _rvv_blend_overlay_hard_light (
   1841 d, ad, s, as,
   1842 __riscv_vmsltu (__riscv_vwmulu (s, 2, vl), __riscv_vwcvtu_x (as, vl),
   1843 		vl),
   1844 vl);
   1845 }
   1846 
   1847 PDF_SEPARABLE_BLEND_MODE (hard_light)
   1848 
   1849 static force_inline vuint32m8_t
   1850 rvv_blend_difference_int (const vuint8m2_t d,
   1851 		  const vuint8m2_t ad,
   1852 		  const vuint8m2_t s,
   1853 		  const vuint8m2_t as,
   1854 		  size_t           vl)
   1855 {
   1856    vuint16m4_t das = __riscv_vwmulu (d, as, vl);
   1857    vuint16m4_t sad = __riscv_vwmulu (s, ad, vl);
   1858 
   1859    return __riscv_vmerge (__riscv_vwsubu_vv (sad, das, vl),
   1860 		   __riscv_vwsubu_vv (das, sad, vl),
   1861 		   __riscv_vmsltu (sad, das, vl), vl);
   1862 }
   1863 
   1864 PDF_SEPARABLE_BLEND_MODE (difference)
   1865 
   1866 static force_inline vuint32m8_t
   1867 rvv_blend_exclusion_int (const vuint8m2_t d,
   1868 		 const vuint8m2_t ad,
   1869 		 const vuint8m2_t s,
   1870 		 const vuint8m2_t as,
   1871 		 size_t           vl)
   1872 {
   1873    return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl),
   1874 				    __riscv_vwmulu (d, as, vl), vl),
   1875 		 __riscv_vwmulu (__riscv_vwmulu (d, s, vl), 2, vl), vl);
   1876 }
   1877 
   1878 PDF_SEPARABLE_BLEND_MODE (exclusion)
   1879 
   1880 #undef PDF_SEPARABLE_BLEND_MODE
   1881 
/*
 * Component-alpha OVER: dest = dest * ~m + s per packed 8-bit channel,
 * with s and m first combined by rvv_combine_mask_ca_m4.
 */
static void
rvv_combine_over_ca (pixman_implementation_t *__restrict__ imp,
		     pixman_op_t op,
		     uint32_t *__restrict__ dest,
		     const uint32_t *__restrict__ src,
		     const uint32_t *__restrict__ mask,
		     int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	s = __riscv_vle32_v_u32m4 (ps, vl);
	m = __riscv_vle32_v_u32m4 (pm, vl);
	rvv_combine_mask_ca_m4 (&s, &m, vl);

	/* dest * ~m + s, per the UN8x4_MUL_..._ADD helper naming. */
	__riscv_vse32 (
	    pd,
	    rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
		__riscv_vle32_v_u32m4 (pd, vl), __riscv_vnot (m, vl), s, vl),
	    vl);
    }
}
   1909 
/*
 * Component-alpha OVER-reverse: dest = (s * m) * ~alpha(d) + d per
 * packed 8-bit channel.
 */
static void
rvv_combine_over_reverse_ca (pixman_implementation_t *__restrict__ imp,
			     pixman_op_t op,
			     uint32_t *__restrict__ dest,
			     const uint32_t *__restrict__ src,
			     const uint32_t *__restrict__ mask,
			     int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	d = __riscv_vle32_v_u32m4 (pd, vl);
	/* (s * m) * ~alpha(d) + d. */
	__riscv_vse32 (
	    pd,
	    rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 (
		rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (ps, vl),
					   __riscv_vle32_v_u32m4 (pm, vl), vl),
		rvv_shift_not_alpha_u16 (d, vl), d, vl),
	    vl);
    }
}
   1936 
/*
 * Component-alpha ATOP: dest = d * ~m + s * alpha(d) per packed 8-bit
 * channel, with s and m pre-combined by rvv_combine_mask_ca_m4.
 */
static void
rvv_combine_atop_ca (pixman_implementation_t *__restrict__ imp,
		     pixman_op_t op,
		     uint32_t *__restrict__ dest,
		     const uint32_t *__restrict__ src,
		     const uint32_t *__restrict__ mask,
		     int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d, s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	s = __riscv_vle32_v_u32m4 (ps, vl);
	m = __riscv_vle32_v_u32m4 (pm, vl);
	rvv_combine_mask_ca_m4 (&s, &m, vl);

	d = __riscv_vle32_v_u32m4 (pd, vl);
	/* d * ~m + s * alpha(d). */
	__riscv_vse32 (
	    pd,
	    rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
		d, __riscv_vnot (m, vl), s, rvv_shift_alpha_u16 (d, vl), vl),
	    vl);
    }
}
   1965 
/*
 * Component-alpha XOR: dest = d * ~m + s * ~alpha(d) per packed 8-bit
 * channel, with s and m pre-combined by rvv_combine_mask_ca_m4.
 */
static void
rvv_combine_xor_ca (pixman_implementation_t *__restrict__ imp,
		    pixman_op_t op,
		    uint32_t *__restrict__ dest,
		    const uint32_t *__restrict__ src,
		    const uint32_t *__restrict__ mask,
		    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d, s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	s = __riscv_vle32_v_u32m4 (ps, vl);
	m = __riscv_vle32_v_u32m4 (pm, vl);
	rvv_combine_mask_ca_m4 (&s, &m, vl);

	d = __riscv_vle32_v_u32m4 (pd, vl);
	/* d * ~m + s * ~alpha(d); differs from atop only in the ~. */
	__riscv_vse32 (pd,
		       rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			   d, __riscv_vnot (m, vl), s,
			   rvv_shift_not_alpha_u16 (d, vl), vl),
		       vl);
    }
}
   1994 
/*
 * Component-alpha ATOP-reverse: dest = d * m + s * ~alpha(d) per packed
 * 8-bit channel (note: m here, not ~m as in atop).
 */
static void
rvv_combine_atop_reverse_ca (pixman_implementation_t *__restrict__ imp,
			     pixman_op_t op,
			     uint32_t *__restrict__ dest,
			     const uint32_t *__restrict__ src,
			     const uint32_t *__restrict__ mask,
			     int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d, s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	s = __riscv_vle32_v_u32m4 (ps, vl);
	m = __riscv_vle32_v_u32m4 (pm, vl);
	rvv_combine_mask_ca_m4 (&s, &m, vl);

	d = __riscv_vle32_v_u32m4 (pd, vl);
	__riscv_vse32 (pd,
		       rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
			   d, m, s, rvv_shift_not_alpha_u16 (d, vl), vl),
		       vl);
    }
}
   2022 
/*
 * Component-alpha SRC: dest = s combined with the component mask
 * (rvv_combine_mask_value_ca_m4); destination contents are ignored.
 */
static void
rvv_combine_src_ca (pixman_implementation_t *__restrict__ imp,
		    pixman_op_t op,
		    uint32_t *__restrict__ dest,
		    const uint32_t *__restrict__ src,
		    const uint32_t *__restrict__ mask,
		    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	__riscv_vse32 (
	    pd,
	    rvv_combine_mask_value_ca_m4 (__riscv_vle32_v_u32m4 (ps, vl),
					  __riscv_vle32_v_u32m4 (pm, vl), vl),
	    vl);
    }
}
   2044 
/*
 * Component-alpha IN: the masked source is scaled by the destination
 * alpha bytes loaded via rvv_load_alpha_u8m1.
 */
static void
rvv_combine_in_ca (pixman_implementation_t *__restrict__ imp,
		   pixman_op_t op,
		   uint32_t *__restrict__ dest,
		   const uint32_t *__restrict__ src,
		   const uint32_t *__restrict__ mask,
		   int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	__riscv_vse32 (pd,
		       rvv_in_m4 (rvv_combine_mask_value_ca_m4 (
				      __riscv_vle32_v_u32m4 (ps, vl),
				      __riscv_vle32_v_u32m4 (pm, vl), vl),
				  rvv_load_alpha_u8m1 (pd, vl), vl),
		       vl);
    }
}
   2067 
/*
 * Component-alpha IN-reverse: dest = d * alpha-component of (s, m)
 * (rvv_combine_mask_alpha_ca_m4) per packed 8-bit channel.
 */
static void
rvv_combine_in_reverse_ca (pixman_implementation_t *imp,
			   pixman_op_t              op,
			   uint32_t                *dest,
			   const uint32_t          *src,
			   const uint32_t          *mask,
			   int                      width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	__riscv_vse32 (
	    pd,
	    rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
				       rvv_combine_mask_alpha_ca_m4 (
					   __riscv_vle32_v_u32m4 (ps, vl),
					   __riscv_vle32_v_u32m4 (pm, vl), vl),
				       vl),
	    vl);
    }
}
   2092 
/*
 * Component-alpha OUT: like IN but scaled by the complemented destination
 * alpha bytes (rvv_load_not_alpha_u8m1).
 */
static void
rvv_combine_out_ca (pixman_implementation_t *__restrict__ imp,
		    pixman_op_t op,
		    uint32_t *__restrict__ dest,
		    const uint32_t *__restrict__ src,
		    const uint32_t *__restrict__ mask,
		    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	__riscv_vse32 (pd,
		       rvv_in_m4 (rvv_combine_mask_value_ca_m4 (
				      __riscv_vle32_v_u32m4 (ps, vl),
				      __riscv_vle32_v_u32m4 (pm, vl), vl),
				  rvv_load_not_alpha_u8m1 (pd, vl), vl),
		       vl);
    }
}
   2115 
/*
 * Component-alpha OUT-reverse: dest = d * ~(alpha-component of (s, m))
 * per packed 8-bit channel.
 */
static void
rvv_combine_out_reverse_ca (pixman_implementation_t *imp,
			    pixman_op_t              op,
			    uint32_t                *dest,
			    const uint32_t          *src,
			    const uint32_t          *mask,
			    int                      width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	__riscv_vse32 (
	    pd,
	    rvv_UN8x4_MUL_UN8x4_vv_m4 (
		__riscv_vle32_v_u32m4 (pd, vl),
		__riscv_vnot_v_u32m4 (rvv_combine_mask_alpha_ca_m4 (
					  __riscv_vle32_v_u32m4 (ps, vl),
					  __riscv_vle32_v_u32m4 (pm, vl), vl),
				      vl),
		vl),
	    vl);
    }
}
   2142 
/*
 * Component-alpha ADD: dest = d + (s * m), using the saturating
 * UN8x4_ADD helper on packed 8-bit channels.
 */
static void
rvv_combine_add_ca (pixman_implementation_t *__restrict__ imp,
		    pixman_op_t op,
		    uint32_t *__restrict__ dest,
		    const uint32_t *__restrict__ src,
		    const uint32_t *__restrict__ mask,
		    int width)
{
    uint32_t *__restrict__ pd       = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
	__riscv_vse32 (
	    pd,
	    rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
				       rvv_combine_mask_value_ca_m4 (
					   __riscv_vle32_v_u32m4 (ps, vl),
					   __riscv_vle32_v_u32m4 (pm, vl), vl),
				       vl),
	    vl);
    }
}
   2167 
/*
 * SRC composite x888 -> 8888: row-by-row copy that forces the alpha
 * byte to 0xFF (the OR with 0xff000000).
 */
static void
rvv_composite_src_x888_8888 (pixman_implementation_t *__restrict__ imp,
			     pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ src_line, *__restrict__ src;
    int32_t dst_stride, src_stride;
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
			   src_line, 1);
    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	RVV_FOREACH_2 (width, vl, e32m8, src, dst)
	{
	    /* Copy with alpha forced to 0xFF. */
	    __riscv_vse32 (
		dst,
		__riscv_vor (__riscv_vle32_v_u32m8 (src, vl), 0xff000000, vl),
		vl);
	}
    }
}
   2196 
/*
 * SRC composite 8888 -> 8888: a plain vectorized row copy (strided
 * memcpy of width pixels per scanline).
 */
static void
rvv_composite_src_8888_8888 (pixman_implementation_t *__restrict__ imp,
			     pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ src_line, *__restrict__ src;
    int32_t dst_stride, src_stride;
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
			   src_line, 1);
    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	RVV_FOREACH_2 (width, vl, e32m8, src, dst)
	{
	    __riscv_vse32 (dst, __riscv_vle32_v_u32m8 (src, vl), vl);
	}
    }
}
   2222 
/*
 * OVER composite x888 source with an a8 mask onto an 8888 destination:
 * each source pixel gets alpha forced to 0xFF, is scaled by the 8-bit
 * mask (rvv_in_m4), then composited OVER the destination.
 */
static void
rvv_composite_over_x888_8_8888 (pixman_implementation_t *__restrict__ imp,
				pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ src, *__restrict__ src_line;
    uint32_t *__restrict__ dst, *__restrict__ dst_line;
    uint8_t *__restrict__ mask, *__restrict__ mask_line;
    int32_t src_stride, mask_stride, dst_stride;
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
			   mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
			   src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	RVV_FOREACH_3 (width, vl, e32m4, src, mask, dst)
	{
	    /* over (in (src | 0xff000000, mask), dst) */
	    __riscv_vse32 (
		dst,
		rvv_over_m4 (
		    rvv_in_m4 (__riscv_vor (__riscv_vle32_v_u32m4 (src, vl),
					    0xff000000, vl),
			       __riscv_vle8_v_u8m1 (mask, vl), vl),
		    __riscv_vle32_v_u32m4 (dst, vl), vl),
		vl);
	}
    }
}
   2261 
/*
 * OVER composite 8888 -> 8888 without a mask: dst = over (src, dst)
 * per pixel via rvv_over_m4.
 */
static void
rvv_composite_over_8888_8888 (pixman_implementation_t *imp,
			      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int       dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
			   src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	RVV_FOREACH_2 (width, vl, e32m4, src, dst)
	{
	    __riscv_vse32 (dst,
			   rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl),
					__riscv_vle32_v_u32m4 (dst, vl), vl),
			   vl);
	}
    }
}
   2292 
/*
 * OVER composite of a solid color with an a8 mask onto an r5g6b5
 * destination.  The 0565 pixels are widened to 8888
 * (rvv_convert_0565_to_0888_m4), composited, and narrowed back.
 * A fully transparent solid (src == 0) leaves the destination
 * unchanged under OVER, hence the early return.
 */
static void
rvv_composite_over_n_8_0565 (pixman_implementation_t *imp,
			     pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *__restrict__ dst_line, *__restrict__ dst;
    uint8_t *__restrict__ mask_line, *__restrict__ mask;
    int         dst_stride, mask_stride;
    uint32_t    src;
    vuint32m4_t vsrc;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    if (src == 0)
	return;
    /* Broadcast the solid color once, outside the loops. */
    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
			   mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	RVV_FOREACH_2 (width, vl, e16m2, mask, dst)
	{
	    /* 0565 -> 8888, over (in (solid, mask), dst), -> 0565. */
	    __riscv_vse16 (
		dst,
		rvv_convert_8888_to_0565_m2 (
		    rvv_over_m4 (
			rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl),
			rvv_convert_0565_to_0888_m4 (
			    __riscv_vle16_v_u16m2 (dst, vl), vl),
			vl),
		    vl),
		vl);
	}
    }
}
   2336 
   2337 static void
   2338 rvv_composite_over_n_8_8888 (pixman_implementation_t *imp,
   2339 		     pixman_composite_info_t *info)
   2340 {
   2341    PIXMAN_COMPOSITE_ARGS (info);
   2342    uint32_t   *dst_line, *dst;
   2343    uint8_t    *mask_line, *mask;
   2344    int         dst_stride, mask_stride;
   2345    uint32_t    src;
   2346    vuint32m4_t vsrc;
   2347 
   2348    src  = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2349    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
   2350 
   2351    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
   2352 		   dst_line, 1);
   2353    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
   2354 		   mask_line, 1);
   2355 
   2356    while (height--)
   2357    {
   2358 dst = dst_line;
   2359 dst_line += dst_stride;
   2360 mask = mask_line;
   2361 mask_line += mask_stride;
   2362 
   2363 RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
   2364 {
   2365     __riscv_vse32 (
   2366 	dst,
   2367 	rvv_over_m4 (
   2368 	    rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl),
   2369 	    __riscv_vle32_v_u32m4 (dst, vl), vl),
   2370 	vl);
   2371 }
   2372    }
   2373 }
   2374 
/*
 * ADD composite of a solid color with a component-alpha (8888) mask:
 * dst = mask * solid + dst per packed 8-bit channel.  ADD of a zero
 * solid is a no-op, hence the early return.
 */
static void
rvv_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
				  pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t   *dst_line, *dst;
    uint32_t   *mask_line, *mask;
    int         dst_stride, mask_stride;
    uint32_t    src;
    vuint32m4_t vsrc;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    if (src == 0)
	return;
    /* Broadcast the solid color once, outside the loops. */
    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
			   mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
	{
	    /* mask * solid + dst. */
	    __riscv_vse32 (dst,
			   rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
			       __riscv_vle32_v_u32m4 (mask, vl), vsrc,
			       __riscv_vle32_v_u32m4 (dst, vl), vl),
			   vl);
	}
    }
}
   2413 
/*
 * Component-alpha OVER with solid source onto a 32-bit dest:
 * dest = dest * NOT (mask * src.a) + mask * src, per channel.
 */
static void
rvv_composite_over_n_8888_8888_ca (pixman_implementation_t *__restrict__ imp,
				   pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ mask_line, *__restrict__ mask;
    int         dst_stride, mask_stride;
    uint32_t    src, srca;
    vuint32m4_t vsrc;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    /* Transparent solid: OVER leaves the destination unchanged. */
    if (src == 0)
	return;
    srca = src >> 24;
    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
			   mask_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
	{
	    vuint32m4_t m = __riscv_vle32_v_u32m4 (mask, vl);
	    /* dst * ~(m * srca) + (m * src): the bitwise NOT of a UN8
	     * is its complement (255 - x), giving the OVER blend factor. */
	    __riscv_vse32 (
		dst,
		rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
		    __riscv_vle32_v_u32m4 (dst, vl),
		    __riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (m, srca, vl), vl),
		    rvv_UN8x4_MUL_UN8x4_vv_m4 (m, vsrc, vl), vl),
		vl);
	}
    }
}
   2456 
   2457 static void
   2458 rvv_composite_over_n_8888_0565_ca (pixman_implementation_t *__restrict__ imp,
   2459 			   pixman_composite_info_t *__restrict__ info)
   2460 {
   2461    PIXMAN_COMPOSITE_ARGS (info);
   2462    uint16_t *__restrict__ dst_line, *__restrict__ dst;
   2463    uint32_t *__restrict__ mask_line, *__restrict__ mask;
   2464    int         dst_stride, mask_stride;
   2465    uint32_t    src, srca;
   2466    vuint32m4_t vsrc;
   2467 
   2468    src  = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2469    srca = src >> 24;
   2470    if (src == 0)
   2471 return;
   2472    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
   2473 
   2474    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
   2475 		   dst_line, 1);
   2476    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
   2477 		   mask_line, 1);
   2478 
   2479    while (height--)
   2480    {
   2481 dst = dst_line;
   2482 dst_line += dst_stride;
   2483 mask = mask_line;
   2484 mask_line += mask_stride;
   2485 
   2486 RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
   2487 {
   2488     vuint32m4_t ma = __riscv_vle32_v_u32m4 (mask, vl);
   2489 
   2490     __riscv_vse16 (
   2491 	dst,
   2492 	rvv_convert_8888_to_0565_m2 (
   2493 	    rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
   2494 		rvv_convert_0565_to_0888_m4 (
   2495 		    __riscv_vle16_v_u16m2 (dst, vl), vl),
   2496 		__riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (ma, srca, vl),
   2497 			      vl),
   2498 		rvv_UN8x4_MUL_UN8x4_vv_m4 (ma, vsrc, vl), vl),
   2499 	    vl),
   2500 	vl);
   2501 }
   2502    }
   2503 }
   2504 
/*
 * OVER for an a8r8g8b8/a8b8g8r8 source onto r5g6b5/b5g6r5:
 * widen dest to 0888, blend OVER in 8-bit channels, repack to 0565.
 */
static void
rvv_composite_over_8888_0565 (pixman_implementation_t *__restrict__ imp,
			      pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ src_line, *__restrict__ src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
			   src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
			   dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	/* vl is chosen at e16m2 so the widened 32-bit data fits in m4. */
	RVV_FOREACH_2 (width, vl, e16m2, src, dst)
	{
	    __riscv_vse16 (
		dst,
		rvv_convert_8888_to_0565_m2 (
		    rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl),
				 rvv_convert_0565_to_0888_m4 (
				     __riscv_vle16_v_u16m2 (dst, vl), vl),
				 vl),
		    vl),
		vl);
	}
    }
}
   2540 
/*
 * ADD for an a8 source onto an a8 dest: dest = saturate (src + dest).
 */
static void
rvv_composite_add_8_8 (pixman_implementation_t *imp,
		       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int      dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride,
			   src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
			   dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	/* Pure 8-bit data, so the widest grouping (LMUL=8) can be used. */
	RVV_FOREACH_2 (width, vl, e8m8, src, dst)
	{
	    __riscv_vse8 (dst,
			  rvv_UN8_ADD_UN8_vv (__riscv_vle8_v_u8m8 (src, vl),
					      __riscv_vle8_v_u8m8 (dst, vl),
					      vl),
			  vl);
	}
    }
}
   2572 
/*
 * ADD for r5g6b5/b5g6r5 onto the same format: both operands are widened
 * to 8888, added with per-channel saturation, and repacked to 0565.
 */
static void
rvv_composite_add_0565_0565 (pixman_implementation_t *imp,
			     pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint16_t *src_line, *src;
    int       dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride,
			   src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
			   dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	/* vl is chosen at e16m2 so the widened 32-bit data fits in m4. */
	RVV_FOREACH_2 (width, vl, e16m2, src, dst)
	{
	    __riscv_vse16 (dst,
			   rvv_convert_8888_to_0565_m2 (
			       rvv_UN8x4_ADD_UN8x4_vv_m4 (
				   rvv_convert_0565_to_8888_m4 (
				       __riscv_vle16_v_u16m2 (src, vl), vl),
				   rvv_convert_0565_to_8888_m4 (
				       __riscv_vle16_v_u16m2 (dst, vl), vl),
				   vl),
			       vl),
			   vl);
	}
    }
}
   2609 
/*
 * ADD for 32-bit source onto a 32-bit dest:
 * dest = saturate (src + dest), per channel.
 */
static void
rvv_composite_add_8888_8888 (pixman_implementation_t *__restrict__ imp,
			     pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ src_line, *__restrict__ src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
			   src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
			   dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	RVV_FOREACH_2 (width, vl, e32m4, src, dst)
	{
	    __riscv_vse32 (
		dst,
		rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (src, vl),
					   __riscv_vle32_v_u32m4 (dst, vl), vl),
		vl);
	}
    }
}
   2641 
   2642 static void
   2643 rvv_composite_add_n_8_8 (pixman_implementation_t *imp,
   2644 		 pixman_composite_info_t *info)
   2645 {
   2646    PIXMAN_COMPOSITE_ARGS (info);
   2647    uint8_t *dst_line, *dst;
   2648    uint8_t *mask_line, *mask;
   2649    int      dst_stride, mask_stride;
   2650    uint32_t src;
   2651    uint8_t  sa;
   2652 
   2653    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
   2654 		   dst_line, 1);
   2655    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
   2656 		   mask_line, 1);
   2657    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2658    sa  = (src >> 24);
   2659 
   2660    while (height--)
   2661    {
   2662 dst = dst_line;
   2663 dst_line += dst_stride;
   2664 mask = mask_line;
   2665 mask_line += mask_stride;
   2666 
   2667 RVV_FOREACH_2 (width, vl, e8m4, mask, dst)
   2668 {
   2669     __riscv_vse8 (
   2670 	dst,
   2671 	rvv_UN8_ADD_UN8_vv (rvv_UN8_MUL_UN8_vx_m4 (
   2672 				__riscv_vle8_v_u8m4 (mask, vl), sa, vl),
   2673 			    __riscv_vle8_v_u8m4 (dst, vl), vl),
   2674 	vl);
   2675 }
   2676    }
   2677 }
   2678 
/*
 * SRC between same-layout images: each output row is a plain byte copy.
 */
static void
rvv_composite_src_memcpy (pixman_implementation_t *imp,
			  pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    int      bpp     = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
    uint32_t n_bytes = width * bpp;
    int      dst_stride, src_stride;
    uint8_t *dst;
    uint8_t *src;

    /* rowstride is counted in uint32_t units; convert it to bytes. */
    src_stride = src_image->bits.rowstride * 4;
    dst_stride = dest_image->bits.rowstride * 4;

    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;

    while (height--)
    {
	memcpy (dst, src, n_bytes);

	dst += dst_stride;
	src += src_stride;
    }
}
   2704 
/*
 * IN with solid source and a8 mask onto an a8 dest:
 * dest = dest * (mask * src.a), with a fast path for opaque src.a.
 */
static void
rvv_composite_in_n_8_8 (pixman_implementation_t *imp,
			pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int      dst_stride, mask_stride;

    src  = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    srca = src >> 24;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
			   dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
			   mask_line, 1);

    if (srca == 0xff)
    {
	/* Opaque source alpha: mask * 0xff == mask, so the per-pixel
	 * scaling by srca can be skipped entirely. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;

	    RVV_FOREACH_2 (width, vl, e8m4, mask, dst)
	    {
		__riscv_vse8 (
		    dst,
		    rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (mask, vl),
					   __riscv_vle8_v_u8m4 (dst, vl), vl),
		    vl);
	    }
	}
    }
    else
    {
	/* General case: scale the mask by srca before multiplying. */
	while (height--)
	{
	    dst = dst_line;
	    dst_line += dst_stride;
	    mask = mask_line;
	    mask_line += mask_stride;

	    RVV_FOREACH_2 (width, vl, e8m4, mask, dst)
	    {
		__riscv_vse8 (dst,
			      rvv_UN8_MUL_UN8_vv_m4 (
				  rvv_UN8_MUL_UN8_vx_m4 (
				      __riscv_vle8_v_u8m4 (mask, vl), srca, vl),
				  __riscv_vle8_v_u8m4 (dst, vl), vl),
			      vl);
	    }
	}
    }
}
   2763 
/*
 * IN for an a8 source onto an a8 dest: dest = dest * src.
 */
static void
rvv_composite_in_8_8 (pixman_implementation_t *imp,
		      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int      dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride,
			   src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
			   dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;

	RVV_FOREACH_2 (width, vl, e8m4, src, dst)
	{
	    __riscv_vse8 (dst,
			  rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (src, vl),
						 __riscv_vle8_v_u8m4 (dst, vl),
						 vl),
			  vl);
	}
    }
}
   2795 
   2796 #define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
   2797 
   2798 /*
   2799 * There is some potential for hand vectorization, but for now let's leave it
   2800 * autovectorized.
   2801 */
   2802 static force_inline void
   2803 pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
   2804 {
   2805    if (offs)
   2806    {
   2807 int leading_pixels = 32 - offs;
   2808 if (leading_pixels >= width)
   2809 {
   2810     if (v)
   2811 	*dst |= A1_FILL_MASK (width, offs);
   2812     else
   2813 	*dst &= ~A1_FILL_MASK (width, offs);
   2814     return;
   2815 }
   2816 else
   2817 {
   2818     if (v)
   2819 	*dst++ |= A1_FILL_MASK (leading_pixels, offs);
   2820     else
   2821 	*dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
   2822     width -= leading_pixels;
   2823 }
   2824    }
   2825    while (width >= 32)
   2826    {
   2827 if (v)
   2828     *dst++ = 0xFFFFFFFF;
   2829 else
   2830     *dst++ = 0;
   2831 width -= 32;
   2832    }
   2833    if (width > 0)
   2834    {
   2835 if (v)
   2836     *dst |= A1_FILL_MASK (width, 0);
   2837 else
   2838     *dst &= ~A1_FILL_MASK (width, 0);
   2839    }
   2840 }
   2841 
   2842 static void
   2843 rvv_fill1 (uint32_t *bits,
   2844    int       stride,
   2845    int       x,
   2846    int       y,
   2847    int       width,
   2848    int       height,
   2849    uint32_t  filler)
   2850 {
   2851    uint32_t *dst  = bits + y * stride + (x >> 5);
   2852    int       offs = x & 31;
   2853 
   2854    while (height--)
   2855    {
   2856 pixman_fill1_line (dst, offs, width, (filler & 1));
   2857 dst += stride;
   2858    }
   2859 }
   2860 
/*
 * Generate rvv_fill_u8/u16/u32: fill a rectangle of `dtypew`-bit pixels
 * with the (truncated) filler value.  The incoming stride is counted in
 * uint32_t units and is rescaled to `dtypew`-bit elements; the filler is
 * broadcast once to a full LMUL=8 vector outside the loops.
 */
#define RVV_FILL(dtypew)                                                            \
    static void rvv_fill_u##dtypew (uint32_t *__restrict__ bits, int stride,        \
				    int x, int y, int width, int height,            \
				    uint32_t filler)                                \
    {                                                                               \
	uint##dtypew##_t *__restrict__ bitsw = (uint##dtypew##_t *)bits;            \
	int32_t             vstride          = stride * (32 / dtypew);              \
	vuint##dtypew##m8_t vfiller          = __riscv_vmv_v_x_u##dtypew##m8 (      \
            (uint##dtypew##_t)filler, __riscv_vsetvlmax_e##dtypew##m8 ()); \
                                                                                    \
	bitsw += y * vstride + x;                                                   \
	while (height--)                                                            \
	{                                                                           \
	    uint##dtypew##_t *__restrict__ d = bitsw;                               \
                                                                                    \
	    RVV_FOREACH_1 (width, vl, e##dtypew##m8, d)                             \
	    {                                                                       \
		__riscv_vse##dtypew (d, vfiller, vl);                               \
	    }                                                                       \
                                                                                    \
	    bitsw += vstride;                                                       \
	}                                                                           \
    }

RVV_FILL (8);
RVV_FILL (16);
RVV_FILL (32);
   2888 
   2889 static pixman_bool_t
   2890 rvv_fill (pixman_implementation_t *__restrict__ imp,
   2891   uint32_t *__restrict__ bits,
   2892   int      stride,
   2893   int      bpp,
   2894   int      x,
   2895   int      y,
   2896   int      width,
   2897   int      height,
   2898   uint32_t filler)
   2899 {
   2900    switch (bpp)
   2901    {
   2902 case 1:
   2903     rvv_fill1 (bits, stride, x, y, width, height, filler);
   2904     break;
   2905 case 8:
   2906     rvv_fill_u8 (bits, stride, x, y, width, height, filler);
   2907     break;
   2908 case 16:
   2909     rvv_fill_u16 (bits, stride, x, y, width, height, filler);
   2910     break;
   2911 case 32:
   2912     rvv_fill_u32 (bits, stride, x, y, width, height, filler);
   2913     break;
   2914 default:
   2915     return FALSE;
   2916    }
   2917 
   2918    return TRUE;
   2919 }
   2920 
   2921 static void
   2922 rvv_composite_solid_fill (pixman_implementation_t *imp,
   2923 		  pixman_composite_info_t *info)
   2924 {
   2925    PIXMAN_COMPOSITE_ARGS (info);
   2926    uint32_t src;
   2927 
   2928    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
   2929 
   2930    if (dest_image->bits.format == PIXMAN_a1)
   2931    {
   2932 src = src >> 31;
   2933    }
   2934    else if (dest_image->bits.format == PIXMAN_a8)
   2935    {
   2936 src = src >> 24;
   2937    }
   2938    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
   2939      dest_image->bits.format == PIXMAN_b5g6r5)
   2940    {
   2941 src = convert_8888_to_0565 (src);
   2942    }
   2943 
   2944    rvv_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
   2945       PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y,
   2946       width, height, src);
   2947 }
   2948 
/*
 * Generate rvv_blt_u8/u16/u32: copy a `width` x `height` rectangle of
 * `dtypew`-bit pixels between two images.  Strides come in counted in
 * uint32_t units and are rescaled to `dtypew`-bit elements.  Rows are
 * assumed not to overlap (a straight vector load/store copy is used).
 */
#define RVV_BLT(dtypew)                                                        \
    static void rvv_blt_u##dtypew (                                            \
	uint32_t *__restrict__ src_bits, uint32_t *__restrict__ dst_bits,      \
	int src_stride, int dst_stride, int src_x, int src_y, int dest_x,      \
	int dest_y, int width, int height)                                     \
    {                                                                          \
	uint##dtypew##_t *src_w = (uint##dtypew##_t *)src_bits;                \
	uint##dtypew##_t *dst_w = (uint##dtypew##_t *)dst_bits;                \
                                                                               \
	src_stride = src_stride * (32 / dtypew);                               \
	dst_stride = dst_stride * (32 / dtypew);                               \
                                                                               \
	src_w += src_stride * src_y + src_x;                                   \
	dst_w += dst_stride * dest_y + dest_x;                                 \
                                                                               \
	while (height--)                                                       \
	{                                                                      \
	    uint##dtypew##_t *__restrict__ pd = dst_w;                         \
	    uint##dtypew##_t *__restrict__ ps = src_w;                         \
                                                                               \
	    RVV_FOREACH_2 (width, vl, e##dtypew##m8, ps, pd)                   \
	    {                                                                  \
		__riscv_vse##dtypew (                                          \
		    pd, __riscv_vle##dtypew##_v_u##dtypew##m8 (ps, vl), vl);   \
	    }                                                                  \
                                                                               \
	    dst_w += dst_stride;                                               \
	    src_w += src_stride;                                               \
	}                                                                      \
    }
RVV_BLT (8);
RVV_BLT (16);
RVV_BLT (32);
   2982 
   2983 static pixman_bool_t
   2984 rvv_blt (pixman_implementation_t *__restrict__ imp,
   2985  uint32_t *__restrict__ src_bits,
   2986  uint32_t *__restrict__ dst_bits,
   2987  int src_stride,
   2988  int dst_stride,
   2989  int src_bpp,
   2990  int dst_bpp,
   2991  int src_x,
   2992  int src_y,
   2993  int dest_x,
   2994  int dest_y,
   2995  int width,
   2996  int height)
   2997 {
   2998    if (src_bpp != dst_bpp)
   2999 return FALSE;
   3000 
   3001    switch (src_bpp)
   3002    {
   3003 case 8:
   3004     rvv_blt_u8 (src_bits, dst_bits, src_stride, dst_stride, src_x,
   3005 		src_y, dest_x, dest_y, width, height);
   3006     break;
   3007 case 16:
   3008     rvv_blt_u16 (src_bits, dst_bits, src_stride, dst_stride, src_x,
   3009 		 src_y, dest_x, dest_y, width, height);
   3010     break;
   3011 case 32:
   3012     rvv_blt_u32 (src_bits, dst_bits, src_stride, dst_stride, src_x,
   3013 		 src_y, dest_x, dest_y, width, height);
   3014     break;
   3015 default:
   3016     return FALSE;
   3017    }
   3018 
   3019    return TRUE;
   3020 }
   3021 
// clang-format off
/*
 * Fast-path table: (operator, source format, mask format, dest format)
 * tuples mapped to the specialized kernels above.  Scanned in order;
 * commented-out rows mark candidates not yet implemented for RVV.
 */
static const pixman_fast_path_t rvv_fast_paths[] = {
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, rvv_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, rvv_composite_over_n_8_0565),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, rvv_composite_over_n_8_0888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, rvv_composite_over_n_8_0888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, rvv_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, rvv_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, rvv_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, rvv_composite_over_n_8_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   rvv_composite_over_n_1_0565),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   rvv_composite_over_n_1_0565),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, rvv_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, rvv_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, rvv_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, rvv_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, rvv_composite_add_0565_0565),
    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, rvv_composite_add_0565_0565),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, rvv_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, rvv_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, rvv_composite_add_8_8),
    // PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1),
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, rvv_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, rvv_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, rvv_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, rvv_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, rvv_composite_src_memcpy),
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, rvv_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, rvv_composite_in_n_8_8),
    /* OVER with an x888 source has no alpha, so it degenerates to SRC. */
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),

    {PIXMAN_OP_NONE},
};
   3094 
   3095 pixman_implementation_t *
   3096 _pixman_implementation_create_rvv (pixman_implementation_t *fallback)
   3097 {
   3098    pixman_implementation_t *imp = _pixman_implementation_create (
   3099 fallback, rvv_fast_paths);
   3100 
   3101    // clang-format off
   3102    imp->combine_float[PIXMAN_OP_CLEAR] = rvv_combine_clear_u_float;
   3103    imp->combine_float[PIXMAN_OP_SRC] = rvv_combine_src_u_float;
   3104    imp->combine_float[PIXMAN_OP_DST] = rvv_combine_dst_u_float;
   3105    imp->combine_float[PIXMAN_OP_OVER] = rvv_combine_over_u_float;
   3106    imp->combine_float[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u_float;
   3107    imp->combine_float[PIXMAN_OP_IN] = rvv_combine_in_u_float;
   3108    imp->combine_float[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u_float;
   3109    imp->combine_float[PIXMAN_OP_OUT] = rvv_combine_out_u_float;
   3110    imp->combine_float[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u_float;
   3111    imp->combine_float[PIXMAN_OP_ATOP] = rvv_combine_atop_u_float;
   3112    imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u_float;
   3113    imp->combine_float[PIXMAN_OP_XOR] = rvv_combine_xor_u_float;
   3114    imp->combine_float[PIXMAN_OP_ADD] = rvv_combine_add_u_float;
   3115    imp->combine_float[PIXMAN_OP_SATURATE] = rvv_combine_saturate_u_float;
   3116 
   3117    /* Disjoint, unified */
   3118    imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_u_float;
   3119    imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_u_float;
   3120    imp->combine_float[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_u_float;
   3121    imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_u_float;
   3122    imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_u_float;
   3123    imp->combine_float[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_u_float;
   3124    imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_u_float;
   3125    imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_u_float;
   3126    imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_u_float;
   3127    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_u_float;
   3128    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_u_float;
   3129    imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_u_float;
   3130 
   3131    /* Conjoint, unified */
   3132    imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_u_float;
   3133    imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_u_float;
   3134    imp->combine_float[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_u_float;
   3135    imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_u_float;
   3136    imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_u_float;
   3137    imp->combine_float[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_u_float;
   3138    imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_u_float;
   3139    imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_u_float;
   3140    imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_u_float;
   3141    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_u_float;
   3142    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_u_float;
   3143    imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_u_float;
   3144 
   3145    /* PDF operators, unified */
   3146    imp->combine_float[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u_float;
   3147    imp->combine_float[PIXMAN_OP_SCREEN] = rvv_combine_screen_u_float;
   3148    imp->combine_float[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u_float;
   3149    imp->combine_float[PIXMAN_OP_DARKEN] = rvv_combine_darken_u_float;
   3150    imp->combine_float[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u_float;
   3151    imp->combine_float[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u_float;
   3152    imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_u_float;
   3153    imp->combine_float[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u_float;
   3154    imp->combine_float[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u_float;
   3155    imp->combine_float[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_u_float;
   3156    imp->combine_float[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_u_float;
   3157 
   3158    /* Component alpha combiners */
   3159    imp->combine_float_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear_ca_float;
   3160    imp->combine_float_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca_float;
   3161    imp->combine_float_ca[PIXMAN_OP_DST] = rvv_combine_dst_ca_float;
   3162    imp->combine_float_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca_float;
   3163    imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca_float;
   3164    imp->combine_float_ca[PIXMAN_OP_IN] = rvv_combine_in_ca_float;
   3165    imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca_float;
   3166    imp->combine_float_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca_float;
   3167    imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca_float;
   3168    imp->combine_float_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca_float;
   3169    imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca_float;
   3170    imp->combine_float_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca_float;
   3171    imp->combine_float_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca_float;
   3172    imp->combine_float_ca[PIXMAN_OP_SATURATE] = rvv_combine_saturate_ca_float;
   3173 
   3174    /* Disjoint CA */
   3175    imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_ca_float;
   3176    imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_ca_float;
   3177    imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_ca_float;
   3178    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_ca_float;
   3179    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_ca_float;
   3180    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_ca_float;
   3181    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_ca_float;
   3182    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_ca_float;
   3183    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_ca_float;
   3184    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_ca_float;
   3185    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_ca_float;
   3186    imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_ca_float;
   3187 
   3188    /* Conjoint CA */
   3189    imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_ca_float;
   3190    imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_ca_float;
   3191    imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_ca_float;
   3192    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_ca_float;
   3193    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_ca_float;
   3194    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_ca_float;
   3195    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_ca_float;
   3196    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_ca_float;
   3197    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_ca_float;
   3198    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_ca_float;
   3199    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_ca_float;
   3200    imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_ca_float;
   3201 
   3202    /* PDF operators CA */
   3203    imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca_float;
   3204    imp->combine_float_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca_float;
   3205    imp->combine_float_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca_float;
   3206    imp->combine_float_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca_float;
   3207    imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca_float;
   3208    imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_ca_float;
   3209    imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_ca_float;
   3210    imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca_float;
   3211    imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_ca_float;
   3212    imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca_float;
   3213    imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca_float;
   3214 
   3215    /* It is not clear that these make sense, so make them noops for now */
   3216    imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = rvv_combine_dst_u_float;
   3217    imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = rvv_combine_dst_u_float;
   3218    imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = rvv_combine_dst_u_float;
   3219    imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = rvv_combine_dst_u_float;
   3220 
   3221    /* Set up function pointers */
   3222    imp->combine_32[PIXMAN_OP_CLEAR] = rvv_combine_clear;
   3223    imp->combine_32[PIXMAN_OP_SRC] = rvv_combine_src_u;
   3224    imp->combine_32[PIXMAN_OP_OVER] = rvv_combine_over_u;
   3225    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u;
   3226    imp->combine_32[PIXMAN_OP_IN] = rvv_combine_in_u;
   3227    imp->combine_32[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u;
   3228    imp->combine_32[PIXMAN_OP_OUT] = rvv_combine_out_u;
   3229    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u;
   3230    imp->combine_32[PIXMAN_OP_ATOP] = rvv_combine_atop_u;
   3231    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u;
   3232    imp->combine_32[PIXMAN_OP_XOR] = rvv_combine_xor_u;
   3233    imp->combine_32[PIXMAN_OP_ADD] = rvv_combine_add_u;
   3234 
   3235    imp->combine_32[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u;
   3236    imp->combine_32[PIXMAN_OP_SCREEN] = rvv_combine_screen_u;
   3237    imp->combine_32[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u;
   3238    imp->combine_32[PIXMAN_OP_DARKEN] = rvv_combine_darken_u;
   3239    imp->combine_32[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u;
   3240    imp->combine_32[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u;
   3241    imp->combine_32[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u;
   3242    imp->combine_32[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u;
   3243 
   3244    imp->combine_32_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear;
   3245    imp->combine_32_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca;
   3246    imp->combine_32_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca;
   3247    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca;
   3248    imp->combine_32_ca[PIXMAN_OP_IN] = rvv_combine_in_ca;
   3249    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca;
   3250    imp->combine_32_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca;
   3251    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca;
   3252    imp->combine_32_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca;
   3253    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca;
   3254    imp->combine_32_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca;
   3255    imp->combine_32_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca;
   3256 
   3257    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca;
   3258    imp->combine_32_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca;
   3259    imp->combine_32_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca;
   3260    imp->combine_32_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca;
   3261    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca;
   3262    imp->combine_32_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca;
   3263    imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca;
   3264    imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca;
   3265 
   3266    imp->fill = rvv_fill;
   3267    imp->blt = rvv_blt;
   3268 
   3269    return imp;
   3270 }
   3271 // clang-format on