pixman-rvv.c (113605B)
1 /* 2 * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. 3 * 2005 Lars Knoll & Zack Rusin, Trolltech 4 * 2024 Filip Wasil, Samsung Electronics 5 * 2024 Bernard Gingold, Samsung Electronics 6 * 2025 Marek Pikuła, Samsung Electronics 7 * Permission to use, copy, modify, distribute, and sell this software and its 8 * documentation for any purpose is hereby granted without fee, provided that 9 * the above copyright notice appear in all copies and that both that 10 * copyright notice and this permission notice appear in supporting 11 * documentation, and that the name of Keith Packard not be used in 12 * advertising or publicity pertaining to distribution of the software without 13 * specific, written prior permission. Keith Packard makes no 14 * representations about the suitability of this software for any purpose. It 15 * is provided "as is" without express or implied warranty. 16 * 17 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS 18 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 19 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 21 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 22 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 23 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 24 * SOFTWARE. 
 */

#ifdef HAVE_CONFIG_H
#include <pixman-config.h>
#endif

#include "pixman-combine-float.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
#include "pixman-private.h"

#include <riscv_vector.h>

#include <float.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Convenience macros {

/* Advance pointer p by vl elements; used as a step expression in the
 * RVV_FOREACH_* loops below. */
#define __FE_PTR(p, vl) ((p) += (vl))

/* Loop "init; condition" for a strip-mined RVV loop: declares the
 * remaining-element counter vn and the per-iteration vector length vl
 * (vspec selects the vsetvl variant, e.g. e32m4), then tests vn > 0. */
#define _RVV_FE_PRE(total_len, vn, vl, vspec)                                  \
    size_t vn = total_len, vl = __riscv_vsetvl_##vspec (vn);                   \
    vn > 0

/* Loop step: consume the vl elements just processed and recompute the
 * vector length for the next iteration. */
#define _RVV_FE_POST(vn, vl, vspec) vn -= (vl), vl = __riscv_vsetvl_##vspec (vn)

/* Strip-mined foreach over total_len elements, advancing one, two or
 * three pointers by vl elements per iteration. */
#define RVV_FOREACH_1(total_len, vl, vspec, p1)                                \
    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
         __FE_PTR (p1, vl), _RVV_FE_POST (vn, vl, vspec))

#define RVV_FOREACH_2(total_len, vl, vspec, p1, p2)                            \
    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
         __FE_PTR (p1, vl), __FE_PTR (p2, vl), _RVV_FE_POST (vn, vl, vspec))

#define RVV_FOREACH_3(total_len, vl, vspec, p1, p2, p3)                        \
    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
         __FE_PTR (p1, vl), __FE_PTR (p2, vl), __FE_PTR (p3, vl),              \
         _RVV_FE_POST (vn, vl, vspec))

// vuintXXmYY_t for use in macros (less token concatenation).
#define VUINT(ELEN, LMUL) vuint##ELEN##LMUL##_t
#define VUINT32(LMUL) VUINT (32, LMUL)
#define VUINT16(LMUL) VUINT (16, LMUL)
#define VUINT8(LMUL) VUINT (8, LMUL)

// Short for vreinterpret commonly used for ARGB batch operations.
/* Reinterpret a vector of 4x u8 lanes as u32 lanes (and back), so one
 * 32-bit ARGB pixel maps onto four consecutive 8-bit channel lanes. */
#define RVV_U8x4_U32(LMUL, value)                                              \
    __riscv_vreinterpret_v_u8##LMUL##_u32##LMUL (value)
#define RVV_U8x4_U32_m2(value) RVV_U8x4_U32 (m2, value)
#define RVV_U8x4_U32_m4(value) RVV_U8x4_U32 (m4, value)

#define RVV_U32_U8x4(LMUL, value)                                              \
    __riscv_vreinterpret_v_u32##LMUL##_u8##LMUL (value)
#define RVV_U32_U8x4_m2(value) RVV_U32_U8x4 (m2, value)
#define RVV_U32_U8x4_m4(value) RVV_U32_U8x4 (m4, value)

// }

// Float implementation

/*
 * Screen
 *
 * ad * as * B(d/ad, s/as)
 *   = ad * as * (d/ad + s/as - s/as * d/ad)
 *   = ad * s + as * d - s * d
 */

/* Per-channel Screen blend for vl lanes of premultiplied float data.
 * sa/da are the source/dest alphas, s/d the channel values. */
static force_inline vfloat32m1_t
rvv_blend_screen_float (const vfloat32m1_t sa,
			const vfloat32m1_t s,
			const vfloat32m1_t da,
			const vfloat32m1_t d,
			size_t             vl)
{
    vfloat32m1_t t0, t1, t2;
    t0 = __riscv_vfmul_vv_f32m1 (s, da, vl);
    t1 = __riscv_vfmul_vv_f32m1 (d, sa, vl);
    t2 = __riscv_vfmul_vv_f32m1 (s, d, vl);
    return __riscv_vfsub_vv_f32m1 (__riscv_vfadd_vv_f32m1 (t0, t1, vl), t2, vl);
}

/*
 * Multiply
 *
 * ad * as * B(d / ad, s / as)
 *   = ad * as * d/ad * s/as
 *   = d * s
 *
 */

/* Per-channel Multiply blend; alphas are unused because the
 * premultiplied form reduces to s * d (see derivation above). */
static force_inline vfloat32m1_t
rvv_blend_multiply_float (const vfloat32m1_t sa,
			  const vfloat32m1_t s,
			  const vfloat32m1_t da,
			  const vfloat32m1_t d,
			  size_t             vl)
{
    return __riscv_vfmul_vv_f32m1 (s, d, vl);
}

/*
 * Overlay
 *
 * ad * as * B(d/ad, s/as)
 *   = ad * as * Hardlight (s, d)
 *   = if (d / ad < 0.5)
 *       as * ad * Multiply (s/as, 2 * d/ad)
 *     else
 *       as * ad * Screen (s/as, 2 * d / ad - 1)
 *   = if (d < 0.5 * ad)
 *       as * ad * s/as * 2 * d /ad
 *     else
 *       as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1))
 *   = if (2 * d < ad)
 *       2 * s * d
 *     else
 *       ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1)
 *   = if (2 * d < ad)
 *       2 * s * d
 *     else
 *       as * ad - 2 * (ad - d) * (as - s)
 */

/* Per-channel Overlay blend; implements the branch-free form of the
 * derivation above using a lane mask (vb = 2*d < da). */
static force_inline vfloat32m1_t
rvv_blend_overlay_float (const vfloat32m1_t sa,
			 const vfloat32m1_t s,
			 const vfloat32m1_t da,
			 const vfloat32m1_t d,
			 size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4, f0, f1, f2;
    vbool32_t    vb;
    t0 = __riscv_vfadd_vv_f32m1 (d, d, vl);
    t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl);
    vb = __riscv_vmflt_vv_f32m1_b32 (t0, da, vl);
    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl);
    f2 = __riscv_vfsub_vv_f32m1 (da, d, vl);
    t3 = __riscv_vfmul_vf_f32m1 (f2, 2.0f, vl);
    t4 = __riscv_vfsub_vv_f32m1 (sa, s, vl);
    f0 = __riscv_vfmul_vv_f32m1 (t3, t4, vl);
    f1 = __riscv_vfsub_vv_f32m1 (t2, f0, vl);
    // Lanes with 2*d < da take 2*s*d (t1); others sa*da - 2*(da-d)*(sa-s).
    return __riscv_vmerge_vvm_f32m1 (f1, t1, vb, vl);
}

/*
 * Darken
 *
 * ad * as * B(d/ad, s/as)
 *   = ad * as * MIN(d/ad, s/as)
 *   = MIN (as * d, ad * s)
 */

/* Per-channel Darken blend: element-wise min of s*da and d*sa. */
static force_inline vfloat32m1_t
rvv_blend_darken_float (const vfloat32m1_t sa,
			const vfloat32m1_t s,
			const vfloat32m1_t da,
			const vfloat32m1_t d,
			size_t             vl)
{
    vfloat32m1_t ss, dd;
    vbool32_t    vb;
    ss = __riscv_vfmul_vv_f32m1 (da, s, vl);
    dd = __riscv_vfmul_vv_f32m1 (sa, d, vl);
    vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl);
    // Where ss > dd pick dd, else ss: the minimum of the two.
    return __riscv_vmerge_vvm_f32m1 (ss, dd, vb, vl);
}

/*
 * Lighten
 *
 * ad * as * B(d/ad, s/as)
 *   = ad * as * MAX(d/ad, s/as)
 *   = MAX (as * d, ad * s)
 */

/* Per-channel Lighten blend: element-wise max of s*da and d*sa. */
static force_inline vfloat32m1_t
rvv_blend_lighten_float (const vfloat32m1_t sa,
			 const vfloat32m1_t s,
			 const vfloat32m1_t da,
			 const vfloat32m1_t d,
			 size_t             vl)
{
    vfloat32m1_t ss, dd;
    vbool32_t    vb;
    ss = __riscv_vfmul_vv_f32m1 (s, da, vl);
    dd = __riscv_vfmul_vv_f32m1 (d, sa, vl);
    vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl);
    // Where ss > dd pick ss, else dd: the maximum of the two.
    return __riscv_vmerge_vvm_f32m1 (dd, ss, vb, vl);
}

/*
 * Color dodge
 *
 * ad * as * B(d/ad, s/as)
 *   = if d/ad = 0
 *       ad * as * 0
 *     else if (d/ad >= (1 - s/as)
 *       ad * as * 1
 *     else
 *       ad * as * ((d/ad) / (1 - s/as))
 *   = if d = 0
 *       0
 *     elif as * d >= ad * (as - s)
 *       ad * as
 *     else
 *       as * (as * d / (as - s))
 *
 */

/* Per-channel Color-dodge blend; the three cases of the derivation are
 * selected with lane masks (d == 0, sa - s != 0, and the clamp test). */
static force_inline vfloat32m1_t
rvv_blend_color_dodge_float (const vfloat32m1_t sa,
			     const vfloat32m1_t s,
			     const vfloat32m1_t da,
			     const vfloat32m1_t d,
			     size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4;
    vbool32_t    is_d_zero, vb, is_t0_non_zero;

    is_d_zero = __riscv_vmfeq_vf_f32m1_b32 (d, 0.0f, vl);

    t0 = __riscv_vfsub_vv_f32m1 (sa, s, vl);  // sa - s
    t1 = __riscv_vfmul_vv_f32m1 (sa, d, vl);  // d * sa
    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl); // sa * da
    t3 = __riscv_vfsub_vv_f32m1 (t2, __riscv_vfmul_vv_f32m1 (s, da, vl),
				 vl); // sa * da - s * da

    is_t0_non_zero = __riscv_vmfne_vf_f32m1_b32 (t0, 0.0f, vl);
    vb = __riscv_vmflt_vv_f32m1_b32 (t3, t1, vl);
    t4 = __riscv_vfdiv_vv_f32m1 (__riscv_vfmul_vv_f32m1 (sa, t1, vl), t0,
				 vl); // sa * sa * d / (sa - s);

    return __riscv_vfmerge_vfm_f32m1 (
	__riscv_vmerge_vvm_f32m1 (
	    __riscv_vmerge_vvm_f32m1 (t2, t4, is_t0_non_zero, vl), t2, vb, vl),
	0.0f, is_d_zero, vl);
}

/*
 * Color burn
 *
 * We modify the first clause "if d = 1" to "if d >= 1" since with
 * premultiplied colors d > 1 can actually happen.
 *
 * ad * as * B(d/ad, s/as)
 *   = if d/ad >= 1
 *       ad * as * 1
 *     elif (1 - d/ad) >= s/as
 *       ad * as * 0
 *     else
 *       ad * as * (1 - ((1 - d/ad) / (s/as)))
 *   = if d >= ad
 *       ad * as
 *     elif as * ad - as * d >= ad * s
 *       0
 *     else
 *       ad * as - as * as * (ad - d) / s
 */

/* Per-channel Color-burn blend; the division-by-zero lanes (s == 0)
 * and the saturation lanes (d >= da) are patched in with merges. */
static force_inline vfloat32m1_t
rvv_blend_color_burn_float (const vfloat32m1_t sa,
			    const vfloat32m1_t s,
			    const vfloat32m1_t da,
			    const vfloat32m1_t d,
			    size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7;
    vbool32_t    is_d_ge_da, is_s_zero, vb;

    is_d_ge_da = __riscv_vmfge_vv_f32m1_b32 (d, da, vl);
    is_s_zero = __riscv_vmfeq_vf_f32m1_b32 (s, 0.0f, vl);

    t0 = __riscv_vfmul_vv_f32m1 (sa, __riscv_vfsub_vv_f32m1 (da, d, vl),
				 vl); // sa * (da - d)
    t1 = __riscv_vfsub_vv_f32m1 (da, __riscv_vfdiv_vv_f32m1 (t0, s, vl),
				 vl); // da - sa * (da - d) / s)
    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl); // sa * da
    t3 = __riscv_vfmul_vv_f32m1 (sa, t1, vl); // sa * (da - sa * (da - d) / s)
    t4 = __riscv_vfmul_vv_f32m1 (s, da, vl);  // s * da
    vb = __riscv_vmfge_vf_f32m1_b32 (__riscv_vfsub_vv_f32m1 (t0, t4, vl), 0.0f,
				     vl); // if (sa * (da - d) - s * da >= 0.0f)

    t6 = __riscv_vfmerge_vfm_f32m1 (t3, 0.0f, is_s_zero, vl);
    t5 = __riscv_vfmerge_vfm_f32m1 (t6, 0.0f, vb, vl);
    t7 = __riscv_vmerge_vvm_f32m1 (t5, t2, is_d_ge_da, vl);

    return t7;
}

/*
 * Hard light
 *
 * ad * as * B(d/ad, s/as)
 *   = if (s/as <= 0.5)
 *       ad * as * Multiply (d/ad, 2 * s/as)
 *     else
 *       ad * as * Screen (d/ad, 2 * s/as - 1)
 *   = if 2 * s <= as
 *       ad * as * d/ad * 2 * s / as
 *     else
 *       ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1))
 *   = if 2 * s <= as
 *       2 * s * d
 *     else
 *       as * ad - 2 * (ad - d) * (as - s)
 */

/* Per-channel Hard-light blend (Overlay with the roles of s and d
 * swapped in the mask condition). */
static force_inline vfloat32m1_t
rvv_blend_hard_light_float (const vfloat32m1_t sa,
			    const vfloat32m1_t s,
			    const vfloat32m1_t da,
			    const vfloat32m1_t d,
			    size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4;
    vbool32_t    vb;
    t0 = __riscv_vfadd_vv_f32m1 (s, s, vl);
    t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl);
    vb = __riscv_vmfgt_vv_f32m1_b32 (t0, sa, vl);
    t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl);
    t3 = __riscv_vfmul_vf_f32m1 (__riscv_vfsub_vv_f32m1 (da, d, vl), 2.0f, vl);
    t4 = __riscv_vfsub_vv_f32m1 (sa, s, vl);
    // Lanes with 2*s > sa take sa*da - 2*(da-d)*(sa-s); others 2*s*d.
    return __riscv_vmerge_vvm_f32m1 (
	t1,
	__riscv_vfsub_vv_f32m1 (t2, __riscv_vfmul_vv_f32m1 (t3, t4, vl), vl),
	vb, vl);
}

/*
 * Soft light
 *
 * ad * as * B(d/ad, s/as)
 *   = if (s/as <= 0.5)
 *       ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad))
 *     else if (d/ad <= 0.25)
 *       ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad))
 *     else
 *       ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad))
 *   = if (2 * s <= as)
 *       d * as - d * (ad - d) * (as - 2 * s) / ad;
 *     else if (4 * d <= ad)
 *       (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
 *     else
 *       d * as + (sqrt (d * ad) - d) * (2 * s - as);
 */

/* Per-channel Soft-light blend; all three polynomial/sqrt branches are
 * evaluated and the result is selected per lane with nested merges.
 * Lanes with da == 0 fall back to d * sa. */
static force_inline vfloat32m1_t
rvv_blend_soft_light_float (const vfloat32m1_t sa,
			    const vfloat32m1_t s,
			    const vfloat32m1_t da,
			    const vfloat32m1_t d,
			    size_t             vl)
{
    vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13;
    vbool32_t    is_sa_lt_2s, is_da_ls_4d, is_da_non_zero;
    is_da_non_zero = __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl);
    t0 = __riscv_vfadd_vv_f32m1 (s, s, vl); // 2 * s
    is_sa_lt_2s = __riscv_vmflt_vv_f32m1_b32 (sa, t0, vl);
    t1 = __riscv_vfmul_vv_f32m1 (sa, d, vl);  // d * sa
    t2 = __riscv_vfsub_vv_f32m1 (sa, t0, vl); // (sa - 2*s)
    t3 = __riscv_vfmul_vv_f32m1 (d, t2, vl);  // (sa - 2*s) * d
    t7 = __riscv_vfdiv_vv_f32m1 (__riscv_vfmul_vf_f32m1 (d, 16.0f, vl), da,
				 vl); // 16 * d / da
    t8 = __riscv_vfmul_vv_f32m1 (d, __riscv_vfsub_vf_f32m1 (t7, 12.0f, vl),
				 vl); // (16 * d / da - 12) * d
    t9 = __riscv_vfadd_vf_f32m1 (__riscv_vfdiv_vv_f32m1 (t8, da, vl), 3.0f,
				 vl); // (16 * d / da - 12) * d / da + 3)
    t4 = __riscv_vfmul_vv_f32m1 (
	t3, t9, vl); // (sa - 2*s) * d * ((16 * d / da - 12) * d / da + 3)
    t5 = __riscv_vfsub_vv_f32m1 (
	t1, t4,
	vl); // d * sa - (sa - 2*s) * d * ((16 * d / da - 12) * d / da + 3)
    t6 = __riscv_vfadd_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl),
				 __riscv_vfadd_vv_f32m1 (d, d, vl), vl);
    is_da_ls_4d = __riscv_vmflt_vv_f32m1_b32 (da, t6, vl);
    t10 = __riscv_vfsub_vv_f32m1 (
	__riscv_vfsqrt_v_f32m1 (__riscv_vfmul_vv_f32m1 (d, da, vl), vl), d,
	vl); // sqrtf (d * da) - d
    t11 = __riscv_vfmul_vv_f32m1 (t2, t10,
				  vl); // (sqrtf (d * da) - d) * (sa - 2 * s)
    t12 = __riscv_vfsub_vv_f32m1 (
	t1, t11, vl); // d * sa - (sqrtf (d * da) - d) * (sa - 2 * s)
    // d * sa - d * (da - d) * (sa - 2 * s) / da
    t13 = __riscv_vfsub_vv_f32m1 (
	t1,
	__riscv_vfdiv_vv_f32m1 (
	    __riscv_vfmul_vv_f32m1 (__riscv_vfmul_vv_f32m1 (d, t2, vl),
				    __riscv_vfsub_vv_f32m1 (da, d, vl), vl),
	    da, vl),
	vl);
    return __riscv_vmerge_vvm_f32m1 (
	t1, // if (!FLOAT_IS_ZERO (da))
	__riscv_vmerge_vvm_f32m1 (
	    t13, // if (4 * d > da)
	    __riscv_vmerge_vvm_f32m1 (t5, t12, is_da_ls_4d, vl), is_sa_lt_2s,
	    vl),
	is_da_non_zero, vl);
}

/*
 * Difference
 *
 * ad * as * B(s/as, d/ad)
 *   = ad * as * abs (s/as - d/ad)
 *   = if (s/as <= d/ad)
 *       ad * as * (d/ad - s/as)
 *     else
 *       ad * as * (s/as - d/ad)
 *   = if (ad * s <= as * d)
 *       as * d - ad * s
 *     else
 *       ad * s - as * d
 */

/* Per-channel Difference blend: |s*da - d*sa| computed with a compare
 * and a merge of the two subtraction orders. */
static force_inline vfloat32m1_t
rvv_blend_difference_float (const vfloat32m1_t sa,
			    const vfloat32m1_t s,
			    const vfloat32m1_t da,
			    const vfloat32m1_t d,
			    size_t             vl)
{
    vfloat32m1_t dsa, sda;
    vbool32_t    vb;
    dsa = __riscv_vfmul_vv_f32m1 (d, sa, vl);
    sda
	= __riscv_vfmul_vv_f32m1 (s, da, vl);
    vb = __riscv_vmflt_vv_f32m1_b32 (sda, dsa, vl);
    return __riscv_vmerge_vvm_f32m1 (__riscv_vfsub_vv_f32m1 (sda, dsa, vl),
				     __riscv_vfsub_vv_f32m1 (dsa, sda, vl), vb,
				     vl);
}

/*
 * Exclusion
 *
 * ad * as * B(s/as, d/ad)
 *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as)
 *   = as * d + ad * s - 2 * s * d
 */

/* Per-channel Exclusion blend: s*da + d*sa - 2*s*d. */
static force_inline vfloat32m1_t
rvv_blend_exclusion_float (const vfloat32m1_t sa,
			   const vfloat32m1_t s,
			   const vfloat32m1_t da,
			   const vfloat32m1_t d,
			   size_t             vl)
{
    vfloat32m1_t t0, t1;
    t0 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl), s, vl);
    t1 = __riscv_vfadd_vv_f32m1 (__riscv_vfmul_vv_f32m1 (s, da, vl),
				 __riscv_vfmul_vv_f32m1 (d, sa, vl), vl);
    return __riscv_vfsub_vv_f32m1 (t1, t0, vl);
}

/* Per-channel combine callback: given source/dest alphas (sa, da) and
 * channel values (s, d) for vl lanes, produce the combined channel. */
typedef vfloat32m1_t (*rvv_combine_channel_float_t) (const vfloat32m1_t sa,
						     const vfloat32m1_t s,
						     const vfloat32m1_t da,
						     const vfloat32m1_t d,
						     size_t             vl);

/*
 * Generic driver for all float combiners: walks n_pixels ARGB pixels
 * (4 floats each) using segment loads/stores, applying combine_a to the
 * alpha channel and combine_c to each color channel.  Three paths:
 * no mask, component-alpha mask (per-channel mask), and unified mask
 * (mask alpha only, strided load).
 */
static force_inline void
rvv_combine_inner_float (pixman_bool_t               component,
			 float                      *dest,
			 const float                *src,
			 const float                *mask,
			 int                         n_pixels,
			 rvv_combine_channel_float_t combine_a,
			 rvv_combine_channel_float_t combine_c)
{
    float *__restrict__ pd = dest;
    const float *__restrict__ ps = src;
    const float *__restrict__ pm = mask;

    const int component_count = 4;
    int vn = component_count * n_pixels;
    int vl = 0;
    int vl_step = 0;

    // Byte stride between successive mask-alpha values (one ARGB pixel).
    const ptrdiff_t stride = component_count * sizeof (float);

    vfloat32m1x4_t sa_sr_sg_sb, da_dr_dg_db, ma_mr_mg_mb;
    vfloat32m1_t da2, dr2, dg2, db2, ma2, mr2, mg2, mb2, sr2, sg2, sb2, sa2;

    if (n_pixels == 0)
    {
	return;
    }

    if (!mask)
    {
	for (; vn > 0; vn -= vl_step, pd += vl_step, ps += vl_step)
	{
	    vl = __riscv_vsetvl_e32m1 (vn / component_count);
	    sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
	    da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);

	    da2 = combine_a (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);

	    dr2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);

	    dg2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);

	    db2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
			     __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);

	    __riscv_vsseg4e32_v_f32m1x4 (
		pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);

	    vl_step = vl * component_count;
	}
    }
    else
    {
	if (component)
	{
	    // Component-alpha path: source is multiplied by the per-channel
	    // mask, and the mask channels by the source alpha.
	    for (; vn > 0;
		 vn -= vl_step, pd += vl_step, ps += vl_step, pm += vl_step)
	    {
		vl = __riscv_vsetvl_e32m1 (vn / component_count);

		sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
		da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
		ma_mr_mg_mb = __riscv_vlseg4e32_v_f32m1x4 (pm, vl);

		sr2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1),
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 1), vl);

		sg2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2),
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 2), vl);

		sb2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3),
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 3), vl);

		ma2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		mr2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 1),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		mg2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 2),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		mb2 = __riscv_vfmul_vv_f32m1 (
		    __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 3),
		    __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);

		da2 = combine_a (
		    ma2, ma2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);

		dr2 = combine_c (
		    mr2, sr2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);

		dg2 = combine_c (
		    mg2, sg2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);

		db2 = combine_c (
		    mb2, sb2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);

		__riscv_vsseg4e32_v_f32m1x4 (
		    pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);

		vl_step = vl * component_count;
	    }
	}
	else
	{
	    // Unified-mask path: only the mask alpha is used (strided load),
	    // scaling every source channel.
	    for (; vn > 0;
		 vn -= vl_step, pd += vl_step, ps += vl_step, pm += vl_step)
	    {
		vl = __riscv_vsetvl_e32m1 (vn / component_count);

		sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
		da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
		ma2 = __riscv_vlse32_v_f32m1 (pm, stride, vl);

		sa2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
		sr2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1), vl);
		sg2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2), vl);
		sb2 = __riscv_vfmul_vv_f32m1 (
		    ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3), vl);

		ma2 = sa2;

		dr2 = combine_c (
		    ma2, sr2,
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);

		dg2 = combine_c (
		    ma2, sg2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);

		db2 = combine_c (
		    ma2, sb2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);

		da2 = combine_a (
		    ma2, sa2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
		    __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);

		__riscv_vsseg4e32_v_f32m1x4 (
		    pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);

		vl_step = vl * component_count;
	    }
	}
    }
}

/* Instantiate one pixman float combiner entry point that forwards to
 * rvv_combine_inner_float with the given alpha/color callbacks. */
#define RVV_MAKE_COMBINER(name, component, combine_a, combine_c)               \
    static void rvv_combine_##name##_float (                                   \
	pixman_implementation_t *imp, pixman_op_t op, float *dest,             \
	const float *src, const float *mask, int n_pixels)                     \
    {                                                                          \
	rvv_combine_inner_float (component, dest, src, mask, n_pixels,         \
				 combine_a, combine_c);                        \
    }

/* Instantiate both the component-alpha (_ca) and unified (_u) variants. */
#define RVV_MAKE_COMBINERS(name, combine_a, combine_c)                         \
    RVV_MAKE_COMBINER (name##_ca, TRUE, combine_a, combine_c)                  \
    RVV_MAKE_COMBINER (name##_u, FALSE, combine_a, combine_c)

/* Vector version of the Porter-Duff factor table: returns, per lane,
 * the blend factor selected by `factor` as a function of the source
 * and destination alphas.  The *_OVER_* factors clamp to [0, 1] and
 * special-case zero denominators with lane merges. */
static force_inline vfloat32m1_t
rvv_get_factor_float (combine_factor_t factor,
		      vfloat32m1_t     sa,
		      vfloat32m1_t     da,
		      size_t           vl)
{
    vfloat32m1_t vone = __riscv_vfmv_v_f_f32m1 (1.0f, vl);
    vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1 (0.0f, vl);

    switch (factor)
    {
    case ZERO:
	return vzero;

    case ONE:
	return vone;

    case SRC_ALPHA:
	return sa;

    case DEST_ALPHA:
	return da;

    case INV_SA:
	return __riscv_vfsub_vv_f32m1 (vone, sa, vl);

    case INV_DA:
	return __riscv_vfsub_vv_f32m1 (vone, da, vl);

    case SA_OVER_DA:
	return __riscv_vmerge_vvm_f32m1 (
	    vone,
	    __riscv_vfmin_vv_f32m1 (
		vone,
		__riscv_vfmax_vv_f32m1 (
		    vzero,
		    __riscv_vfdiv_vv_f32m1 (sa, da, vl), vl),
		vl),
	    __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);

    case DA_OVER_SA:
	return __riscv_vmerge_vvm_f32m1 (
	    __riscv_vfmin_vv_f32m1 (
		vone,
		__riscv_vfmax_vv_f32m1 (
		    vzero, __riscv_vfdiv_vv_f32m1 (da, sa, vl), vl),
		vl),
	    vone, __riscv_vmfeq_vf_f32m1_b32 (sa, 0.0f, vl), vl);

    case INV_SA_OVER_DA:
    {
	vfloat32m1_t t0 = __riscv_vfdiv_vv_f32m1 (
	    __riscv_vfsub_vv_f32m1 (vone, sa, vl), da, vl);
	return __riscv_vmerge_vvm_f32m1 (
	    vone,
	    __riscv_vfmin_vv_f32m1 (
		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
	    __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
    }

    case INV_DA_OVER_SA:
    {
	vfloat32m1_t t0 = __riscv_vfdiv_vv_f32m1 (
	    __riscv_vfsub_vv_f32m1 (vone, da, vl), sa, vl);
	return __riscv_vmerge_vvm_f32m1 (
	    vone,
	    __riscv_vfmin_vv_f32m1 (
		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
	    __riscv_vmfne_vf_f32m1_b32 (sa, 0.0f, vl), vl);
    }

    case ONE_MINUS_SA_OVER_DA:
    {
	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
	    vone, __riscv_vfdiv_vv_f32m1 (sa, da, vl), vl);
	return __riscv_vmerge_vvm_f32m1 (
	    vzero,
	    __riscv_vfmin_vv_f32m1 (
		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
	    __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
    }

    case ONE_MINUS_DA_OVER_SA:
    {
	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
	    vone, __riscv_vfdiv_vv_f32m1 (da, sa, vl), vl);
	return __riscv_vmerge_vvm_f32m1 (
	    vzero,
	    __riscv_vfmin_vv_f32m1 (
		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
	    __riscv_vmfne_vf_f32m1_b32 (sa, 0.0f, vl), vl);
    }

    case ONE_MINUS_INV_DA_OVER_SA:
    {
	// "sa is (almost) zero" test: FLT_MIN > sa > -FLT_MAX.
	vbool32_t is_zero = __riscv_vmand_mm_b32 (
	    __riscv_vmflt_vf_f32m1_b32 (sa, FLT_MIN, vl),
	    __riscv_vmfgt_vf_f32m1_b32 (sa, -FLT_MAX, vl), vl);
	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
	    vone,
	    __riscv_vfdiv_vv_f32m1 (
		__riscv_vfsub_vv_f32m1 (vone, da, vl), sa, vl),
	    vl);
	return __riscv_vmerge_vvm_f32m1 (
	    __riscv_vfmin_vv_f32m1 (
		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
	    vzero, is_zero, vl);
    }

    case ONE_MINUS_INV_SA_OVER_DA:
    {
	vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
	    vone,
	    __riscv_vfdiv_vv_f32m1 (
		__riscv_vfsub_vv_f32m1 (vone, sa, vl), da, vl),
	    vl);
	return __riscv_vmerge_vvm_f32m1 (
	    __riscv_vfmin_vv_f32m1 (
		vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
	    vzero, __riscv_vmfeq_vf_f32m1_b32 (da, 0.0f, vl), vl);
    }
    }

    // Unreachable for valid factors; sentinel for an unknown enum value.
    return __riscv_vfmv_v_f_f32m1 (-1.0f, vl);
}

/* Porter-Duff combiner: dest = min(1, s * factor_a + d * factor_b),
 * with the factors taken from rvv_get_factor_float. */
#define RVV_MAKE_PD_COMBINERS(name, a, b)                                      \
    static vfloat32m1_t force_inline rvv_pd_combine_##name##_float (           \
	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
	size_t vl)                                                             \
    {                                                                          \
	const vfloat32m1_t fa = rvv_get_factor_float (a, sa, da, vl);          \
	const vfloat32m1_t fb = rvv_get_factor_float (b, sa, da, vl);          \
	vfloat32m1_t t0 = __riscv_vfadd_vv_f32m1 (                             \
	    __riscv_vfmul_vv_f32m1 (s, fa, vl),                                \
	    __riscv_vfmul_vv_f32m1 (d, fb, vl), vl);                           \
	return __riscv_vfmin_vv_f32m1 (__riscv_vfmv_v_f_f32m1 (1.0f, vl), t0,  \
				       vl);                                    \
    }                                                                          \
									       \
    RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name##_float,                   \
			rvv_pd_combine_##name##_float)

RVV_MAKE_PD_COMBINERS (clear, ZERO, ZERO)
RVV_MAKE_PD_COMBINERS (src, ONE, ZERO)
RVV_MAKE_PD_COMBINERS (dst, ZERO, ONE)
RVV_MAKE_PD_COMBINERS (over, ONE, INV_SA)
RVV_MAKE_PD_COMBINERS (over_reverse, INV_DA, ONE)
RVV_MAKE_PD_COMBINERS (in, DEST_ALPHA, ZERO)
RVV_MAKE_PD_COMBINERS (in_reverse, ZERO, SRC_ALPHA)
RVV_MAKE_PD_COMBINERS (out, INV_DA, ZERO)
RVV_MAKE_PD_COMBINERS (out_reverse, ZERO, INV_SA)
RVV_MAKE_PD_COMBINERS (atop, DEST_ALPHA, INV_SA)
RVV_MAKE_PD_COMBINERS (atop_reverse, INV_DA, SRC_ALPHA)
RVV_MAKE_PD_COMBINERS (xor, INV_DA, INV_SA)
RVV_MAKE_PD_COMBINERS (add, ONE, ONE)

RVV_MAKE_PD_COMBINERS (saturate, INV_DA_OVER_SA,
		       ONE)

RVV_MAKE_PD_COMBINERS (disjoint_clear, ZERO, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_src, ONE, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_dst, ZERO, ONE)
RVV_MAKE_PD_COMBINERS (disjoint_over, ONE, INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_over_reverse, INV_DA_OVER_SA, ONE)
RVV_MAKE_PD_COMBINERS (disjoint_in, ONE_MINUS_INV_DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_in_reverse, ZERO, ONE_MINUS_INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_out, INV_DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (disjoint_out_reverse, ZERO, INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_atop, ONE_MINUS_INV_DA_OVER_SA, INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_atop_reverse,
		       INV_DA_OVER_SA,
		       ONE_MINUS_INV_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (disjoint_xor, INV_DA_OVER_SA, INV_SA_OVER_DA)

RVV_MAKE_PD_COMBINERS (conjoint_clear, ZERO, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_src, ONE, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_dst, ZERO, ONE)
RVV_MAKE_PD_COMBINERS (conjoint_over, ONE, ONE_MINUS_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_over_reverse, ONE_MINUS_DA_OVER_SA, ONE)
RVV_MAKE_PD_COMBINERS (conjoint_in, DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_in_reverse, ZERO, SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_out, ONE_MINUS_DA_OVER_SA, ZERO)
RVV_MAKE_PD_COMBINERS (conjoint_out_reverse, ZERO, ONE_MINUS_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_atop, DA_OVER_SA, ONE_MINUS_SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_atop_reverse, ONE_MINUS_DA_OVER_SA, SA_OVER_DA)
RVV_MAKE_PD_COMBINERS (conjoint_xor, ONE_MINUS_DA_OVER_SA, ONE_MINUS_SA_OVER_DA)

/* Separable PDF (blend-mode) combiner: alpha is da + sa - da*sa, and
 * each color channel adds the blend result to
 * (1 - sa) * d + (1 - da) * s (the f term below, negated factors). */
#define RVV_MAKE_SEPARABLE_PDF_COMBINERS(name)                                 \
    static force_inline vfloat32m1_t rvv_combine_##name##_a (                  \
	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
	size_t vl)                                                             \
    {                                                                          \
	return __riscv_vfsub_vv_f32m1 (__riscv_vfadd_vv_f32m1 (da, sa, vl),    \
				       __riscv_vfmul_vv_f32m1 (da, sa, vl),    \
				       vl);                                    \
    }                                                                          \
									       \
    static force_inline vfloat32m1_t rvv_combine_##name##_c (                  \
	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
	size_t vl)                                                             \
    {                                                                          \
	vfloat32m1_t f = __riscv_vfmul_vf_f32m1 (                              \
	    __riscv_vfadd_vv_f32m1 (                                           \
		__riscv_vfmul_vv_f32m1 (__riscv_vfsub_vf_f32m1 (sa, 1.0f, vl), \
					d, vl),                                \
		__riscv_vfmul_vv_f32m1 (__riscv_vfsub_vf_f32m1 (da, 1.0f, vl), \
					s, vl),                                \
		vl),                                                           \
	    -1.0f, vl);                                                        \
									       \
	return __riscv_vfadd_vv_f32m1 (                                        \
	    f, rvv_blend_##name##_float (sa, s, da, d, vl), vl);               \
    }                                                                          \
									       \
    RVV_MAKE_COMBINERS (name, rvv_combine_##name##_a, rvv_combine_##name##_c)

RVV_MAKE_SEPARABLE_PDF_COMBINERS (multiply)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (screen)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (overlay)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (darken)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (lighten)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (color_dodge)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (color_burn)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (hard_light)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (soft_light)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (difference)
RVV_MAKE_SEPARABLE_PDF_COMBINERS (exclusion)

// int implementation

// pixman-combine32.h RVV implementation plus some convenience functions {

/*
 * x_c = min(x_c + y_c, 255)
 */

#define rvv_UN8_ADD_UN8_vv(x, y, vl) __riscv_vsaddu (x, y, vl)

#define rvv_UN8x4_ADD_UN8x4_vv_m4(x, y, vl)                                    \
    RVV_U8x4_U32_m4 (rvv_UN8_ADD_UN8_vv (RVV_U32_U8x4_m4 (x),                  \
					 RVV_U32_U8x4_m4 (y), (vl) * 4))

/*
 * x_c = (x_c * a_c) / 255
 */

/* Generator for rvv_UN8_MUL_UN8_vv_{m1,m2,m4}: widening multiply plus
 * the rounding division by 255 via ONE_HALF/G_SHIFT (constants from
 * pixman-combine32.h). */
#define __rvv_UN8_MUL_UN8_vv(LMUL, LMUL16)                                     \
    static force_inline VUINT8 (LMUL) rvv_UN8_MUL_UN8_vv_##LMUL (              \
	const VUINT8 (LMUL) x, const VUINT8 (LMUL) a, size_t vl)               \
    {                                                                          \
	VUINT16 (LMUL16)                                                       \
	mul_higher = __riscv_vwmaccu (                                         \
	    __riscv_vmv_v_x_u16##LMUL16 (ONE_HALF, vl), x, a, vl);             \
									       \
	VUINT16 (LMUL16)                                                       \
	mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);                    \
									       \
	return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl),        \
			      G_SHIFT, vl);                                    \
    }
__rvv_UN8_MUL_UN8_vv (m1, m2);
__rvv_UN8_MUL_UN8_vv (m2, m4);
__rvv_UN8_MUL_UN8_vv (m4, m8);

/* Vector x scalar variant of the UN8 multiply: (x * a) / 255, rounded. */
static force_inline vuint8m4_t
rvv_UN8_MUL_UN8_vx_m4 (const vuint8m4_t x, const uint8_t a, size_t vl)
{
    vuint16m8_t mul_higher = __riscv_vwmaccu (
	__riscv_vmv_v_x_u16m8 (ONE_HALF, vl), a, x, vl);
    vuint16m8_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);

    return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT,
			  vl);
}

#define __rvv_UN8x4_MUL_UN8x4_vv(LMUL, x, a, vl)                               \
    RVV_U8x4_U32 (LMUL, rvv_UN8_MUL_UN8_vv_##LMUL (RVV_U32_U8x4 (LMUL, x),     \
						   RVV_U32_U8x4 (LMUL, a),     \
						   (vl) * 4))
#define rvv_UN8x4_MUL_UN8x4_vv_m2(x, a, vl)                                    \
    __rvv_UN8x4_MUL_UN8x4_vv (m2, x, a, vl)
#define rvv_UN8x4_MUL_UN8x4_vv_m4(x, a, vl)                                    \
    __rvv_UN8x4_MUL_UN8x4_vv (m4, x, a, vl)

/*
 * a_c = a (broadcast to all components)
 */

/* Generator: replicate a 16-bit-held alpha into all four bytes of a
 * u32 lane (a | a<<8, then | that << 16). */
#define __rvv_UN16_bcast_UN8x4_v(LMUL, LMUL16)                                 \
    static force_inline VUINT32 (LMUL)                                         \
    rvv_UN16_bcast_UN8x4_v_##LMUL (const VUINT16 (LMUL16) a, size_t vl)        \
    {                                                                          \
	VUINT32 (LMUL)                                                         \
	a32 = __riscv_vwcvtu_x (__riscv_vmadd (a, 1 << 8, a, vl), vl);         \
									       \
	return __riscv_vmadd (a32, 1 << 16, a32, vl);                          \
    }
__rvv_UN16_bcast_UN8x4_v (m2, m1);
__rvv_UN16_bcast_UN8x4_v (m4, m2);

#define rvv_UN8_bcast_UN8x4_v_m4(a, vl)                                        \
    rvv_UN16_bcast_UN8x4_v_m4 (__riscv_vwcvtu_x (a, vl), vl)

/*
 * x_c = (x_c * a) / 255
 */

#define rvv_UN8x4_MUL_UN8_vv_m4(x, a, vl)                                      \
    rvv_UN8x4_MUL_UN8x4_vv_m4 (x, rvv_UN8_bcast_UN8x4_v_m4 (a, vl), vl)

#define __rvv_UN8x4_MUL_UN16_vv(LMUL, x, a, vl)                                \
    rvv_UN8x4_MUL_UN8x4_vv_##LMUL (x, rvv_UN16_bcast_UN8x4_v_##LMUL (a, vl), vl)
#define rvv_UN8x4_MUL_UN16_vv_m2(x, a, vl)                                     \
    __rvv_UN8x4_MUL_UN16_vv (m2, x, a, vl)
#define rvv_UN8x4_MUL_UN16_vv_m4(x, a, vl)                                     \
    __rvv_UN8x4_MUL_UN16_vv (m4, x, a, vl)

#define rvv_UN8x4_MUL_UN8_vx_m4(x, a, vl)                                      \
    RVV_U8x4_U32_m4 (rvv_UN8_MUL_UN8_vx_m4 (RVV_U32_U8x4_m4 (x), a, (vl) * 4))

/* Rounding division by 255 of 32-bit intermediates, result kept 32-bit. */
static force_inline vuint32m2_t
rvv_DIV_ONE_UN32m2_UN32m2_v (const vuint32m2_t x, size_t vl)
{
    vuint32m2_t mul_higher = __riscv_vadd (x, ONE_HALF, vl);
    vuint32m2_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);

    return __riscv_vsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl);
}

/* Rounding division by 255 of 32-bit intermediates, narrowed to u8. */
static force_inline vuint8m2_t
rvv_DIV_ONE_UN32m8_UN8m2_v (const vuint32m8_t x, size_t vl)
{
    vuint32m8_t mul_higher = __riscv_vadd (x, ONE_HALF, vl);
    vuint32m8_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);

    return __riscv_vncvt_x (
	__riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl),
	vl);
}

/*
 * x_c = (x_c * a) / 255 + y_c
 */

#define rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4(x, a, y, vl)                       \
    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl), y, vl)

/*
 * x_c = (x_c * a + y_c * b) / 255
 */

#define rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl)          \
    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl),            \
			       rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl)

/*
 * x_c = (x_c * a_c) / 255 + y_c
 */

#define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4(x, a, y, vl)                      \
    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl), y, vl)

/*
 * x_c = (x_c * a_c + y_c * b) / 255
 */

#define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl)         \
    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl),           \
			       rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl)

// } pixman-combine32.h

// Additional functions.

/* Extract the alpha byte (bits 31..24) of each a8r8g8b8 lane as u16. */
#define rvv_shift_alpha_u16(x, vl) __riscv_vnsrl (x, 24, vl)

#define rvv_shift_not_alpha_u16(x, vl)                                         \
    rvv_shift_alpha_u16 (__riscv_vnot (x, vl), vl)

/* Strided load of the alpha byte of each 32-bit pixel at src. */
#define rvv_load_alpha_u8m1(src, vl)                                           \
    __riscv_vlse8_v_u8m1 ((uint8_t *)src + 3, 4, vl)

#define rvv_load_not_alpha_u8m1(src, vl)                                       \
    __riscv_vnot (rvv_load_alpha_u8m1 (src, vl), vl)

#define rvv_u8m2_to_i16m4(in, vl)                                              \
    __riscv_vreinterpret_i16m4 (__riscv_vwcvtu_x (in, vl))

/* OVER operator on packed a8r8g8b8: dest * (255 - src.a) / 255 + src. */
#define rvv_over_m4(src, dest, vl)                                             \
    rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 (                                      \
	dest, rvv_shift_not_alpha_u16 (src, vl), src, vl)

#define rvv_in_m4(x, y, vl) rvv_UN8x4_MUL_UN8_vv_m4 (x, y, vl)

#define rvv_in_load_s_m_m4(src, mask, vl)                                      \
    rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl),                                \
	       rvv_load_alpha_u8m1 (mask, vl), vl)

#define rvv_in_load_s_nm_m4(src, mask, vl)                                     \
    rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl),                                \
	       rvv_load_not_alpha_u8m1 (mask, vl), vl)

/* Pack x8r8g8b8/a8r8g8b8 lanes down to r5g6b5: keep the top 5 bits of
 * red and blue (mask 0xF800F8) and the top 6 bits of green. */
static force_inline vuint16m2_t
rvv_convert_8888_to_0565_m2 (const vuint32m4_t s, size_t vl)
{
    vuint32m4_t rb = __riscv_vand (s, 0xF800F8, vl);

    return __riscv_vor (
	__riscv_vor (__riscv_vnsrl (rb, 3, vl), __riscv_vnsrl (rb, 8, vl), vl),
	__riscv_vand (__riscv_vnsrl (s, 5, vl), 0x7E0, vl), vl);
}

/* Expand r5g6b5 lanes to 0888, replicating high bits into the low bits
 * (the `| value >> width` pattern) to fill the 8-bit channels. */
static force_inline vuint32m4_t
rvv_convert_0565_to_0888_m4 (const vuint16m2_t s, size_t vl)
{
    vuint8m1_t g1, g2;
    vuint16m2_t r, g_w, b;
    vuint32m4_t r_w, rb_w;

    r = __riscv_vand (s, 0xF800, vl);
    b = __riscv_vand (s, 0x001F, vl);
    r_w = __riscv_vwmulu (r, 1 << 8, vl);
    rb_w = __riscv_vwmaccu (r_w, 1 << 3, b, vl);
    rb_w = __riscv_vand (__riscv_vor (rb_w, __riscv_vsrl (rb_w, 5, vl), vl),
			 0xFF00FF, vl);

    g1 = __riscv_vsll (__riscv_vnsrl (s, 5, vl), 2, vl);
    g2 = __riscv_vsrl (g1, 6, vl);
    g_w = __riscv_vwaddu_vv (g1, g2, vl);

    return __riscv_vwmaccu
(rb_w, 1 << 8, g_w, vl); 1106 } 1107 1108 #define rvv_convert_0565_to_8888_m4(s, vl) \ 1109 __riscv_vor (rvv_convert_0565_to_0888_m4 (s, vl), 0xff000000, vl) 1110 1111 #define __rvv_combine_mask_value_ca(LMUL, src, mask, vl) \ 1112 rvv_UN8x4_MUL_UN8x4_vv_##LMUL (src, mask, vl) 1113 #define rvv_combine_mask_value_ca_m2(src, mask, vl) \ 1114 __rvv_combine_mask_value_ca (m2, src, mask, vl) 1115 #define rvv_combine_mask_value_ca_m4(src, mask, vl) \ 1116 __rvv_combine_mask_value_ca (m4, src, mask, vl) 1117 1118 #define __rvv_combine_mask_alpha_ca(LMUL, src, mask, vl) \ 1119 rvv_UN8x4_MUL_UN16_vv_##LMUL (mask, rvv_shift_alpha_u16 (src, vl), vl) 1120 #define rvv_combine_mask_alpha_ca_m2(src, mask, vl) \ 1121 __rvv_combine_mask_alpha_ca (m2, src, mask, vl) 1122 #define rvv_combine_mask_alpha_ca_m4(src, mask, vl) \ 1123 __rvv_combine_mask_alpha_ca (m4, src, mask, vl) 1124 1125 #define __rvv_combine_mask(LMUL, src, mask, vl) \ 1126 rvv_UN8x4_MUL_UN16_vv_##LMUL (src, rvv_shift_alpha_u16 (mask, vl), vl) 1127 #define rvv_combine_mask_m2(src, mask, vl) \ 1128 __rvv_combine_mask (m2, src, mask, vl) 1129 #define rvv_combine_mask_m4(src, mask, vl) \ 1130 __rvv_combine_mask (m4, src, mask, vl) 1131 1132 #define __rvv_combine_mask_ca(LMUL) \ 1133 static force_inline void rvv_combine_mask_ca_##LMUL ( \ 1134 VUINT32 (LMUL) *__restrict__ src, VUINT32 (LMUL) *__restrict__ mask, \ 1135 size_t vl) \ 1136 { \ 1137 VUINT32 (LMUL) src_cpy = *src; \ 1138 *(src) = rvv_combine_mask_value_ca_##LMUL (*(src), *(mask), vl); \ 1139 *(mask) = rvv_combine_mask_alpha_ca_##LMUL (src_cpy, *(mask), vl); \ 1140 } 1141 __rvv_combine_mask_ca (m2); 1142 __rvv_combine_mask_ca (m4); 1143 1144 static void 1145 rvv_combine_clear (pixman_implementation_t *__restrict__ imp, 1146 pixman_op_t op, 1147 uint32_t *__restrict__ dest, 1148 const uint32_t *__restrict__ src, 1149 const uint32_t *__restrict__ mask, 1150 int width) 1151 { 1152 uint32_t *pd = dest; 1153 1154 vuint32m8_t v = __riscv_vmv_v_x_u32m8 (0, 
__riscv_vsetvlmax_e32m8 ()); 1155 RVV_FOREACH_1 (width, vl, e32m8, pd) { __riscv_vse32 (pd, v, vl); } 1156 } 1157 1158 static void 1159 rvv_combine_src_u (pixman_implementation_t *__restrict__ imp, 1160 pixman_op_t op, 1161 uint32_t *__restrict__ dest, 1162 const uint32_t *__restrict__ src, 1163 const uint32_t *__restrict__ mask, 1164 int width) 1165 { 1166 uint32_t *__restrict__ pd = dest; 1167 const uint32_t *__restrict__ ps = src; 1168 const uint32_t *__restrict__ pm = mask; 1169 1170 if (mask) 1171 { 1172 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1173 { 1174 __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pm, vl), vl); 1175 } 1176 } 1177 else 1178 { 1179 RVV_FOREACH_2 (width, vl, e32m8, ps, pd) 1180 { 1181 __riscv_vse32 (pd, __riscv_vle32_v_u32m8 (ps, vl), vl); 1182 } 1183 } 1184 } 1185 1186 static void 1187 rvv_combine_over_u (pixman_implementation_t *__restrict__ imp, 1188 pixman_op_t op, 1189 uint32_t *__restrict__ dest, 1190 const uint32_t *__restrict__ src, 1191 const uint32_t *__restrict__ mask, 1192 int width) 1193 { 1194 uint32_t *__restrict__ pd = dest; 1195 const uint32_t *__restrict__ ps = src; 1196 const uint32_t *__restrict__ pm = mask; 1197 1198 if (mask) 1199 { 1200 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1201 { 1202 __riscv_vse32 (pd, 1203 rvv_over_m4 (rvv_in_load_s_m_m4 (ps, pm, vl), 1204 __riscv_vle32_v_u32m4 (pd, vl), vl), 1205 vl); 1206 } 1207 } 1208 else 1209 { 1210 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1211 { 1212 __riscv_vse32 (pd, 1213 rvv_over_m4 (__riscv_vle32_v_u32m4 (ps, vl), 1214 __riscv_vle32_v_u32m4 (pd, vl), vl), 1215 vl); 1216 } 1217 } 1218 } 1219 1220 static void 1221 rvv_combine_over_reverse_u (pixman_implementation_t *__restrict__ imp, 1222 pixman_op_t op, 1223 uint32_t *__restrict__ dest, 1224 const uint32_t *__restrict__ src, 1225 const uint32_t *__restrict__ mask, 1226 int width) 1227 { 1228 uint32_t *__restrict__ pd = dest; 1229 const uint32_t *__restrict__ ps = src; 1230 const uint32_t *__restrict__ pm = mask; 1231 
1232 if (mask) 1233 { 1234 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1235 { 1236 __riscv_vse32 (pd, 1237 rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl), 1238 rvv_in_load_s_m_m4 (ps, pm, vl), vl), 1239 vl); 1240 } 1241 } 1242 else 1243 { 1244 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1245 { 1246 __riscv_vse32 (pd, 1247 rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl), 1248 __riscv_vle32_v_u32m4 (ps, vl), vl), 1249 vl); 1250 } 1251 } 1252 } 1253 1254 static void 1255 rvv_combine_in_u (pixman_implementation_t *__restrict__ imp, 1256 pixman_op_t op, 1257 uint32_t *__restrict__ dest, 1258 const uint32_t *__restrict__ src, 1259 const uint32_t *__restrict__ mask, 1260 int width) 1261 { 1262 uint32_t *__restrict__ pd = dest; 1263 const uint32_t *__restrict__ ps = src; 1264 const uint32_t *__restrict__ pm = mask; 1265 1266 if (mask) 1267 { 1268 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1269 { 1270 __riscv_vse32 (pd, 1271 rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl), 1272 rvv_load_alpha_u8m1 (pd, vl), vl), 1273 vl); 1274 } 1275 } 1276 else 1277 { 1278 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1279 { 1280 __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pd, vl), vl); 1281 } 1282 } 1283 } 1284 1285 static void 1286 rvv_combine_in_reverse_u (pixman_implementation_t *__restrict__ imp, 1287 pixman_op_t op, 1288 uint32_t *__restrict__ dest, 1289 const uint32_t *__restrict__ src, 1290 const uint32_t *__restrict__ mask, 1291 int width) 1292 { 1293 uint32_t *__restrict__ pd = dest; 1294 const uint32_t *__restrict__ ps = src; 1295 const uint32_t *__restrict__ pm = mask; 1296 1297 if (mask) 1298 { 1299 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1300 { 1301 __riscv_vse32 (pd, 1302 rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl), 1303 rvv_UN8_MUL_UN8_vv_m1 ( 1304 rvv_load_alpha_u8m1 (ps, vl), 1305 rvv_load_alpha_u8m1 (pm, vl), vl), 1306 vl), 1307 vl); 1308 } 1309 } 1310 else 1311 { 1312 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1313 { 1314 __riscv_vse32 (pd, rvv_in_load_s_m_m4 (pd, ps, vl), vl); 1315 } 
1316 } 1317 } 1318 1319 static void 1320 rvv_combine_out_u (pixman_implementation_t *__restrict__ imp, 1321 pixman_op_t op, 1322 uint32_t *__restrict__ dest, 1323 const uint32_t *__restrict__ src, 1324 const uint32_t *__restrict__ mask, 1325 int width) 1326 { 1327 uint32_t *__restrict__ pd = dest; 1328 const uint32_t *__restrict__ ps = src; 1329 const uint32_t *__restrict__ pm = mask; 1330 1331 if (mask) 1332 { 1333 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1334 { 1335 __riscv_vse32 (pd, 1336 rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl), 1337 rvv_load_not_alpha_u8m1 (pd, vl), vl), 1338 vl); 1339 } 1340 } 1341 else 1342 { 1343 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1344 { 1345 __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (ps, pd, vl), vl); 1346 } 1347 } 1348 } 1349 1350 static void 1351 rvv_combine_out_reverse_u (pixman_implementation_t *__restrict__ imp, 1352 pixman_op_t op, 1353 uint32_t *__restrict__ dest, 1354 const uint32_t *__restrict__ src, 1355 const uint32_t *__restrict__ mask, 1356 int width) 1357 { 1358 uint32_t *__restrict__ pd = dest; 1359 const uint32_t *__restrict__ ps = src; 1360 const uint32_t *__restrict__ pm = mask; 1361 1362 if (mask) 1363 { 1364 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1365 { 1366 __riscv_vse32 ( 1367 pd, 1368 rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl), 1369 __riscv_vnot (rvv_UN8_MUL_UN8_vv_m1 ( 1370 rvv_load_alpha_u8m1 (ps, vl), 1371 rvv_load_alpha_u8m1 (pm, vl), vl), 1372 vl), 1373 vl), 1374 vl); 1375 } 1376 } 1377 else 1378 { 1379 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1380 { 1381 __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (pd, ps, vl), vl); 1382 } 1383 } 1384 } 1385 1386 static void 1387 rvv_combine_atop_u (pixman_implementation_t *__restrict__ imp, 1388 pixman_op_t op, 1389 uint32_t *__restrict__ dest, 1390 const uint32_t *__restrict__ src, 1391 const uint32_t *__restrict__ mask, 1392 int width) 1393 { 1394 uint32_t *__restrict__ pd = dest; 1395 const uint32_t *__restrict__ ps = src; 1396 const uint32_t *__restrict__ pm = 
mask; 1397 vuint32m4_t s, d; 1398 1399 if (mask) 1400 { 1401 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1402 { 1403 s = rvv_in_load_s_m_m4 (ps, pm, vl); 1404 d = __riscv_vle32_v_u32m4 (pd, vl); 1405 __riscv_vse32 (pd, 1406 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1407 s, rvv_shift_alpha_u16 (d, vl), d, 1408 rvv_shift_not_alpha_u16 (s, vl), vl), 1409 vl); 1410 } 1411 } 1412 else 1413 { 1414 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1415 { 1416 s = __riscv_vle32_v_u32m4 (ps, vl); 1417 d = __riscv_vle32_v_u32m4 (pd, vl); 1418 __riscv_vse32 (pd, 1419 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1420 s, rvv_shift_alpha_u16 (d, vl), d, 1421 rvv_shift_not_alpha_u16 (s, vl), vl), 1422 vl); 1423 } 1424 } 1425 } 1426 1427 static void 1428 rvv_combine_atop_reverse_u (pixman_implementation_t *__restrict__ imp, 1429 pixman_op_t op, 1430 uint32_t *__restrict__ dest, 1431 const uint32_t *__restrict__ src, 1432 const uint32_t *__restrict__ mask, 1433 int width) 1434 { 1435 uint32_t *__restrict__ pd = dest; 1436 const uint32_t *__restrict__ ps = src; 1437 const uint32_t *__restrict__ pm = mask; 1438 vuint32m4_t s, d; 1439 1440 if (mask) 1441 { 1442 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1443 { 1444 s = rvv_in_load_s_m_m4 (ps, pm, vl); 1445 d = __riscv_vle32_v_u32m4 (pd, vl); 1446 __riscv_vse32 (pd, 1447 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1448 s, rvv_shift_not_alpha_u16 (d, vl), d, 1449 rvv_shift_alpha_u16 (s, vl), vl), 1450 vl); 1451 } 1452 } 1453 else 1454 { 1455 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1456 { 1457 s = __riscv_vle32_v_u32m4 (ps, vl); 1458 d = __riscv_vle32_v_u32m4 (pd, vl); 1459 __riscv_vse32 (pd, 1460 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1461 s, rvv_shift_not_alpha_u16 (d, vl), d, 1462 rvv_shift_alpha_u16 (s, vl), vl), 1463 vl); 1464 } 1465 } 1466 } 1467 1468 static void 1469 rvv_combine_xor_u (pixman_implementation_t *__restrict__ imp, 1470 pixman_op_t op, 1471 uint32_t *__restrict__ dest, 1472 const uint32_t 
*__restrict__ src, 1473 const uint32_t *__restrict__ mask, 1474 int width) 1475 { 1476 uint32_t *__restrict__ pd = dest; 1477 const uint32_t *__restrict__ ps = src; 1478 const uint32_t *__restrict__ pm = mask; 1479 vuint32m4_t s, d; 1480 1481 if (mask) 1482 { 1483 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1484 { 1485 s = rvv_in_load_s_m_m4 (ps, pm, vl); 1486 d = __riscv_vle32_v_u32m4 (pd, vl); 1487 __riscv_vse32 (pd, 1488 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1489 s, rvv_shift_not_alpha_u16 (d, vl), d, 1490 rvv_shift_not_alpha_u16 (s, vl), vl), 1491 vl); 1492 } 1493 } 1494 else 1495 { 1496 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1497 { 1498 s = __riscv_vle32_v_u32m4 (ps, vl); 1499 d = __riscv_vle32_v_u32m4 (pd, vl); 1500 __riscv_vse32 (pd, 1501 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1502 s, rvv_shift_not_alpha_u16 (d, vl), d, 1503 rvv_shift_not_alpha_u16 (s, vl), vl), 1504 vl); 1505 } 1506 } 1507 } 1508 1509 static void 1510 rvv_combine_add_u (pixman_implementation_t *__restrict__ imp, 1511 pixman_op_t op, 1512 uint32_t *__restrict__ dest, 1513 const uint32_t *__restrict__ src, 1514 const uint32_t *__restrict__ mask, 1515 int width) 1516 { 1517 uint32_t *__restrict__ pd = dest; 1518 const uint32_t *__restrict__ ps = src; 1519 const uint32_t *__restrict__ pm = mask; 1520 1521 if (mask) 1522 { 1523 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1524 { 1525 __riscv_vse32 ( 1526 pd, 1527 rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl), 1528 rvv_in_load_s_m_m4 (ps, pm, vl), vl), 1529 vl); 1530 } 1531 } 1532 else 1533 { 1534 RVV_FOREACH_2 (width, vl, e32m4, ps, pd) 1535 { 1536 __riscv_vse32 ( 1537 pd, 1538 rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl), 1539 __riscv_vle32_v_u32m4 (ps, vl), vl), 1540 vl); 1541 } 1542 } 1543 } 1544 1545 /* 1546 * Multiply 1547 * 1548 * ad * as * B(d / ad, s / as) 1549 * = ad * as * d/ad * s/as 1550 * = d * s 1551 * 1552 */ 1553 static void 1554 rvv_combine_multiply_u (pixman_implementation_t *imp, 
1555 pixman_op_t op, 1556 uint32_t *__restrict__ dest, 1557 const uint32_t *__restrict__ src, 1558 const uint32_t *__restrict__ mask, 1559 int width) 1560 { 1561 uint32_t *__restrict__ pd = dest; 1562 const uint32_t *__restrict__ ps = src; 1563 const uint32_t *__restrict__ pm = mask; 1564 1565 vuint32m4_t s, d; 1566 if (mask) 1567 { 1568 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1569 { 1570 s = rvv_in_load_s_m_m4 (ps, pm, vl); 1571 d = __riscv_vle32_v_u32m4 (pd, vl); 1572 1573 __riscv_vse32 (pd, 1574 rvv_UN8x4_ADD_UN8x4_vv_m4 ( 1575 rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), 1576 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1577 s, rvv_shift_not_alpha_u16 (d, vl), d, 1578 rvv_shift_not_alpha_u16 (s, vl), vl), 1579 vl), 1580 vl); 1581 } 1582 } 1583 else 1584 { 1585 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1586 { 1587 s = __riscv_vle32_v_u32m4 (ps, vl); 1588 d = __riscv_vle32_v_u32m4 (pd, vl); 1589 1590 __riscv_vse32 (pd, 1591 rvv_UN8x4_ADD_UN8x4_vv_m4 ( 1592 rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), 1593 rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1594 s, rvv_shift_not_alpha_u16 (d, vl), d, 1595 rvv_shift_not_alpha_u16 (s, vl), vl), 1596 vl), 1597 vl); 1598 } 1599 } 1600 } 1601 1602 static void 1603 rvv_combine_multiply_ca (pixman_implementation_t *__restrict__ imp, 1604 pixman_op_t op, 1605 uint32_t *__restrict__ dest, 1606 const uint32_t *__restrict__ src, 1607 const uint32_t *__restrict__ mask, 1608 int width) 1609 { 1610 uint32_t *__restrict__ pd = dest; 1611 const uint32_t *__restrict__ ps = src; 1612 const uint32_t *__restrict__ pm = mask; 1613 1614 vuint32m4_t s, m, d; 1615 1616 RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd) 1617 { 1618 s = __riscv_vle32_v_u32m4 (ps, vl); 1619 m = __riscv_vle32_v_u32m4 (pm, vl); 1620 rvv_combine_mask_ca_m4 (&s, &m, vl); 1621 1622 d = __riscv_vle32_v_u32m4 (pd, vl); 1623 1624 __riscv_vse32 (pd, 1625 rvv_UN8x4_ADD_UN8x4_vv_m4 ( 1626 rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 ( 1627 d, __riscv_vnot (m, vl), s, 1628 
rvv_shift_not_alpha_u16 (d, vl), vl), 1629 rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), vl), 1630 vl); 1631 } 1632 } 1633 1634 #define PDF_SEPARABLE_BLEND_MODE(name) \ 1635 static void rvv_combine_##name##_u ( \ 1636 pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest, \ 1637 const uint32_t *src, const uint32_t *mask, int width) \ 1638 { \ 1639 uint32_t *__restrict__ pd = dest; \ 1640 const uint32_t *__restrict__ ps = src; \ 1641 const uint32_t *__restrict__ pm = mask; \ 1642 \ 1643 vuint32m2_t s, d, ra, rx; \ 1644 vuint16m1_t da, sa; \ 1645 size_t vl4; \ 1646 vuint8m2_t s4, d4, sa4, isa4, da4, ida4; \ 1647 vuint32m8_t rx4; \ 1648 \ 1649 RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd) \ 1650 { \ 1651 vl4 = vl * 4; \ 1652 \ 1653 s = __riscv_vle32_v_u32m2 (ps, vl); \ 1654 if (mask) \ 1655 s = rvv_combine_mask_m2 (s, __riscv_vle32_v_u32m2 (pm, vl), \ 1656 vl); \ 1657 sa = rvv_shift_alpha_u16 (s, vl); \ 1658 \ 1659 d = __riscv_vle32_v_u32m2 (pd, vl); \ 1660 da = rvv_shift_alpha_u16 (d, vl); \ 1661 \ 1662 ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \ 1663 __riscv_vmul (sa, 0xFF, vl), \ 1664 vl), \ 1665 __riscv_vwmulu (sa, da, vl), vl); \ 1666 \ 1667 s4 = RVV_U32_U8x4_m2 (s); \ 1668 sa4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (sa, vl)); \ 1669 isa4 = __riscv_vnot (sa4, vl4); \ 1670 d4 = RVV_U32_U8x4_m2 (d); \ 1671 da4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl)); \ 1672 ida4 = __riscv_vnot (da4, vl4); \ 1673 \ 1674 rx4 = __riscv_vadd ( \ 1675 __riscv_vwaddu_vv (__riscv_vwmulu (isa4, d4, vl4), \ 1676 __riscv_vwmulu (ida4, s4, vl4), vl4), \ 1677 rvv_blend_##name##_int (d4, da4, s4, sa4, vl4), vl4); \ 1678 \ 1679 ra = __riscv_vminu (ra, 255 * 255, vl); \ 1680 rx4 = __riscv_vminu (rx4, 255 * 255, vl4); \ 1681 \ 1682 ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl); \ 1683 rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4)); \ 1684 \ 1685 __riscv_vse32 (pd, \ 1686 __riscv_vor (__riscv_vsll (ra, 24, vl), \ 1687 __riscv_vand (rx, 
0x00FFFFFF, vl), \ 1688 vl), \ 1689 vl); \ 1690 } \ 1691 } \ 1692 \ 1693 static void rvv_combine_##name##_ca ( \ 1694 pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest, \ 1695 const uint32_t *src, const uint32_t *mask, int width) \ 1696 { \ 1697 uint32_t *__restrict__ pd = dest; \ 1698 const uint32_t *__restrict__ ps = src; \ 1699 const uint32_t *__restrict__ pm = mask; \ 1700 \ 1701 vuint32m2_t s, m, d, ra, rx; \ 1702 vuint16m1_t da, sa; \ 1703 size_t vl4; \ 1704 vuint8m2_t s4, m4, d4, ixa4, da4, ida4; \ 1705 vuint32m8_t rx4; \ 1706 \ 1707 RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd) \ 1708 { \ 1709 m = __riscv_vle32_v_u32m2 (pm, vl); \ 1710 s = __riscv_vle32_v_u32m2 (ps, vl); \ 1711 rvv_combine_mask_ca_m2 (&s, &m, vl); \ 1712 sa = rvv_shift_alpha_u16 (s, vl); \ 1713 \ 1714 d = __riscv_vle32_v_u32m2 (pd, vl); \ 1715 da = rvv_shift_alpha_u16 (d, vl); \ 1716 \ 1717 ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \ 1718 __riscv_vmul (sa, 0xFF, vl), \ 1719 vl), \ 1720 __riscv_vwmulu (sa, da, vl), vl); \ 1721 \ 1722 ixa4 = RVV_U32_U8x4_m2 (__riscv_vnot (m, vl)); \ 1723 d4 = RVV_U32_U8x4_m2 (d); \ 1724 ida4 = RVV_U32_U8x4_m2 ( \ 1725 __riscv_vnot (rvv_UN16_bcast_UN8x4_v_m2 (da, vl), vl)); \ 1726 s4 = RVV_U32_U8x4_m2 (s); \ 1727 da4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl)); \ 1728 m4 = RVV_U32_U8x4_m2 (m); \ 1729 \ 1730 vl4 = vl * 4; \ 1731 rx4 = __riscv_vadd ( \ 1732 __riscv_vwaddu_vv (__riscv_vwmulu (ixa4, d4, vl4), \ 1733 __riscv_vwmulu (ida4, s4, vl4), vl4), \ 1734 rvv_blend_##name##_int (d4, da4, s4, m4, vl4), vl4); \ 1735 \ 1736 ra = __riscv_vminu (ra, 255 * 255, vl); \ 1737 rx4 = __riscv_vminu (rx4, 255 * 255, vl4); \ 1738 \ 1739 ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl); \ 1740 rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4)); \ 1741 \ 1742 __riscv_vse32 (pd, \ 1743 __riscv_vor (__riscv_vsll (ra, 24, vl), \ 1744 __riscv_vand (rx, 0x00FFFFFF, vl), \ 1745 vl), \ 1746 vl); \ 1747 } \ 1748 } 1749 1750 static 
force_inline vuint32m8_t 1751 rvv_blend_screen_int (const vuint8m2_t d, 1752 const vuint8m2_t ad, 1753 const vuint8m2_t s, 1754 const vuint8m2_t as, 1755 size_t vl) 1756 { 1757 return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl), 1758 __riscv_vwmulu (d, as, vl), vl), 1759 __riscv_vwcvtu_x (__riscv_vwmulu (s, d, vl), vl), vl); 1760 } 1761 1762 PDF_SEPARABLE_BLEND_MODE (screen) 1763 1764 static force_inline vuint32m8_t 1765 _rvv_blend_overlay_hard_light (const vuint8m2_t d, 1766 const vuint8m2_t ad, 1767 const vuint8m2_t s, 1768 const vuint8m2_t as, 1769 const vbool4_t selector, 1770 size_t vl) 1771 { 1772 vuint32m8_t out_true = __riscv_vwmulu (__riscv_vwmulu (s, d, vl), 2, vl); 1773 1774 vint16m4_t d_i = rvv_u8m2_to_i16m4 (d, vl); 1775 vint16m4_t ad_i = rvv_u8m2_to_i16m4 (ad, vl); 1776 vint16m4_t s_i = rvv_u8m2_to_i16m4 (s, vl); 1777 vint16m4_t as_i = rvv_u8m2_to_i16m4 (as, vl); 1778 1779 vuint32m8_t out_false = __riscv_vreinterpret_v_i32m8_u32m8 (__riscv_vsub ( 1780 __riscv_vwmul (as_i, ad_i, vl), 1781 __riscv_vsll (__riscv_vwmul (__riscv_vsub (ad_i, d_i, vl), 1782 __riscv_vsub (as_i, s_i, vl), vl), 1783 1, vl), 1784 vl)); 1785 1786 return __riscv_vmerge (out_false, out_true, selector, vl); 1787 } 1788 1789 static force_inline vuint32m8_t 1790 rvv_blend_overlay_int (const vuint8m2_t d, 1791 const vuint8m2_t ad, 1792 const vuint8m2_t s, 1793 const vuint8m2_t as, 1794 size_t vl) 1795 { 1796 return _rvv_blend_overlay_hard_light ( 1797 d, ad, s, as, 1798 __riscv_vmsltu (__riscv_vwmulu (d, 2, vl), __riscv_vwcvtu_x (ad, vl), 1799 vl), 1800 vl); 1801 } 1802 1803 PDF_SEPARABLE_BLEND_MODE (overlay) 1804 1805 static force_inline vuint32m8_t 1806 rvv_blend_darken_int (const vuint8m2_t d, 1807 const vuint8m2_t ad, 1808 const vuint8m2_t s, 1809 const vuint8m2_t as, 1810 size_t vl) 1811 { 1812 return __riscv_vwcvtu_x (__riscv_vminu (__riscv_vwmulu (ad, s, vl), 1813 __riscv_vwmulu (as, d, vl), vl), 1814 vl); 1815 } 1816 1817 PDF_SEPARABLE_BLEND_MODE (darken) 1818 
/* Lighten: max (as*d, ad*s). */
static force_inline vuint32m8_t
rvv_blend_lighten_int (const vuint8m2_t d,
                       const vuint8m2_t ad,
                       const vuint8m2_t s,
                       const vuint8m2_t as,
                       size_t vl)
{
    return __riscv_vwcvtu_x (__riscv_vmaxu (__riscv_vwmulu (as, d, vl),
                                            __riscv_vwmulu (ad, s, vl), vl),
                             vl);
}

PDF_SEPARABLE_BLEND_MODE (lighten)

/* Hard light: same kernel as overlay, but selecting on 2*s < as. */
static force_inline vuint32m8_t
rvv_blend_hard_light_int (const vuint8m2_t d,
                          const vuint8m2_t ad,
                          const vuint8m2_t s,
                          const vuint8m2_t as,
                          size_t vl)
{
    return _rvv_blend_overlay_hard_light (
        d, ad, s, as,
        __riscv_vmsltu (__riscv_vwmulu (s, 2, vl), __riscv_vwcvtu_x (as, vl),
                        vl),
        vl);
}

PDF_SEPARABLE_BLEND_MODE (hard_light)

/* Difference: |s*ad - d*as|, via a mask-selected subtraction. */
static force_inline vuint32m8_t
rvv_blend_difference_int (const vuint8m2_t d,
                          const vuint8m2_t ad,
                          const vuint8m2_t s,
                          const vuint8m2_t as,
                          size_t vl)
{
    vuint16m4_t das = __riscv_vwmulu (d, as, vl);
    vuint16m4_t sad = __riscv_vwmulu (s, ad, vl);

    return __riscv_vmerge (__riscv_vwsubu_vv (sad, das, vl),
                           __riscv_vwsubu_vv (das, sad, vl),
                           __riscv_vmsltu (sad, das, vl), vl);
}

PDF_SEPARABLE_BLEND_MODE (difference)

/* Exclusion: s*ad + d*as - 2*s*d. */
static force_inline vuint32m8_t
rvv_blend_exclusion_int (const vuint8m2_t d,
                         const vuint8m2_t ad,
                         const vuint8m2_t s,
                         const vuint8m2_t as,
                         size_t vl)
{
    return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl),
                                            __riscv_vwmulu (d, as, vl), vl),
                         __riscv_vwmulu (__riscv_vwmulu (d, s, vl), 2, vl), vl);
}

PDF_SEPARABLE_BLEND_MODE (exclusion)

#undef PDF_SEPARABLE_BLEND_MODE

/* Component-alpha OVER: dest = s + dest * (1 - m), with (s, m) already
 * combined by rvv_combine_mask_ca_m4. */
static void
rvv_combine_over_ca (pixman_implementation_t *__restrict__ imp,
                     pixman_op_t op,
                     uint32_t *__restrict__ dest,
                     const uint32_t *__restrict__ src,
                     const uint32_t *__restrict__ mask,
                     int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        s = __riscv_vle32_v_u32m4 (ps, vl);
        m = __riscv_vle32_v_u32m4 (pm, vl);
        rvv_combine_mask_ca_m4 (&s, &m, vl);

        __riscv_vse32 (
            pd,
            rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
                __riscv_vle32_v_u32m4 (pd, vl), __riscv_vnot (m, vl), s, vl),
            vl);
    }
}

/* Component-alpha OVER_REVERSE: dest = dest + (s*m) * (1 - dest.alpha). */
static void
rvv_combine_over_reverse_ca (pixman_implementation_t *__restrict__ imp,
                             pixman_op_t op,
                             uint32_t *__restrict__ dest,
                             const uint32_t *__restrict__ src,
                             const uint32_t *__restrict__ mask,
                             int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        d = __riscv_vle32_v_u32m4 (pd, vl);
        __riscv_vse32 (
            pd,
            rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 (
                rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (ps, vl),
                                           __riscv_vle32_v_u32m4 (pm, vl), vl),
                rvv_shift_not_alpha_u16 (d, vl), d, vl),
            vl);
    }
}

/* Component-alpha ATOP: dest = dest * (1 - m) + s * dest.alpha. */
static void
rvv_combine_atop_ca (pixman_implementation_t *__restrict__ imp,
                     pixman_op_t op,
                     uint32_t *__restrict__ dest,
                     const uint32_t *__restrict__ src,
                     const uint32_t *__restrict__ mask,
                     int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d, s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        s = __riscv_vle32_v_u32m4 (ps, vl);
        m = __riscv_vle32_v_u32m4 (pm, vl);
        rvv_combine_mask_ca_m4 (&s, &m, vl);

        d = __riscv_vle32_v_u32m4 (pd, vl);
        __riscv_vse32 (
            pd,
            rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
                d, __riscv_vnot (m, vl), s, rvv_shift_alpha_u16 (d, vl), vl),
            vl);
    }
}

/* Component-alpha XOR: dest = dest * (1 - m) + s * (1 - dest.alpha). */
static void
rvv_combine_xor_ca (pixman_implementation_t *__restrict__ imp,
                    pixman_op_t op,
                    uint32_t *__restrict__ dest,
                    const uint32_t *__restrict__ src,
                    const uint32_t *__restrict__ mask,
                    int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d, s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        s = __riscv_vle32_v_u32m4 (ps, vl);
        m = __riscv_vle32_v_u32m4 (pm, vl);
        rvv_combine_mask_ca_m4 (&s, &m, vl);

        d = __riscv_vle32_v_u32m4 (pd, vl);
        __riscv_vse32 (pd,
                       rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
                           d, __riscv_vnot (m, vl), s,
                           rvv_shift_not_alpha_u16 (d, vl), vl),
                       vl);
    }
}

/* Component-alpha ATOP_REVERSE: dest = dest * m + s * (1 - dest.alpha). */
static void
rvv_combine_atop_reverse_ca (pixman_implementation_t *__restrict__ imp,
                             pixman_op_t op,
                             uint32_t *__restrict__ dest,
                             const uint32_t *__restrict__ src,
                             const uint32_t *__restrict__ mask,
                             int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    vuint32m4_t d, s, m;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        s = __riscv_vle32_v_u32m4 (ps, vl);
        m = __riscv_vle32_v_u32m4 (pm, vl);
        rvv_combine_mask_ca_m4 (&s, &m, vl);

        d = __riscv_vle32_v_u32m4 (pd, vl);
        __riscv_vse32 (pd,
                       rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
                           d, m, s, rvv_shift_not_alpha_u16 (d, vl), vl),
                       vl);
    }
}

/* Component-alpha SRC: dest = src * mask (per component). */
static void
rvv_combine_src_ca (pixman_implementation_t *__restrict__ imp,
                    pixman_op_t op,
                    uint32_t *__restrict__ dest,
                    const uint32_t *__restrict__ src,
                    const uint32_t *__restrict__ mask,
                    int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        __riscv_vse32 (
            pd,
            rvv_combine_mask_value_ca_m4 (__riscv_vle32_v_u32m4 (ps, vl),
                                          __riscv_vle32_v_u32m4 (pm, vl), vl),
            vl);
    }
}

/* Component-alpha IN: dest = (src * mask) * dest.alpha. */
static void
rvv_combine_in_ca (pixman_implementation_t *__restrict__ imp,
                   pixman_op_t op,
                   uint32_t *__restrict__ dest,
                   const uint32_t *__restrict__ src,
                   const uint32_t *__restrict__ mask,
                   int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        __riscv_vse32 (pd,
                       rvv_in_m4 (rvv_combine_mask_value_ca_m4 (
                                      __riscv_vle32_v_u32m4 (ps, vl),
                                      __riscv_vle32_v_u32m4 (pm, vl), vl),
                                  rvv_load_alpha_u8m1 (pd, vl), vl),
                       vl);
    }
}

/* Component-alpha IN_REVERSE: dest = dest * (mask * src.alpha). */
static void
rvv_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t op,
                           uint32_t *dest,
                           const uint32_t *src,
                           const uint32_t *mask,
                           int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        __riscv_vse32 (
            pd,
            rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
                                       rvv_combine_mask_alpha_ca_m4 (
                                           __riscv_vle32_v_u32m4 (ps, vl),
                                           __riscv_vle32_v_u32m4 (pm, vl), vl),
                                       vl),
            vl);
    }
}

/* Component-alpha OUT: dest = (src * mask) * (1 - dest.alpha). */
static void
rvv_combine_out_ca (pixman_implementation_t *__restrict__ imp,
                    pixman_op_t op,
                    uint32_t *__restrict__ dest,
                    const uint32_t *__restrict__ src,
                    const uint32_t *__restrict__ mask,
                    int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        __riscv_vse32 (pd,
                       rvv_in_m4 (rvv_combine_mask_value_ca_m4 (
                                      __riscv_vle32_v_u32m4 (ps, vl),
                                      __riscv_vle32_v_u32m4 (pm, vl), vl),
                                  rvv_load_not_alpha_u8m1 (pd, vl), vl),
                       vl);
    }
}

/* Component-alpha OUT_REVERSE: dest = dest * (1 - mask * src.alpha). */
static void
rvv_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t *dest,
                            const uint32_t *src,
                            const uint32_t *mask,
                            int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        __riscv_vse32 (
            pd,
            rvv_UN8x4_MUL_UN8x4_vv_m4 (
                __riscv_vle32_v_u32m4 (pd, vl),
                __riscv_vnot_v_u32m4 (rvv_combine_mask_alpha_ca_m4 (
                                          __riscv_vle32_v_u32m4 (ps, vl),
                                          __riscv_vle32_v_u32m4 (pm, vl), vl),
                                      vl),
                vl),
            vl);
    }
}

/* Component-alpha ADD: dest = saturate (dest + src * mask). */
static void
rvv_combine_add_ca (pixman_implementation_t *__restrict__ imp,
                    pixman_op_t op,
                    uint32_t *__restrict__ dest,
                    const uint32_t *__restrict__ src,
                    const uint32_t *__restrict__ mask,
                    int width)
{
    uint32_t *__restrict__ pd = dest;
    const uint32_t *__restrict__ ps = src;
    const uint32_t *__restrict__ pm = mask;

    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
    {
        __riscv_vse32 (
            pd,
            rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
                                       rvv_combine_mask_value_ca_m4 (
                                           __riscv_vle32_v_u32m4 (ps, vl),
                                           __riscv_vle32_v_u32m4 (pm, vl), vl),
                                       vl),
            vl);
    }
}

/* Fast path src_x888_8888: copy while forcing the alpha byte to 0xff. */
static void
rvv_composite_src_x888_8888 (pixman_implementation_t *__restrict__ imp,
                             pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ src_line, *__restrict__ src;
    int32_t dst_stride, src_stride;
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
                           dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
                           src_line, 1);
    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        RVV_FOREACH_2 (width, vl, e32m8, src, dst)
        {
            __riscv_vse32 (
                dst,
                __riscv_vor (__riscv_vle32_v_u32m8 (src, vl), 0xff000000, vl),
                vl);
        }
    }
}

/* Fast path src_8888_8888: straight row-by-row vector copy. */
static void
rvv_composite_src_8888_8888 (pixman_implementation_t *__restrict__ imp,
                             pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ dst_line, *__restrict__ dst;
    uint32_t *__restrict__ src_line, *__restrict__ src;
    int32_t dst_stride, src_stride;
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
                           dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
                           src_line, 1);
    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        RVV_FOREACH_2 (width, vl, e32m8, src, dst)
        {
            __riscv_vse32 (dst, __riscv_vle32_v_u32m8 (src, vl), vl);
        }
    }
}

/* Fast path over_x888_8_8888: (src | ff000000) IN mask8, OVER dest. */
static void
rvv_composite_over_x888_8_8888 (pixman_implementation_t *__restrict__ imp,
                                pixman_composite_info_t *__restrict__ info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *__restrict__ src, *__restrict__ src_line;
    uint32_t *__restrict__ dst, *__restrict__ dst_line;
    uint8_t *__restrict__ mask, *__restrict__ mask_line;
    int32_t src_stride, mask_stride, dst_stride;
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
                           dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
                           mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
                           src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        RVV_FOREACH_3 (width, vl, e32m4, src, mask, dst)
        {
            __riscv_vse32 (
                dst,
                rvv_over_m4 (
                    rvv_in_m4 (__riscv_vor (__riscv_vle32_v_u32m4 (src, vl),
                                            0xff000000, vl),
                               __riscv_vle8_v_u8m1 (mask, vl), vl),
                    __riscv_vle32_v_u32m4 (dst, vl), vl),
                vl);
        }
    }
}

/* Fast path over_8888_8888: src OVER dest, no mask. */
static void
rvv_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
                           dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
                           src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        RVV_FOREACH_2 (width, vl, e32m4, src, dst)
        {
            __riscv_vse32 (dst,
                           rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl),
                                        __riscv_vle32_v_u32m4 (dst, vl), vl),
                           vl);
        }
    }
}

/* Fast path over_n_8_0565: solid source IN mask8, OVER r5g6b5 dest.
 * Continues beyond this point in the file. */
static void
rvv_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *__restrict__ dst_line, *__restrict__ dst;
    uint8_t *__restrict__ mask_line, *__restrict__ mask;
    int dst_stride, mask_stride;
    uint32_t src;
    vuint32m4_t vsrc;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    if (src == 0)
        return;
    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
                           dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
                           mask_line, 1);

    while
(height--) 2315 { 2316 dst = dst_line; 2317 dst_line += dst_stride; 2318 mask = mask_line; 2319 mask_line += mask_stride; 2320 2321 RVV_FOREACH_2 (width, vl, e16m2, mask, dst) 2322 { 2323 __riscv_vse16 ( 2324 dst, 2325 rvv_convert_8888_to_0565_m2 ( 2326 rvv_over_m4 ( 2327 rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl), 2328 rvv_convert_0565_to_0888_m4 ( 2329 __riscv_vle16_v_u16m2 (dst, vl), vl), 2330 vl), 2331 vl), 2332 vl); 2333 } 2334 } 2335 } 2336 2337 static void 2338 rvv_composite_over_n_8_8888 (pixman_implementation_t *imp, 2339 pixman_composite_info_t *info) 2340 { 2341 PIXMAN_COMPOSITE_ARGS (info); 2342 uint32_t *dst_line, *dst; 2343 uint8_t *mask_line, *mask; 2344 int dst_stride, mask_stride; 2345 uint32_t src; 2346 vuint32m4_t vsrc; 2347 2348 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2349 vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); 2350 2351 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2352 dst_line, 1); 2353 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, 2354 mask_line, 1); 2355 2356 while (height--) 2357 { 2358 dst = dst_line; 2359 dst_line += dst_stride; 2360 mask = mask_line; 2361 mask_line += mask_stride; 2362 2363 RVV_FOREACH_2 (width, vl, e32m4, mask, dst) 2364 { 2365 __riscv_vse32 ( 2366 dst, 2367 rvv_over_m4 ( 2368 rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl), 2369 __riscv_vle32_v_u32m4 (dst, vl), vl), 2370 vl); 2371 } 2372 } 2373 } 2374 2375 static void 2376 rvv_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, 2377 pixman_composite_info_t *info) 2378 { 2379 PIXMAN_COMPOSITE_ARGS (info); 2380 uint32_t *dst_line, *dst; 2381 uint32_t *mask_line, *mask; 2382 int dst_stride, mask_stride; 2383 uint32_t src; 2384 vuint32m4_t vsrc; 2385 2386 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2387 if (src == 0) 2388 return; 2389 vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); 2390 2391 
PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2392 dst_line, 1); 2393 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, 2394 mask_line, 1); 2395 2396 while (height--) 2397 { 2398 dst = dst_line; 2399 dst_line += dst_stride; 2400 mask = mask_line; 2401 mask_line += mask_stride; 2402 2403 RVV_FOREACH_2 (width, vl, e32m4, mask, dst) 2404 { 2405 __riscv_vse32 (dst, 2406 rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( 2407 __riscv_vle32_v_u32m4 (mask, vl), vsrc, 2408 __riscv_vle32_v_u32m4 (dst, vl), vl), 2409 vl); 2410 } 2411 } 2412 } 2413 2414 static void 2415 rvv_composite_over_n_8888_8888_ca (pixman_implementation_t *__restrict__ imp, 2416 pixman_composite_info_t *__restrict__ info) 2417 { 2418 PIXMAN_COMPOSITE_ARGS (info); 2419 uint32_t *__restrict__ dst_line, *__restrict__ dst; 2420 uint32_t *__restrict__ mask_line, *__restrict__ mask; 2421 int dst_stride, mask_stride; 2422 uint32_t src, srca; 2423 vuint32m4_t vsrc; 2424 2425 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2426 if (src == 0) 2427 return; 2428 srca = src >> 24; 2429 vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); 2430 2431 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2432 dst_line, 1); 2433 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, 2434 mask_line, 1); 2435 2436 while (height--) 2437 { 2438 dst = dst_line; 2439 dst_line += dst_stride; 2440 mask = mask_line; 2441 mask_line += mask_stride; 2442 2443 RVV_FOREACH_2 (width, vl, e32m4, mask, dst) 2444 { 2445 vuint32m4_t m = __riscv_vle32_v_u32m4 (mask, vl); 2446 __riscv_vse32 ( 2447 dst, 2448 rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( 2449 __riscv_vle32_v_u32m4 (dst, vl), 2450 __riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (m, srca, vl), vl), 2451 rvv_UN8x4_MUL_UN8x4_vv_m4 (m, vsrc, vl), vl), 2452 vl); 2453 } 2454 } 2455 } 2456 2457 static void 2458 rvv_composite_over_n_8888_0565_ca (pixman_implementation_t *__restrict__ imp, 
2459 pixman_composite_info_t *__restrict__ info) 2460 { 2461 PIXMAN_COMPOSITE_ARGS (info); 2462 uint16_t *__restrict__ dst_line, *__restrict__ dst; 2463 uint32_t *__restrict__ mask_line, *__restrict__ mask; 2464 int dst_stride, mask_stride; 2465 uint32_t src, srca; 2466 vuint32m4_t vsrc; 2467 2468 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2469 srca = src >> 24; 2470 if (src == 0) 2471 return; 2472 vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ()); 2473 2474 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, 2475 dst_line, 1); 2476 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, 2477 mask_line, 1); 2478 2479 while (height--) 2480 { 2481 dst = dst_line; 2482 dst_line += dst_stride; 2483 mask = mask_line; 2484 mask_line += mask_stride; 2485 2486 RVV_FOREACH_2 (width, vl, e32m4, mask, dst) 2487 { 2488 vuint32m4_t ma = __riscv_vle32_v_u32m4 (mask, vl); 2489 2490 __riscv_vse16 ( 2491 dst, 2492 rvv_convert_8888_to_0565_m2 ( 2493 rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 ( 2494 rvv_convert_0565_to_0888_m4 ( 2495 __riscv_vle16_v_u16m2 (dst, vl), vl), 2496 __riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (ma, srca, vl), 2497 vl), 2498 rvv_UN8x4_MUL_UN8x4_vv_m4 (ma, vsrc, vl), vl), 2499 vl), 2500 vl); 2501 } 2502 } 2503 } 2504 2505 static void 2506 rvv_composite_over_8888_0565 (pixman_implementation_t *__restrict__ imp, 2507 pixman_composite_info_t *__restrict__ info) 2508 { 2509 PIXMAN_COMPOSITE_ARGS (info); 2510 uint16_t *__restrict__ dst_line, *__restrict__ dst; 2511 uint32_t *__restrict__ src_line, *__restrict__ src; 2512 int dst_stride, src_stride; 2513 2514 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, 2515 src_line, 1); 2516 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, 2517 dst_line, 1); 2518 2519 while (height--) 2520 { 2521 dst = dst_line; 2522 dst_line += dst_stride; 2523 src = src_line; 2524 src_line += src_stride; 2525 2526 RVV_FOREACH_2 
(width, vl, e16m2, src, dst) 2527 { 2528 __riscv_vse16 ( 2529 dst, 2530 rvv_convert_8888_to_0565_m2 ( 2531 rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl), 2532 rvv_convert_0565_to_0888_m4 ( 2533 __riscv_vle16_v_u16m2 (dst, vl), vl), 2534 vl), 2535 vl), 2536 vl); 2537 } 2538 } 2539 } 2540 2541 static void 2542 rvv_composite_add_8_8 (pixman_implementation_t *imp, 2543 pixman_composite_info_t *info) 2544 { 2545 PIXMAN_COMPOSITE_ARGS (info); 2546 uint8_t *dst_line, *dst; 2547 uint8_t *src_line, *src; 2548 int dst_stride, src_stride; 2549 2550 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, 2551 src_line, 1); 2552 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, 2553 dst_line, 1); 2554 2555 while (height--) 2556 { 2557 dst = dst_line; 2558 dst_line += dst_stride; 2559 src = src_line; 2560 src_line += src_stride; 2561 2562 RVV_FOREACH_2 (width, vl, e8m8, src, dst) 2563 { 2564 __riscv_vse8 (dst, 2565 rvv_UN8_ADD_UN8_vv (__riscv_vle8_v_u8m8 (src, vl), 2566 __riscv_vle8_v_u8m8 (dst, vl), 2567 vl), 2568 vl); 2569 } 2570 } 2571 } 2572 2573 static void 2574 rvv_composite_add_0565_0565 (pixman_implementation_t *imp, 2575 pixman_composite_info_t *info) 2576 { 2577 PIXMAN_COMPOSITE_ARGS (info); 2578 uint16_t *dst_line, *dst; 2579 uint16_t *src_line, *src; 2580 int dst_stride, src_stride; 2581 2582 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, 2583 src_line, 1); 2584 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, 2585 dst_line, 1); 2586 2587 while (height--) 2588 { 2589 dst = dst_line; 2590 dst_line += dst_stride; 2591 src = src_line; 2592 src_line += src_stride; 2593 2594 RVV_FOREACH_2 (width, vl, e16m2, src, dst) 2595 { 2596 __riscv_vse16 (dst, 2597 rvv_convert_8888_to_0565_m2 ( 2598 rvv_UN8x4_ADD_UN8x4_vv_m4 ( 2599 rvv_convert_0565_to_8888_m4 ( 2600 __riscv_vle16_v_u16m2 (src, vl), vl), 2601 rvv_convert_0565_to_8888_m4 ( 2602 __riscv_vle16_v_u16m2 (dst, vl), vl), 2603 vl), 2604 vl), 2605 
vl); 2606 } 2607 } 2608 } 2609 2610 static void 2611 rvv_composite_add_8888_8888 (pixman_implementation_t *__restrict__ imp, 2612 pixman_composite_info_t *__restrict__ info) 2613 { 2614 PIXMAN_COMPOSITE_ARGS (info); 2615 uint32_t *__restrict__ dst_line, *__restrict__ dst; 2616 uint32_t *__restrict__ src_line, *__restrict__ src; 2617 int dst_stride, src_stride; 2618 2619 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, 2620 src_line, 1); 2621 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, 2622 dst_line, 1); 2623 2624 while (height--) 2625 { 2626 dst = dst_line; 2627 dst_line += dst_stride; 2628 src = src_line; 2629 src_line += src_stride; 2630 2631 RVV_FOREACH_2 (width, vl, e32m4, src, dst) 2632 { 2633 __riscv_vse32 ( 2634 dst, 2635 rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (src, vl), 2636 __riscv_vle32_v_u32m4 (dst, vl), vl), 2637 vl); 2638 } 2639 } 2640 } 2641 2642 static void 2643 rvv_composite_add_n_8_8 (pixman_implementation_t *imp, 2644 pixman_composite_info_t *info) 2645 { 2646 PIXMAN_COMPOSITE_ARGS (info); 2647 uint8_t *dst_line, *dst; 2648 uint8_t *mask_line, *mask; 2649 int dst_stride, mask_stride; 2650 uint32_t src; 2651 uint8_t sa; 2652 2653 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, 2654 dst_line, 1); 2655 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, 2656 mask_line, 1); 2657 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2658 sa = (src >> 24); 2659 2660 while (height--) 2661 { 2662 dst = dst_line; 2663 dst_line += dst_stride; 2664 mask = mask_line; 2665 mask_line += mask_stride; 2666 2667 RVV_FOREACH_2 (width, vl, e8m4, mask, dst) 2668 { 2669 __riscv_vse8 ( 2670 dst, 2671 rvv_UN8_ADD_UN8_vv (rvv_UN8_MUL_UN8_vx_m4 ( 2672 __riscv_vle8_v_u8m4 (mask, vl), sa, vl), 2673 __riscv_vle8_v_u8m4 (dst, vl), vl), 2674 vl); 2675 } 2676 } 2677 } 2678 2679 static void 2680 rvv_composite_src_memcpy (pixman_implementation_t *imp, 2681 
pixman_composite_info_t *info) 2682 { 2683 PIXMAN_COMPOSITE_ARGS (info); 2684 int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8; 2685 uint32_t n_bytes = width * bpp; 2686 int dst_stride, src_stride; 2687 uint8_t *dst; 2688 uint8_t *src; 2689 2690 src_stride = src_image->bits.rowstride * 4; 2691 dst_stride = dest_image->bits.rowstride * 4; 2692 2693 src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp; 2694 dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp; 2695 2696 while (height--) 2697 { 2698 memcpy (dst, src, n_bytes); 2699 2700 dst += dst_stride; 2701 src += src_stride; 2702 } 2703 } 2704 2705 static void 2706 rvv_composite_in_n_8_8 (pixman_implementation_t *imp, 2707 pixman_composite_info_t *info) 2708 { 2709 PIXMAN_COMPOSITE_ARGS (info); 2710 uint32_t src, srca; 2711 uint8_t *dst_line, *dst; 2712 uint8_t *mask_line, *mask; 2713 int dst_stride, mask_stride; 2714 2715 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2716 srca = src >> 24; 2717 2718 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, 2719 dst_line, 1); 2720 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, 2721 mask_line, 1); 2722 2723 if (srca == 0xff) 2724 { 2725 while (height--) 2726 { 2727 dst = dst_line; 2728 dst_line += dst_stride; 2729 mask = mask_line; 2730 mask_line += mask_stride; 2731 2732 RVV_FOREACH_2 (width, vl, e8m4, mask, dst) 2733 { 2734 __riscv_vse8 ( 2735 dst, 2736 rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (mask, vl), 2737 __riscv_vle8_v_u8m4 (dst, vl), vl), 2738 vl); 2739 } 2740 } 2741 } 2742 else 2743 { 2744 while (height--) 2745 { 2746 dst = dst_line; 2747 dst_line += dst_stride; 2748 mask = mask_line; 2749 mask_line += mask_stride; 2750 2751 RVV_FOREACH_2 (width, vl, e8m4, mask, dst) 2752 { 2753 __riscv_vse8 (dst, 2754 rvv_UN8_MUL_UN8_vv_m4 ( 2755 rvv_UN8_MUL_UN8_vx_m4 ( 2756 __riscv_vle8_v_u8m4 (mask, vl), srca, vl), 2757 __riscv_vle8_v_u8m4 (dst, vl), 
vl), 2758 vl); 2759 } 2760 } 2761 } 2762 } 2763 2764 static void 2765 rvv_composite_in_8_8 (pixman_implementation_t *imp, 2766 pixman_composite_info_t *info) 2767 { 2768 PIXMAN_COMPOSITE_ARGS (info); 2769 uint8_t *dst_line, *dst; 2770 uint8_t *src_line, *src; 2771 int dst_stride, src_stride; 2772 2773 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, 2774 src_line, 1); 2775 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, 2776 dst_line, 1); 2777 2778 while (height--) 2779 { 2780 dst = dst_line; 2781 dst_line += dst_stride; 2782 src = src_line; 2783 src_line += src_stride; 2784 2785 RVV_FOREACH_2 (width, vl, e8m4, src, dst) 2786 { 2787 __riscv_vse8 (dst, 2788 rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (src, vl), 2789 __riscv_vle8_v_u8m4 (dst, vl), 2790 vl), 2791 vl); 2792 } 2793 } 2794 } 2795 2796 #define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs)) 2797 2798 /* 2799 * There is some potential for hand vectorization, but for now let's leave it 2800 * autovectorized. 
2801 */ 2802 static force_inline void 2803 pixman_fill1_line (uint32_t *dst, int offs, int width, int v) 2804 { 2805 if (offs) 2806 { 2807 int leading_pixels = 32 - offs; 2808 if (leading_pixels >= width) 2809 { 2810 if (v) 2811 *dst |= A1_FILL_MASK (width, offs); 2812 else 2813 *dst &= ~A1_FILL_MASK (width, offs); 2814 return; 2815 } 2816 else 2817 { 2818 if (v) 2819 *dst++ |= A1_FILL_MASK (leading_pixels, offs); 2820 else 2821 *dst++ &= ~A1_FILL_MASK (leading_pixels, offs); 2822 width -= leading_pixels; 2823 } 2824 } 2825 while (width >= 32) 2826 { 2827 if (v) 2828 *dst++ = 0xFFFFFFFF; 2829 else 2830 *dst++ = 0; 2831 width -= 32; 2832 } 2833 if (width > 0) 2834 { 2835 if (v) 2836 *dst |= A1_FILL_MASK (width, 0); 2837 else 2838 *dst &= ~A1_FILL_MASK (width, 0); 2839 } 2840 } 2841 2842 static void 2843 rvv_fill1 (uint32_t *bits, 2844 int stride, 2845 int x, 2846 int y, 2847 int width, 2848 int height, 2849 uint32_t filler) 2850 { 2851 uint32_t *dst = bits + y * stride + (x >> 5); 2852 int offs = x & 31; 2853 2854 while (height--) 2855 { 2856 pixman_fill1_line (dst, offs, width, (filler & 1)); 2857 dst += stride; 2858 } 2859 } 2860 2861 #define RVV_FILL(dtypew) \ 2862 static void rvv_fill_u##dtypew (uint32_t *__restrict__ bits, int stride, \ 2863 int x, int y, int width, int height, \ 2864 uint32_t filler) \ 2865 { \ 2866 uint##dtypew##_t *__restrict__ bitsw = (uint##dtypew##_t *)bits; \ 2867 int32_t vstride = stride * (32 / dtypew); \ 2868 vuint##dtypew##m8_t vfiller = __riscv_vmv_v_x_u##dtypew##m8 ( \ 2869 (uint##dtypew##_t)filler, __riscv_vsetvlmax_e##dtypew##m8 ()); \ 2870 \ 2871 bitsw += y * vstride + x; \ 2872 while (height--) \ 2873 { \ 2874 uint##dtypew##_t *__restrict__ d = bitsw; \ 2875 \ 2876 RVV_FOREACH_1 (width, vl, e##dtypew##m8, d) \ 2877 { \ 2878 __riscv_vse##dtypew (d, vfiller, vl); \ 2879 } \ 2880 \ 2881 bitsw += vstride; \ 2882 } \ 2883 } 2884 2885 RVV_FILL (8); 2886 RVV_FILL (16); 2887 RVV_FILL (32); 2888 2889 static pixman_bool_t 2890 rvv_fill 
(pixman_implementation_t *__restrict__ imp, 2891 uint32_t *__restrict__ bits, 2892 int stride, 2893 int bpp, 2894 int x, 2895 int y, 2896 int width, 2897 int height, 2898 uint32_t filler) 2899 { 2900 switch (bpp) 2901 { 2902 case 1: 2903 rvv_fill1 (bits, stride, x, y, width, height, filler); 2904 break; 2905 case 8: 2906 rvv_fill_u8 (bits, stride, x, y, width, height, filler); 2907 break; 2908 case 16: 2909 rvv_fill_u16 (bits, stride, x, y, width, height, filler); 2910 break; 2911 case 32: 2912 rvv_fill_u32 (bits, stride, x, y, width, height, filler); 2913 break; 2914 default: 2915 return FALSE; 2916 } 2917 2918 return TRUE; 2919 } 2920 2921 static void 2922 rvv_composite_solid_fill (pixman_implementation_t *imp, 2923 pixman_composite_info_t *info) 2924 { 2925 PIXMAN_COMPOSITE_ARGS (info); 2926 uint32_t src; 2927 2928 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 2929 2930 if (dest_image->bits.format == PIXMAN_a1) 2931 { 2932 src = src >> 31; 2933 } 2934 else if (dest_image->bits.format == PIXMAN_a8) 2935 { 2936 src = src >> 24; 2937 } 2938 else if (dest_image->bits.format == PIXMAN_r5g6b5 || 2939 dest_image->bits.format == PIXMAN_b5g6r5) 2940 { 2941 src = convert_8888_to_0565 (src); 2942 } 2943 2944 rvv_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, 2945 PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y, 2946 width, height, src); 2947 } 2948 2949 #define RVV_BLT(dtypew) \ 2950 static void rvv_blt_u##dtypew ( \ 2951 uint32_t *__restrict__ src_bits, uint32_t *__restrict__ dst_bits, \ 2952 int src_stride, int dst_stride, int src_x, int src_y, int dest_x, \ 2953 int dest_y, int width, int height) \ 2954 { \ 2955 uint##dtypew##_t *src_w = (uint##dtypew##_t *)src_bits; \ 2956 uint##dtypew##_t *dst_w = (uint##dtypew##_t *)dst_bits; \ 2957 \ 2958 src_stride = src_stride * (32 / dtypew); \ 2959 dst_stride = dst_stride * (32 / dtypew); \ 2960 \ 2961 src_w += src_stride * src_y + src_x; \ 2962 dst_w += dst_stride * dest_y + 
dest_x; \ 2963 \ 2964 while (height--) \ 2965 { \ 2966 uint##dtypew##_t *__restrict__ pd = dst_w; \ 2967 uint##dtypew##_t *__restrict__ ps = src_w; \ 2968 \ 2969 RVV_FOREACH_2 (width, vl, e##dtypew##m8, ps, pd) \ 2970 { \ 2971 __riscv_vse##dtypew ( \ 2972 pd, __riscv_vle##dtypew##_v_u##dtypew##m8 (ps, vl), vl); \ 2973 } \ 2974 \ 2975 dst_w += dst_stride; \ 2976 src_w += src_stride; \ 2977 } \ 2978 } 2979 RVV_BLT (8); 2980 RVV_BLT (16); 2981 RVV_BLT (32); 2982 2983 static pixman_bool_t 2984 rvv_blt (pixman_implementation_t *__restrict__ imp, 2985 uint32_t *__restrict__ src_bits, 2986 uint32_t *__restrict__ dst_bits, 2987 int src_stride, 2988 int dst_stride, 2989 int src_bpp, 2990 int dst_bpp, 2991 int src_x, 2992 int src_y, 2993 int dest_x, 2994 int dest_y, 2995 int width, 2996 int height) 2997 { 2998 if (src_bpp != dst_bpp) 2999 return FALSE; 3000 3001 switch (src_bpp) 3002 { 3003 case 8: 3004 rvv_blt_u8 (src_bits, dst_bits, src_stride, dst_stride, src_x, 3005 src_y, dest_x, dest_y, width, height); 3006 break; 3007 case 16: 3008 rvv_blt_u16 (src_bits, dst_bits, src_stride, dst_stride, src_x, 3009 src_y, dest_x, dest_y, width, height); 3010 break; 3011 case 32: 3012 rvv_blt_u32 (src_bits, dst_bits, src_stride, dst_stride, src_x, 3013 src_y, dest_x, dest_y, width, height); 3014 break; 3015 default: 3016 return FALSE; 3017 } 3018 3019 return TRUE; 3020 } 3021 3022 // clang-format off 3023 static const pixman_fast_path_t rvv_fast_paths[] = { 3024 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, rvv_composite_over_n_8_0565), 3025 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, rvv_composite_over_n_8_0565), 3026 // PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, rvv_composite_over_n_8_0888), 3027 // PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, rvv_composite_over_n_8_0888), 3028 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, rvv_composite_over_n_8_8888), 3029 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, rvv_composite_over_n_8_8888), 3030 PIXMAN_STD_FAST_PATH (OVER, 
solid, a8, a8b8g8r8, rvv_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, rvv_composite_over_n_8_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, rvv_composite_over_n_1_8888),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5, rvv_composite_over_n_1_0565),
    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5, rvv_composite_over_n_1_0565),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, rvv_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, rvv_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, rvv_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, rvv_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, rvv_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, rvv_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, rvv_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, rvv_composite_add_0565_0565),
    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, rvv_composite_add_0565_0565),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, rvv_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, rvv_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, rvv_composite_add_8_8),
    // PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1),
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, rvv_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, rvv_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, rvv_composite_solid_fill),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, rvv_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, rvv_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, rvv_composite_src_8888_8888),
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888), 3077 PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, rvv_composite_src_8888_8888), 3078 PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, rvv_composite_src_8888_8888), 3079 PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, rvv_composite_src_8888_8888), 3080 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, rvv_composite_src_memcpy), 3081 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, rvv_composite_src_memcpy), 3082 PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, rvv_composite_src_memcpy), 3083 PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, rvv_composite_src_memcpy), 3084 PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy), 3085 PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy), 3086 PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, rvv_composite_src_memcpy), 3087 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, rvv_composite_in_8_8), 3088 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, rvv_composite_in_n_8_8), 3089 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888), 3090 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888), 3091 3092 {PIXMAN_OP_NONE}, 3093 }; 3094 3095 pixman_implementation_t * 3096 _pixman_implementation_create_rvv (pixman_implementation_t *fallback) 3097 { 3098 pixman_implementation_t *imp = _pixman_implementation_create ( 3099 fallback, rvv_fast_paths); 3100 3101 // clang-format off 3102 imp->combine_float[PIXMAN_OP_CLEAR] = rvv_combine_clear_u_float; 3103 imp->combine_float[PIXMAN_OP_SRC] = rvv_combine_src_u_float; 3104 imp->combine_float[PIXMAN_OP_DST] = rvv_combine_dst_u_float; 3105 imp->combine_float[PIXMAN_OP_OVER] = rvv_combine_over_u_float; 3106 imp->combine_float[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u_float; 3107 imp->combine_float[PIXMAN_OP_IN] = rvv_combine_in_u_float; 3108 imp->combine_float[PIXMAN_OP_IN_REVERSE] = 
rvv_combine_in_reverse_u_float; 3109 imp->combine_float[PIXMAN_OP_OUT] = rvv_combine_out_u_float; 3110 imp->combine_float[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u_float; 3111 imp->combine_float[PIXMAN_OP_ATOP] = rvv_combine_atop_u_float; 3112 imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u_float; 3113 imp->combine_float[PIXMAN_OP_XOR] = rvv_combine_xor_u_float; 3114 imp->combine_float[PIXMAN_OP_ADD] = rvv_combine_add_u_float; 3115 imp->combine_float[PIXMAN_OP_SATURATE] = rvv_combine_saturate_u_float; 3116 3117 /* Disjoint, unified */ 3118 imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_u_float; 3119 imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_u_float; 3120 imp->combine_float[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_u_float; 3121 imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_u_float; 3122 imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_u_float; 3123 imp->combine_float[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_u_float; 3124 imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_u_float; 3125 imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_u_float; 3126 imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_u_float; 3127 imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_u_float; 3128 imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_u_float; 3129 imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_u_float; 3130 3131 /* Conjoint, unified */ 3132 imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_u_float; 3133 imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_u_float; 3134 imp->combine_float[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_u_float; 3135 imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = 
rvv_combine_conjoint_over_u_float; 3136 imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_u_float; 3137 imp->combine_float[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_u_float; 3138 imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_u_float; 3139 imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_u_float; 3140 imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_u_float; 3141 imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_u_float; 3142 imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_u_float; 3143 imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_u_float; 3144 3145 /* PDF operators, unified */ 3146 imp->combine_float[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u_float; 3147 imp->combine_float[PIXMAN_OP_SCREEN] = rvv_combine_screen_u_float; 3148 imp->combine_float[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u_float; 3149 imp->combine_float[PIXMAN_OP_DARKEN] = rvv_combine_darken_u_float; 3150 imp->combine_float[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u_float; 3151 imp->combine_float[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u_float; 3152 imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_u_float; 3153 imp->combine_float[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u_float; 3154 imp->combine_float[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u_float; 3155 imp->combine_float[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_u_float; 3156 imp->combine_float[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_u_float; 3157 3158 /* Component alpha combiners */ 3159 imp->combine_float_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear_ca_float; 3160 imp->combine_float_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca_float; 3161 imp->combine_float_ca[PIXMAN_OP_DST] = rvv_combine_dst_ca_float; 3162 imp->combine_float_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca_float; 3163 
imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca_float; 3164 imp->combine_float_ca[PIXMAN_OP_IN] = rvv_combine_in_ca_float; 3165 imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca_float; 3166 imp->combine_float_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca_float; 3167 imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca_float; 3168 imp->combine_float_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca_float; 3169 imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca_float; 3170 imp->combine_float_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca_float; 3171 imp->combine_float_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca_float; 3172 imp->combine_float_ca[PIXMAN_OP_SATURATE] = rvv_combine_saturate_ca_float; 3173 3174 /* Disjoint CA */ 3175 imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_ca_float; 3176 imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_ca_float; 3177 imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_ca_float; 3178 imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_ca_float; 3179 imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_ca_float; 3180 imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_ca_float; 3181 imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_ca_float; 3182 imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_ca_float; 3183 imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_ca_float; 3184 imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_ca_float; 3185 imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_ca_float; 3186 imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_ca_float; 3187 3188 /* Conjoint CA */ 3189 
imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_ca_float; 3190 imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_ca_float; 3191 imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_ca_float; 3192 imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_ca_float; 3193 imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_ca_float; 3194 imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_ca_float; 3195 imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_ca_float; 3196 imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_ca_float; 3197 imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_ca_float; 3198 imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_ca_float; 3199 imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_ca_float; 3200 imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_ca_float; 3201 3202 /* PDF operators CA */ 3203 imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca_float; 3204 imp->combine_float_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca_float; 3205 imp->combine_float_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca_float; 3206 imp->combine_float_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca_float; 3207 imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca_float; 3208 imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_ca_float; 3209 imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_ca_float; 3210 imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca_float; 3211 imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_ca_float; 3212 imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca_float; 3213 
imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca_float; 3214 3215 /* It is not clear that these make sense, so make them noops for now */ 3216 imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = rvv_combine_dst_u_float; 3217 imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = rvv_combine_dst_u_float; 3218 imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = rvv_combine_dst_u_float; 3219 imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = rvv_combine_dst_u_float; 3220 3221 /* Set up function pointers */ 3222 imp->combine_32[PIXMAN_OP_CLEAR] = rvv_combine_clear; 3223 imp->combine_32[PIXMAN_OP_SRC] = rvv_combine_src_u; 3224 imp->combine_32[PIXMAN_OP_OVER] = rvv_combine_over_u; 3225 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u; 3226 imp->combine_32[PIXMAN_OP_IN] = rvv_combine_in_u; 3227 imp->combine_32[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u; 3228 imp->combine_32[PIXMAN_OP_OUT] = rvv_combine_out_u; 3229 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u; 3230 imp->combine_32[PIXMAN_OP_ATOP] = rvv_combine_atop_u; 3231 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u; 3232 imp->combine_32[PIXMAN_OP_XOR] = rvv_combine_xor_u; 3233 imp->combine_32[PIXMAN_OP_ADD] = rvv_combine_add_u; 3234 3235 imp->combine_32[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u; 3236 imp->combine_32[PIXMAN_OP_SCREEN] = rvv_combine_screen_u; 3237 imp->combine_32[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u; 3238 imp->combine_32[PIXMAN_OP_DARKEN] = rvv_combine_darken_u; 3239 imp->combine_32[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u; 3240 imp->combine_32[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u; 3241 imp->combine_32[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u; 3242 imp->combine_32[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u; 3243 3244 imp->combine_32_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear; 3245 imp->combine_32_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca; 3246 imp->combine_32_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca; 3247 
imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca; 3248 imp->combine_32_ca[PIXMAN_OP_IN] = rvv_combine_in_ca; 3249 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca; 3250 imp->combine_32_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca; 3251 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca; 3252 imp->combine_32_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca; 3253 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca; 3254 imp->combine_32_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca; 3255 imp->combine_32_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca; 3256 3257 imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca; 3258 imp->combine_32_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca; 3259 imp->combine_32_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca; 3260 imp->combine_32_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca; 3261 imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca; 3262 imp->combine_32_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca; 3263 imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca; 3264 imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca; 3265 3266 imp->fill = rvv_fill; 3267 imp->blt = rvv_blt; 3268 3269 return imp; 3270 } 3271 // clang-format on