blend.h

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) {
#if USE_SSE2
  return _mm_packs_epi32(a, b);
#elif USE_NEON
  return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
#else
  return CONVERT(combine(a, b), HalfRGBA8);
#endif
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v,
                                                 float scale = 255.0f) {
  ivec4 i = round_pixel(v, scale);
  HalfRGBA8 xz = packRGBA8(i.z, i.x);
  HalfRGBA8 yw = packRGBA8(i.y, i.w);
  HalfRGBA8 xyzwl = zipLow(xz, yw);
  HalfRGBA8 xyzwh = zipHigh(xz, yw);
  HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
  HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
  return combine(lo, hi);
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha,
                                                 float scale = 255.0f) {
  I32 i = round_pixel(alpha, scale);
  HalfRGBA8 c = packRGBA8(i, i);
  c = zipLow(c, c);
  return zip(c, c);
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha,
                                                 float scale = 255.0f) {
  I32 i = round_pixel(alpha, scale);
  return repeat2(packRGBA8(i, i));
}

UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v,
                                                        float scale = 255.0f) {
  I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale);
  return repeat2(packRGBA8(i, i));
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
  return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v,
                                                 float scale = 255.0f) {
  ivec4 i = round_pixel(bit_cast<vec4>(v), scale);
  return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w));
}

static ALWAYS_INLINE WideR8 packR8(I32 a) {
#if USE_SSE2
  return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
#elif USE_NEON
  return vqmovun_s32(a);
#else
  return CONVERT(a, WideR8);
#endif
}

static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) {
  return packR8(round_pixel(c, scale));
}

static ALWAYS_INLINE WideR8 pack_pixels_R8() {
  return pack_pixels_R8(fragment_shader->gl_FragColor.x);
}

// Load a partial span > 0 and < 4 pixels.
template <typename V, typename P>
static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
  return bit_cast<V>(
      (span >= 2
           ? combine(unaligned_load<V2<P>>(src),
                     V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0})
           : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
}
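
// Illustrative note (ours, not part of the original header): for span == 3,
// partial_load_span performs one two-pixel load plus one scalar load and
// zero-fills the fourth lane, so with src = {A, B, C} it yields {A, B, C, 0};
// span == 1 yields {A, 0, 0, 0}. The zero lanes are harmless because the
// matching partial stores never write them back.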

// Store a partial span > 0 and < 4 pixels.
template <typename V, typename P>
static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
  auto pixels = bit_cast<V4<P>>(src);
  if (span >= 2) {
    unaligned_store(dst, lowHalf(pixels));
    if (span > 2) {
      unaligned_store(dst + 2, pixels.z);
    }
  } else {
    unaligned_store(dst, pixels.x);
  }
}

// Dispatcher that chooses whether to load a full or partial span
template <typename V, typename P>
static ALWAYS_INLINE V load_span(const P* src, int span) {
  if (span >= 4) {
    return unaligned_load<V, P>(src);
  } else {
    return partial_load_span<V, P>(src, span);
  }
}

// Dispatcher that chooses whether to store a full or partial span
template <typename V, typename P>
static ALWAYS_INLINE void store_span(P* dst, V src, int span) {
  if (span >= 4) {
    unaligned_store<V, P>(dst, src);
  } else {
    partial_store_span<V, P>(dst, src, span);
  }
}

template <typename T>
static ALWAYS_INLINE T muldiv256(T x, T y) {
  return (x * y) >> 8;
}

// (x*y + x) >> 8, cheap approximation of (x*y) / 255
template <typename T>
static ALWAYS_INLINE T muldiv255(T x, T y) {
  return (x * y + x) >> 8;
}

template <typename V>
static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v,
                                         float scale = 255.0f) {
  return pack_pixels_RGBA8(v, scale);
}

template <typename C>
static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) {
  return pack_pixels_R8(c, scale);
}

// Helper functions to apply a color modulus when available.
struct NoColor {};

template <typename P>
static ALWAYS_INLINE P applyColor(P src, NoColor) {
  return src;
}

struct InvertColor {};

template <typename P>
static ALWAYS_INLINE P applyColor(P src, InvertColor) {
  return 255 - src;
}

template <typename P>
static ALWAYS_INLINE P applyColor(P src, P color) {
  return muldiv255(color, src);
}

static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) {
  return applyColor(unpack(src), color);
}

template <typename P, typename C>
static ALWAYS_INLINE auto packColor(P* buf, C color) {
  return pack_span(buf, color, 255.0f);
}

template <typename P>
static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) {
  return noColor;
}

template <typename P>
static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf,
                                           InvertColor invertColor) {
  return invertColor;
}

// Single argument variation that takes an explicit destination buffer type.
template <typename P, typename C>
static ALWAYS_INLINE auto packColor(C color) {
  // Just pass in a typed null pointer, as the pack routines never use the
  // pointer's value, just its type.
  return packColor((P*)0, color);
}

// Byte-wise addition for when x or y is a signed 8-bit value stored in the
// low byte of a larger type T with zeroed-out high bits, where T is greater
// than 8 bits, i.e. uint16_t. This can result when muldiv255 is used upon
// signed operands, using up all the precision in a 16 bit integer, and
// potentially losing the sign bit in the last >> 8 shift. Due to the
// properties of two's complement arithmetic, even though we've discarded the
// sign bit, we can still represent a negative number under addition (without
// requiring any extra sign bits), just that any negative number will behave
// like a large unsigned number under addition, generating a single carry bit
// on overflow that we need to discard. Thus, just doing a byte-wise add will
// overflow without the troublesome carry, giving us only the remaining 8 low
// bits we actually need while keeping the high bits at zero.
template <typename T>
static ALWAYS_INLINE T addlow(T x, T y) {
  typedef VectorType<uint8_t, sizeof(T)> bytes;
  return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
}
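
// Worked example (ours, not part of the original header): muldiv255 computes
// x*(y + 1) >> 8, which stays within one step of the exact (x*y)/255 for
// 8-bit color operands; e.g. muldiv255(128, 128) = (16384 + 128) >> 8 = 64
// versus an exact 64.25. When an intermediate is negative, the final >> 8 can
// strip the sign bit, which is what addlow compensates for: a 16-bit lane
// holding -1 as 0x00FF added to 0x0002 gives 0x0101 with a plain 16-bit add,
// polluting the high byte, whereas a byte-wise add wraps 0xFF + 0x02 to 0x01,
// leaving the correct low byte of -1 + 2 and keeping the high byte zero.
UNUSED static inline bool muldiv255_close_enough(uint16_t x, uint16_t y) {
  // Sanity sketch for operands in 0..255: compare the shift-based
  // approximation against exact truncating division.
  uint16_t approx = muldiv255(x, y);
  uint16_t exact = uint16_t((uint32_t(x) * y) / 255);
  // The approximation may round up by at most one.
  return approx == exact || approx == exact + 1;
}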

// Replace color components of each pixel with the pixel's alpha values.
template <typename T>
static ALWAYS_INLINE T alphas(T c) {
  return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
}

// Replace the alpha values of the first vector with alpha values from the
// second, while leaving the color components unmodified.
template <typename T>
static ALWAYS_INLINE T set_alphas(T c, T a) {
  return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31);
}

// Miscellaneous helper functions for working with packed RGBA8 data.
static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t,
                                            HalfRGBA8 e) {
  return bit_cast<HalfRGBA8>((c & t) | (~c & e));
}

template <typename T, typename C, int N>
static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c,
                                                   VectorType<T, N> t,
                                                   VectorType<T, N> e) {
  return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)),
                 if_then_else(highHalf(c), highHalf(t), highHalf(e)));
}

static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) {
#if USE_SSE2
  return bit_cast<HalfRGBA8>(
      _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
#elif USE_NEON
  return vminq_u16(x, y);
#else
  return if_then_else(x < y, x, y);
#endif
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x,
                                          VectorType<T, N> y) {
  return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y)));
}

static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) {
#if USE_SSE2
  return bit_cast<HalfRGBA8>(
      _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
#elif USE_NEON
  return vmaxq_u16(x, y);
#else
  return if_then_else(x > y, x, y);
#endif
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x,
                                          VectorType<T, N> y) {
  return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y)));
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) {
  return combine(recip(lowHalf(v)), recip(highHalf(v)));
}
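
// Note (ours): the if_then_else helpers above implement a branchless select,
// (c & t) | (~c & e), which assumes each condition lane is all-ones or
// all-zeros, as produced by the vector comparison operators; e.g. a lane
// value of 0xFFFF selects t and a lane value of 0 selects e.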

// Helper to get the reciprocal if the value is non-zero, or otherwise default
// to the supplied fallback value.
template <typename V>
static ALWAYS_INLINE V recip_or(V v, float f) {
  return if_then_else(v != V(0.0f), recip(v), V(f));
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
  return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v)));
}

// Extract the alpha components so that we can cheaply calculate the reciprocal
// on a single SIMD register. Then multiply the duplicated alpha reciprocal with
// the pixel data. 0 alpha is treated as transparent black.
static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
  Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
  return v * a.xxxxyyyyzzzzwwww;
}

// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
// RGBA to unpack.
static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) {
  return bit_cast<vec4>(
      SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15));
}

// The following lum/sat functions mostly follow the KHR_blend_equation_advanced
// specification but are rearranged to work on premultiplied data.
static ALWAYS_INLINE Float lumv3(vec3 v) {
  return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f;
}

static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); }

static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); }

static inline vec3 clip_color(vec3 v, Float lum, Float alpha) {
  Float mincol = max(-minv3(v), lum);
  Float maxcol = max(maxv3(v), alpha - lum);
  return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f));
}

static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) {
  return clip_color(base - lumv3(base), lumv3(ref), alpha);
}

static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) {
  vec3 diff = base - minv3(base);
  Float sbase = maxv3(diff);
  Float ssat = maxv3(sref) - minv3(sref);
  // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale
  // to black, as per specification.
  return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha);
}

// Flags that reflect the current blend-stage clipping to be applied.
enum SWGLClipFlag {
  SWGL_CLIP_FLAG_MASK = 1 << 0,
  SWGL_CLIP_FLAG_AA = 1 << 1,
  SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2,
};
static int swgl_ClipFlags = 0;
static BlendKey swgl_BlendOverride = BLEND_KEY_NONE;
static WideRGBA8 swgl_BlendColorRGBA8 = {0};
static WideRGBA8 swgl_BlendAlphaRGBA8 = {0};

// A pointer into the color buffer for the start of the span.
static void* swgl_SpanBuf = nullptr;
// A pointer into the clip mask for the start of the span.
static uint8_t* swgl_ClipMaskBuf = nullptr;

static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) {
  return mask;
}
static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) {
  WideRG8 maskRG = zip(mask, mask);
  return zip(maskRG, maskRG);
}
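
// Illustrative note (ours): for an RGBA8 destination, expand_mask replicates
// each mask lane across all four color channels by zipping twice:
//   {m0, m1, m2, m3}
//   -> {m0, m0, m1, m1, m2, m2, m3, m3}
//   -> {m0, m0, m0, m0, m1, m1, m1, m1, m2, m2, m2, m2, m3, m3, m3, m3}
// so a single coverage value uniformly scales R, G, B, and A of its pixel.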

// Loads a chunk of clip masks. The current pointer into the color buffer is
// used to reconstruct the relative position within the span. From there, the
// pointer into the clip mask can be generated from the start of the clip mask
// span.
template <typename P>
static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) {
  return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf];
}

template <typename P>
static ALWAYS_INLINE auto load_clip_mask(P* buf, int span)
    -> decltype(expand_mask(buf, 0)) {
  return expand_mask(buf,
                     unpack(load_span<PackedR8>(get_clip_mask(buf), span)));
}

// Temporarily removes masking from the blend stage, assuming the caller will
// handle it.
static ALWAYS_INLINE void override_clip_mask() {
  blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE);
}

// Restores masking to the blend stage, assuming it was previously overridden.
static ALWAYS_INLINE void restore_clip_mask() {
  blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key);
}

// A pointer to the start of the opaque destination region of the span for AA.
static const uint8_t* swgl_OpaqueStart = nullptr;
// The size, in bytes, of the opaque region.
static uint32_t swgl_OpaqueSize = 0;
// AA coverage distance offsets for the left and right edges.
static Float swgl_LeftAADist = 0.0f;
static Float swgl_RightAADist = 0.0f;
// AA coverage slope values used for accumulating coverage for each step.
static Float swgl_AASlope = 0.0f;

// Get the number of pixels we need to process before the start of the opaque
// region.
template <typename P>
static ALWAYS_INLINE int get_aa_opaque_start(P* buf) {
  return max(int((P*)swgl_OpaqueStart - buf), 0);
}

// Assuming we are already in the opaque part of the span, return the remaining
// size of the opaque part.
template <typename P>
static ALWAYS_INLINE int get_aa_opaque_size(P* buf) {
  return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0);
}

// Temporarily removes anti-aliasing from the blend stage, assuming the caller
// will handle it.
static ALWAYS_INLINE void override_aa() {
  blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE);
}

// Restores anti-aliasing to the blend stage, assuming it was previously
// overridden.
static ALWAYS_INLINE void restore_aa() {
  blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key);
}
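
// Note (ours, inferred from the key arithmetic above and the DO_AA macro
// below): the BlendKey enum is assumed to lay out the plain, MASK_, AA_, and
// AA_MASK_ variants of each blend mode at fixed offsets, so subtracting
// MASK_BLEND_KEY_NONE or AA_BLEND_KEY_NONE strips that stage from the current
// key and adding it back restores it. Likewise, AA coverage is kept on a
// 0..256 scale rather than 0..255 so that full coverage is exact under
// muldiv256: (x * 256) >> 8 == x.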

static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
                                            WideRGBA8 src, int span = 4) {
  WideRGBA8 dst = unpack(pdst);
  const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0,      0xFFFF, 0xFFFF,
                              0xFFFF, 0,      0xFFFF, 0xFFFF, 0xFFFF, 0,
                              0xFFFF, 0xFFFF, 0xFFFF, 0};
  const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
                                0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
  const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
                                  0, 0, 0, 255, 0, 0, 0, 255};

  // clang-format off
  // Computes AA for the given pixel based on the offset of the pixel within
  // destination row. Given the initial coverage offsets for the left and right
  // edges, the offset is scaled by the slope and accumulated to find the
  // minimum coverage value for the pixel. A final weight is generated that
  // can be used to scale the source pixel.
#define DO_AA(format, body)                                   \
  do {                                                        \
    int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \
    if (uint32_t(offset) >= swgl_OpaqueSize) {                \
      Float delta = swgl_AASlope * float(offset);             \
      Float dist = clamp(min(swgl_LeftAADist + delta.x,       \
                             swgl_RightAADist + delta.y),     \
                         0.0f, 256.0f);                       \
      auto aa = pack_pixels_##format(dist, 1.0f);             \
      body;                                                   \
    }                                                         \
  } while (0)

  // Each blend case is preceded by the MASK_ variant. The MASK_ case first
  // loads the mask values and multiplies the source value by them. After, it
  // falls through to the normal blending case using the masked source. The
  // AA_ variations may further precede the blend cases, in which case the
  // source value is further modified before use.
#define BLEND_CASE_KEY(key)                          \
  case AA_##key:                                     \
    DO_AA(RGBA8, src = muldiv256(src, aa));          \
    goto key;                                        \
  case AA_MASK_##key:                                \
    DO_AA(RGBA8, src = muldiv256(src, aa));          \
    FALLTHROUGH;                                     \
  case MASK_##key:                                   \
    src = muldiv255(src, load_clip_mask(buf, span)); \
    FALLTHROUGH;                                     \
  case key: key

#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))

  switch (blend_key) {
    BLEND_CASE(GL_ONE, GL_ZERO):
      return src;
    BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE,
               GL_ONE_MINUS_SRC_ALPHA):
      // dst + src.a*(src.rgb1 - dst)
      // use addlow for signed overflow
      return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst));
    BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
      return src + dst - muldiv255(dst, alphas(src));
    BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
      return dst - muldiv255(dst, src);
    BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
      return dst - (muldiv255(dst, src) & RGB_MASK);
    BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
      return dst - muldiv255(dst, alphas(src));
    BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
      return muldiv255(src, dst);
    BLEND_CASE(GL_ONE, GL_ONE):
      return src + dst;
    BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
      return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
    BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
      // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
      return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
    BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
      // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst)
      // use addlow for signed overflow
      return addlow(dst,
                    muldiv255(src, repeat2(ctx->blendcolor) - dst));
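
    // Expansion sketch (ours): each BLEND_CASE above generates four adjacent
    // switch cases that funnel into one shared body, roughly:
    //   case AA_key:      apply the AA weight, then goto the plain-key label;
    //   case AA_MASK_key: apply the AA weight, fall through;
    //   case MASK_key:    multiply src by the clip mask, fall through;
    //   case key:         the shared blend body.
    // The trailing "case key: key" in BLEND_CASE_KEY emits both a case and a
    // goto label, letting the AA-only variant skip the masking step.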

    // We must explicitly handle the masked/anti-aliased secondary blend case.
    // The secondary color as well as the source must be multiplied by the
    // weights.
    case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
      WideRGBA8 secondary =
          applyColor(dst,
                     packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
      return src + dst - secondary;
    }
    case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
      WideRGBA8 secondary =
          applyColor(dst,
                     packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
      WideRGBA8 mask = load_clip_mask(buf, span);
      return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
    }
    case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
      WideRGBA8 secondary =
          applyColor(dst,
                     packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
      DO_AA(RGBA8, {
        src = muldiv256(src, aa);
        secondary = muldiv256(secondary, aa);
      });
      return src + dst - secondary;
    }
    case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
      WideRGBA8 secondary =
          applyColor(dst,
                     packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
      WideRGBA8 mask = load_clip_mask(buf, span);
      DO_AA(RGBA8, mask = muldiv256(mask, aa));
      return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
    }

    BLEND_CASE(GL_MIN):
      return min(src, dst);
    BLEND_CASE(GL_MAX):
      return max(src, dst);

    // The KHR_blend_equation_advanced spec describes the blend equations such
    // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to
    // the result:
    //   Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
    //   Ar = As*Ad + As*(1-Ad) + Ad*(1-As)
    // However, working with unpremultiplied values requires expensive math to
    // unpremultiply and premultiply again during blending. We can use the fact
    // that premultiplied value P = C*A and simplify the equations such that no
    // unpremultiplied colors are necessary, allowing us to stay with integer
    // math that avoids floating-point conversions in the common case. Some of
    // the blend modes require division or sqrt, in which case we do convert
    // to (possibly transposed/unpacked) floating-point to implement the mode.
    // However, most common modes can still use cheaper premultiplied integer
    // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified
    // to:
    //   Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
    //   .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As
    //   Ar = As*Ad + As - As*Ad + Ad - Ad*As
    //   .. Ar = As + Ad - As*Ad
    // Note that the alpha equation is the same for all blend equations, so as
    // long as the implementation results in As + Ad - As*Ad, we can avoid
    // using separate instructions to compute the alpha result, which is
    // dependent on the math used to implement each blend mode. The exact
    // reductions used to get the final math for every blend mode are too
    // involved to show here in comments, but mostly follow from replacing
    // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms
    // as possible.
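
    // Worked example (ours, following the reduction above): for screen,
    // f(Cs,Cd) = Cs + Cd - Cs*Cd, so
    //   Cr = (Cs + Cd - Cs*Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
    //   .. Pr = Ps + Pd - Ps*Pd
    // which matches the GL_SCREEN_KHR case below, src + dst -
    // muldiv255(src, dst), with the alpha lanes also reducing to the common
    // As + Ad - As*Ad form.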

    BLEND_CASE(GL_MULTIPLY_KHR): {
      WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK),
                                 alphas(dst) - (dst & RGB_MASK));
      return src + dst + (diff & RGB_MASK) - alphas(diff);
    }
    BLEND_CASE(GL_SCREEN_KHR):
      return src + dst - muldiv255(src, dst);
    BLEND_CASE(GL_OVERLAY_KHR): {
      WideRGBA8 srcA = alphas(src);
      WideRGBA8 dstA = alphas(dst);
      WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
      return src + dst +
             if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff),
                          -diff);
    }
    BLEND_CASE(GL_DARKEN_KHR):
      return src + dst -
             max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
    BLEND_CASE(GL_LIGHTEN_KHR):
      return src + dst -
             min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));

    BLEND_CASE(GL_COLORDODGE_KHR): {
      // Color-dodge and color-burn require division, so we convert to FP math
      // here, but avoid transposing to a vec4.
      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
      WideRGBA32F srcA = alphas(srcF);
      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
      WideRGBA32F dstA = alphas(dstF);
      return pack_pixels_RGBA8(
          srcA * set_alphas(
                     min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)),
                     dstF) +
              srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
          1.0f / 255.0f);
    }
    BLEND_CASE(GL_COLORBURN_KHR): {
      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
      WideRGBA32F srcA = alphas(srcF);
      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
      WideRGBA32F dstA = alphas(dstF);
      return pack_pixels_RGBA8(
          srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA *
                                                  recip_or(srcF, 255.0f))),
                            dstF) +
              srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
          1.0f / 255.0f);
    }
    BLEND_CASE(GL_HARDLIGHT_KHR): {
      WideRGBA8 srcA = alphas(src);
      WideRGBA8 dstA = alphas(dst);
      WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
      return src + dst +
             if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff),
                          -diff);
    }

    BLEND_CASE(GL_SOFTLIGHT_KHR): {
      // Soft-light requires an unpremultiply that can't be factored out as
      // well as a sqrt, so we convert to FP math here, but avoid transposing
      // to a vec4.
      WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
      WideRGBA32F srcA = alphas(srcF);
      WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
      WideRGBA32F dstA = alphas(dstF);
      WideRGBA32F dstU = unpremultiply(dstF);
      WideRGBA32F scale = srcF + srcF - srcA;
      return pack_pixels_RGBA8(
          dstF * (255.0f +
                  set_alphas(
                      scale *
                          if_then_else(scale < 0.0f, 1.0f - dstU,
                                       min((16.0f * dstU - 12.0f) * dstU + 3.0f,
                                           inversesqrt(dstU) - 1.0f)),
                      WideRGBA32F(0.0f))) +
              srcF * (255.0f - dstA),
          1.0f / 255.0f);
    }
    BLEND_CASE(GL_DIFFERENCE_KHR): {
      WideRGBA8 diff =
          min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst)));
      return src + dst - diff - (diff & RGB_MASK);
    }
    BLEND_CASE(GL_EXCLUSION_KHR): {
      WideRGBA8 diff = muldiv255(src, dst);
      return src + dst - diff - (diff & RGB_MASK);
    }
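
    // Note (ours): in the difference/exclusion cases above, returning
    // src + dst - diff - (diff & RGB_MASK) subtracts diff twice from the
    // color lanes but only once from the alpha lane. The color lanes thus get
    // the mode's 2*min(Pd*As, Ps*Ad) (or 2*Ps*Pd) term while the alpha lane
    // still reduces to the common As + Ad - As*Ad form.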

    // The HSL blend modes are non-separable and require complicated use of
    // division. It is advantageous to convert to FP and transpose to vec4
    // math to more easily manipulate the individual color components.
#define DO_HSL(rgb)                                                            \
  do {                                                                         \
    vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));                           \
    vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));                           \
    Float srcA = srcV.w * (1.0f / 255.0f);                                     \
    Float dstA = dstV.w * (1.0f / 255.0f);                                     \
    Float srcDstA = srcV.w * dstA;                                             \
    vec3 srcC = vec3(srcV) * dstA;                                             \
    vec3 dstC = vec3(dstV) * srcA;                                             \
    return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \
                                  srcV.w + dstV.w - srcDstA),                  \
                             1.0f);                                            \
  } while (0)

    BLEND_CASE(GL_HSL_HUE_KHR):
      DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA));
    BLEND_CASE(GL_HSL_SATURATION_KHR):
      DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA));
    BLEND_CASE(GL_HSL_COLOR_KHR):
      DO_HSL(set_lum(srcC, dstC, srcDstA));
    BLEND_CASE(GL_HSL_LUMINOSITY_KHR):
      DO_HSL(set_lum(dstC, srcC, srcDstA));

    // SWGL-specific extended blend modes.
    BLEND_CASE(SWGL_BLEND_DROP_SHADOW): {
      // Premultiplied alpha over blend, but with source color set to source
      // alpha modulated with a constant color.
      WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8);
      return color + dst - muldiv255(dst, alphas(color));
    }

    BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT):
      // Premultiplied alpha over blend, but treats the source as a subpixel
      // mask modulated with a constant color.
      return applyColor(src, swgl_BlendColorRGBA8) + dst -
             muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8));

    default:
      UNREACHABLE;
      // return src;
  }

#undef BLEND_CASE
#undef BLEND_CASE_KEY
  // clang-format on
}
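
// Note (ours): the single-channel R8 overload below mirrors the RGBA8 path
// above but supports only the few blend modes SWGL needs when blending into
// 8-bit alpha-mask targets, so its switch is much smaller.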

static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src,
                                         int span = 4) {
  // clang-format off
#define BLEND_CASE_KEY(key)                          \
  case AA_##key:                                     \
    DO_AA(R8, src = muldiv256(src, aa));             \
    goto key;                                        \
  case AA_MASK_##key:                                \
    DO_AA(R8, src = muldiv256(src, aa));             \
    FALLTHROUGH;                                     \
  case MASK_##key:                                   \
    src = muldiv255(src, load_clip_mask(buf, span)); \
    FALLTHROUGH;                                     \
  case key: key

#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))

  switch (blend_key) {
    BLEND_CASE(GL_ONE, GL_ZERO):
      return src;
    BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
      return muldiv255(src, dst);
    BLEND_CASE(GL_ONE, GL_ONE):
      return src + dst;
    default:
      UNREACHABLE;
      // return src;
  }

#undef BLEND_CASE
#undef BLEND_CASE_KEY
  // clang-format on
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) {
  unaligned_store(buf, pack(r));
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) {
  partial_store_span(buf, pack(r), len);
}

static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) {
  return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r);
}

static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) {
  return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len);
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) {
  unaligned_store(buf, r);
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) {
  partial_store_span(buf, r, len);
}

static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) {
  return pack(blend_span(buf, unpack(r)));
}

static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r,
                                            int len) {
  return pack(blend_span(buf, unpack(r), len));
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) {
  unaligned_store(buf, pack(r));
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) {
  partial_store_span(buf, pack(r), len);
}

static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) {
  return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r);
}

static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) {
  return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r,
                      len);
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) {
  unaligned_store(buf, r);
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) {
  partial_store_span(buf, r, len);
}

static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) {
  return pack(blend_span(buf, unpack(r)));
}

static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) {
  return pack(blend_span(buf, unpack(r), len));
}

template <bool BLEND, typename P, typename R>
static ALWAYS_INLINE void commit_blend_span(P* buf, R r) {
  if (BLEND) {
    commit_span(buf, blend_span(buf, r));
  } else {
    commit_span(buf, r);
  }
}

template <bool BLEND, typename P, typename R>
static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) {
  if (BLEND) {
    commit_span(buf, blend_span(buf, r, len), len);
  } else {
    commit_span(buf, r, len);
  }
}

template <typename P, typename R>
static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) {
  for (P* end = &buf[len & ~3]; buf < end; buf += 4) {
    commit_span(buf, blend_span(buf, r));
  }
  len &= 3;
  if (len > 0) {
    partial_store_span(buf, pack(blend_span(buf, r, len)), len);
  }
}

template <bool BLEND>
static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) {
  commit_blend_solid_span(buf, r, len);
}
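
// Worked example (ours): commit_blend_solid_span walks the span in 4-pixel
// chunks and finishes with a partial store, so len = 11 blends two full
// chunks (len & ~3 == 8) followed by a 3-pixel tail (len & 3 == 3).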

template <>
ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r,
                                            int len) {
  fill_n(buf, len, bit_cast<U32>(pack(r)).x);
}

template <bool BLEND>
static void commit_solid_span(uint8_t* buf, WideR8 r, int len) {
  commit_blend_solid_span(buf, r, len);
}

template <>
ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) {
  PackedR8 p = pack(r);
  if (uintptr_t(buf) & 3) {
    int align = 4 - (uintptr_t(buf) & 3);
    align = min(align, len);
    partial_store_span(buf, p, align);
    buf += align;
    len -= align;
  }
  fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p));
  buf += len & ~3;
  len &= 3;
  if (len > 0) {
    partial_store_span(buf, p, len);
  }
}
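
// Worked example (ours): in the unblended uint8_t specialization above, a
// buffer with (uintptr_t(buf) & 3) == 3 and len == 10 first stores 1 pixel to
// reach word alignment, then fills (10 - 1) / 4 == 2 whole words (8 pixels),
// and finally stores the remaining 1-pixel tail via partial_store_span.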