tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

blend.h (31976B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) {
      6 #if USE_SSE2
      7  return _mm_packs_epi32(a, b);
      8 #elif USE_NEON
      9  return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
     10 #else
     11  return CONVERT(combine(a, b), HalfRGBA8);
     12 #endif
     13 }
     14 
// Converts a planar (SoA) vec4 of 4 pixels to interleaved (AoS) wide pixels
// in 16-bit components. Rounds each channel to fixed point, then uses a
// sequence of zips to transpose channel-planes into per-pixel order. Note
// the z/x channel swap in the first packs, which produces the buffer's
// byte order (presumably BGRA — matches the swizzles elsewhere in the file).
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v,
                                                 float scale = 255.0f) {
  ivec4 i = round_pixel(v, scale);
  HalfRGBA8 xz = packRGBA8(i.z, i.x);
  HalfRGBA8 yw = packRGBA8(i.y, i.w);
  // Interleave the channel pairs into full per-pixel groups.
  HalfRGBA8 xyzwl = zipLow(xz, yw);
  HalfRGBA8 xyzwh = zipHigh(xz, yw);
  HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
  HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
  return combine(lo, hi);
}
     26 
// Packs a vector of 4 alpha values, broadcasting each alpha to all four
// channels of its corresponding pixel.
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha,
                                                 float scale = 255.0f) {
  I32 i = round_pixel(alpha, scale);
  HalfRGBA8 c = packRGBA8(i, i);
  // Duplicate each lane twice, then twice again, to fill all 4 channels.
  c = zipLow(c, c);
  return zip(c, c);
}
     34 
// Packs a single scalar alpha, replicated across every channel of all 4
// pixels in the chunk.
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha,
                                                 float scale = 255.0f) {
  // round_pixel on a scalar broadcasts the value to all lanes.
  I32 i = round_pixel(alpha, scale);
  return repeat2(packRGBA8(i, i));
}
     40 
// Packs a single scalar color, replicated to all 4 pixels of the chunk.
// The {z, y, x, w} swizzle converts to the buffer's byte order, matching
// the channel swap in the vec4 overload above.
UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v,
                                                        float scale = 255.0f) {
  I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale);
  return repeat2(packRGBA8(i, i));
}
     46 
// Convenience overload that packs the current fragment shader output color.
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
  return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
}
     50 
// Packs already-interleaved floating-point pixel data back to wide 16-bit
// components. Since the data is AoS rather than SoA here, the chunks are
// packed in straight x,y,z,w order with no channel transpose needed.
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v,
                                                 float scale = 255.0f) {
  ivec4 i = round_pixel(bit_cast<vec4>(v), scale);
  return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w));
}
     56 
// Saturating pack of a single I32 vector to 4 unsigned 16-bit lanes for the
// single-channel R8 format.
static ALWAYS_INLINE WideR8 packR8(I32 a) {
#if USE_SSE2
  // Pack against itself and keep the low half of the duplicated result.
  return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
#elif USE_NEON
  return vqmovun_s32(a);
#else
  return CONVERT(a, WideR8);
#endif
}
     66 
// Rounds 4 single-channel values to fixed point and packs them to WideR8.
static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) {
  return packR8(round_pixel(c, scale));
}
     70 
// Convenience overload that packs the red channel of the current fragment
// shader output color.
static ALWAYS_INLINE WideR8 pack_pixels_R8() {
  return pack_pixels_R8(fragment_shader->gl_FragColor.x);
}
     74 
// Load a partial span > 0 and < 4 pixels.
// Loads exactly `span` pixels and zero-fills the remaining lanes so the
// result is always a full 4-element vector: 1 pixel loads a single element,
// 2 loads a pair, 3 loads a pair plus one trailing element.
template <typename V, typename P>
static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
  return bit_cast<V>(
      (span >= 2
           ? combine(unaligned_load<V2<P>>(src),
                     V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0})
           : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
}
     84 
     85 // Store a partial span > 0 and < 4 pixels.
     86 template <typename V, typename P>
     87 static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
     88  auto pixels = bit_cast<V4<P>>(src);
     89  if (span >= 2) {
     90    unaligned_store(dst, lowHalf(pixels));
     91    if (span > 2) {
     92      unaligned_store(dst + 2, pixels.z);
     93    }
     94  } else {
     95    unaligned_store(dst, pixels.x);
     96  }
     97 }
     98 
     99 // Dispatcher that chooses when to load a full or partial span
    100 template <typename V, typename P>
    101 static ALWAYS_INLINE V load_span(const P* src, int span) {
    102  if (span >= 4) {
    103    return unaligned_load<V, P>(src);
    104  } else {
    105    return partial_load_span<V, P>(src, span);
    106  }
    107 }
    108 
    109 // Dispatcher that chooses when to store a full or partial span
    110 template <typename V, typename P>
    111 static ALWAYS_INLINE void store_span(P* dst, V src, int span) {
    112  if (span >= 4) {
    113    unaligned_store<V, P>(dst, src);
    114  } else {
    115    partial_store_span<V, P>(dst, src, span);
    116  }
    117 }
    118 
    119 template <typename T>
    120 static ALWAYS_INLINE T muldiv256(T x, T y) {
    121  return (x * y) >> 8;
    122 }
    123 
    124 // (x*y + x) >> 8, cheap approximation of (x*y) / 255
    125 template <typename T>
    126 static ALWAYS_INLINE T muldiv255(T x, T y) {
    127  return (x * y + x) >> 8;
    128 }
    129 
// Packs a span color for an RGBA8 destination. The buffer pointer is unused
// at runtime; its type selects the destination format overload.
template <typename V>
static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v,
                                         float scale = 255.0f) {
  return pack_pixels_RGBA8(v, scale);
}
    135 
// Packs a span color for a single-channel R8 destination; the pointer type
// again only selects the format.
template <typename C>
static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) {
  return pack_pixels_R8(c, scale);
}
    140 
// Helper functions to apply a color modulus when available.
// Tag type meaning "no color modulation": applyColor is the identity.
struct NoColor {};

template <typename P>
static ALWAYS_INLINE P applyColor(P src, NoColor) {
  return src;
}
    148 
// Tag type meaning "invert": applyColor yields the 255-complement of each
// component.
struct InvertColor {};

template <typename P>
static ALWAYS_INLINE P applyColor(P src, InvertColor) {
  return 255 - src;
}
    155 
// General case: modulate the source by the color, dividing by 255 to stay
// in pixel range.
template <typename P>
static ALWAYS_INLINE P applyColor(P src, P color) {
  return muldiv255(color, src);
}
    160 
// Packed-source convenience: unpack to wide 16-bit components first so the
// modulation math has headroom.
static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) {
  return applyColor(unpack(src), color);
}
    164 
// Packs a color for later use with applyColor. The buffer pointer only
// selects the destination format via overload resolution.
template <typename P, typename C>
static ALWAYS_INLINE auto packColor(P* buf, C color) {
  return pack_span(buf, color, 255.0f);
}
    169 
// NoColor needs no packing; pass the tag through unchanged.
template <typename P>
static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) {
  return noColor;
}
    174 
// InvertColor needs no packing; pass the tag through unchanged.
template <typename P>
static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf,
                                           InvertColor invertColor) {
  return invertColor;
}
    180 
// Single argument variation that takes an explicit destination buffer type.
template <typename P, typename C>
static ALWAYS_INLINE auto packColor(C color) {
  // Just pass in a typed null pointer, as the pack routines never use the
  // pointer's value, just its type.
  return packColor((P*)0, color);
}
    188 
// Byte-wise addition for when x or y is a signed 8-bit value stored in the
// low byte of a larger type T only with zeroed-out high bits, where T is
// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used
// upon signed operands, using up all the precision in a 16 bit integer, and
// potentially losing the sign bit in the last >> 8 shift. Due to the
// properties of two's complement arithmetic, even though we've discarded the
// sign bit, we can still represent a negative number under addition (without
// requiring any extra sign bits), just that any negative number will behave
// like a large unsigned number under addition, generating a single carry bit
// on overflow that we need to discard. Thus, just doing a byte-wise add will
// overflow without the troublesome carry, giving us only the remaining 8 low
// bits we actually need while keeping the high bits at zero.
template <typename T>
static ALWAYS_INLINE T addlow(T x, T y) {
  // Reinterpret as bytes so lane overflow cannot carry into the high byte.
  typedef VectorType<uint8_t, sizeof(T)> bytes;
  return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
}
    206 
    207 // Replace color components of each pixel with the pixel's alpha values.
    208 template <typename T>
    209 static ALWAYS_INLINE T alphas(T c) {
    210  return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
    211 }
    212 
    213 // Replace the alpha values of the first vector with alpha values from the
    214 // second, while leaving the color components unmodified.
    215 template <typename T>
    216 static ALWAYS_INLINE T set_alphas(T c, T a) {
    217  return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31);
    218 }
    219 
// Miscellaneous helper functions for working with packed RGBA8 data.
// Lane-wise select: c must hold all-ones (-1) or all-zeros per lane, as
// produced by vector comparisons.
static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t,
                                            HalfRGBA8 e) {
  return bit_cast<HalfRGBA8>((c & t) | (~c & e));
}
    225 
// Wide-vector select: recurses on the low and high halves so the HalfRGBA8
// overload (or a narrower specialization) does the real work.
template <typename T, typename C, int N>
static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c,
                                                   VectorType<T, N> t,
                                                   VectorType<T, N> e) {
  return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)),
                 if_then_else(highHalf(c), highHalf(t), highHalf(e)));
}
    233 
// Lane-wise minimum for half-width pixel data.
// NOTE(review): the SSE2 path uses a signed 16-bit min; values are assumed
// to stay within 0..0x7FFF so signed and unsigned compare agree — confirm
// against the ranges produced by the blend math.
static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) {
#if USE_SSE2
  return bit_cast<HalfRGBA8>(
      _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
#elif USE_NEON
  return vminq_u16(x, y);
#else
  return if_then_else(x < y, x, y);
#endif
}
    244 
// Wide-vector minimum, recursing onto the half-width overloads.
template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x,
                                          VectorType<T, N> y) {
  return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y)));
}
    250 
// Lane-wise maximum for half-width pixel data. Same signed-compare caveat
// as min() above for the SSE2 path.
static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) {
#if USE_SSE2
  return bit_cast<HalfRGBA8>(
      _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
#elif USE_NEON
  return vmaxq_u16(x, y);
#else
  return if_then_else(x > y, x, y);
#endif
}
    261 
// Wide-vector maximum, recursing onto the half-width overloads.
template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x,
                                          VectorType<T, N> y) {
  return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y)));
}
    267 
// Wide-vector reciprocal, recursing onto a narrower recip() overload
// defined elsewhere.
template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) {
  return combine(recip(lowHalf(v)), recip(highHalf(v)));
}
    272 
// Helper to get the reciprocal if the value is non-zero, or otherwise default
// to the supplied fallback value.
template <typename V>
static ALWAYS_INLINE V recip_or(V v, float f) {
  return if_then_else(v != V(0.0f), recip(v), V(f));
}
    279 
// Wide-vector reciprocal square root, recursing onto a narrower overload.
template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
  return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v)));
}
    284 
    285 // Extract the alpha components so that we can cheaply calculate the reciprocal
    286 // on a single SIMD register. Then multiply the duplicated alpha reciprocal with
    287 // the pixel data. 0 alpha is treated as transparent black.
    288 static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
    289  Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
    290  return v * a.xxxxyyyyzzzzwwww;
    291 }
    292 
// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
// RGBA to unpack.
static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) {
  return bit_cast<vec4>(
      SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15));
}
    299 
// The following lum/sat functions mostly follow the KHR_blend_equation_advanced
// specification but are rearranged to work on premultiplied data.
// Luminance as a weighted sum of RGB (0.30/0.59/0.11 weights per the spec).
static ALWAYS_INLINE Float lumv3(vec3 v) {
  return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f;
}
    305 
// Minimum of the three color components.
static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); }
    307 
// Maximum of the three color components.
static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); }
    309 
// Clamps a color about its luminance so it stays within the representable
// range, per the KHR_blend_equation_advanced ClipColor step, rearranged for
// premultiplied data. The recip_or fallback of 0 collapses a degenerate
// (zero-extent) range to the luminance itself.
static inline vec3 clip_color(vec3 v, Float lum, Float alpha) {
  Float mincol = max(-minv3(v), lum);
  Float maxcol = max(maxv3(v), alpha - lum);
  return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f));
}
    315 
// Replaces the luminance of `base` with the luminance of `ref`, then clips
// the result back into range (KHR SetLum, premultiplied form).
static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) {
  return clip_color(base - lumv3(base), lumv3(ref), alpha);
}
    319 
// Replaces the saturation of `base` with that of `sref` and the luminance
// with that of `lref` (KHR SetLumSat, premultiplied form).
static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) {
  vec3 diff = base - minv3(base);
  Float sbase = maxv3(diff);
  Float ssat = maxv3(sref) - minv3(sref);
  // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale
  // to black, as per specification.
  return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha);
}
    328 
// Flags that reflect the current blend-stage clipping to be applied.
enum SWGLClipFlag {
  SWGL_CLIP_FLAG_MASK = 1 << 0,
  SWGL_CLIP_FLAG_AA = 1 << 1,
  SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2,
};
// Bitmask of SWGLClipFlag values currently in effect.
static int swgl_ClipFlags = 0;
// Override blend key; presumably consulted when SWGL_CLIP_FLAG_BLEND_OVERRIDE
// is set — its use is outside this chunk.
static BlendKey swgl_BlendOverride = BLEND_KEY_NONE;
// Constant color/alpha inputs for the SWGL-specific extended blend modes
// (drop shadow, subpixel text) below.
static WideRGBA8 swgl_BlendColorRGBA8 = {0};
static WideRGBA8 swgl_BlendAlphaRGBA8 = {0};
    339 
// A pointer into the color buffer for the start of the span.
static void* swgl_SpanBuf = nullptr;
// A pointer into the clip mask for the start of the span.
static uint8_t* swgl_ClipMaskBuf = nullptr;
    344 
// Expands a chunk of clip-mask values for an R8 destination: already the
// right width, so pass through unchanged.
static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) {
  return mask;
}
// Expands a chunk of clip-mask values for an RGBA8 destination: duplicate
// each mask value across all four channels of its pixel.
static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) {
  WideRG8 maskRG = zip(mask, mask);
  return zip(maskRG, maskRG);
}
    352 
// Loads a chunk of clip masks. The current pointer into the color buffer is
// used to reconstruct the relative position within the span. From there, the
// pointer into the clip mask can be generated from the start of the clip mask
// span.
template <typename P>
static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) {
  // buf - swgl_SpanBuf is the pixel offset into the current span.
  return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf];
}
    361 
// Loads up to `span` clip-mask values for the chunk at buf and expands them
// to match the destination format's channel layout.
template <typename P>
static ALWAYS_INLINE auto load_clip_mask(P* buf, int span)
    -> decltype(expand_mask(buf, 0)) {
  return expand_mask(buf,
                     unpack(load_span<PackedR8>(get_clip_mask(buf), span)));
}
    368 
// Temporarily removes masking from the blend stage, assuming the caller will
// handle it. Relies on MASK_ keys being laid out at a fixed offset
// (MASK_BLEND_KEY_NONE) from their unmasked counterparts.
static ALWAYS_INLINE void override_clip_mask() {
  blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE);
}
    374 
// Restores masking to the blend stage, assuming it was previously overridden.
static ALWAYS_INLINE void restore_clip_mask() {
  blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key);
}
    379 
// A pointer to the start of the opaque destination region of the span for AA.
static const uint8_t* swgl_OpaqueStart = nullptr;
// The size, in bytes, of the opaque region.
static uint32_t swgl_OpaqueSize = 0;
// AA coverage distance offsets for the left and right edges.
static Float swgl_LeftAADist = 0.0f;
static Float swgl_RightAADist = 0.0f;
// AA coverage slope values used for accumulating coverage for each step.
static Float swgl_AASlope = 0.0f;
    389 
// Get the amount of pixels we need to process before the start of the opaque
// region. Clamped to 0 if buf is already inside the opaque region.
template <typename P>
static ALWAYS_INLINE int get_aa_opaque_start(P* buf) {
  return max(int((P*)swgl_OpaqueStart - buf), 0);
}
    396 
// Assuming we are already in the opaque part of the span, return the remaining
// size of the opaque part, clamped to 0 if buf has passed its end.
template <typename P>
static ALWAYS_INLINE int get_aa_opaque_size(P* buf) {
  return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0);
}
    403 
// Temporarily removes anti-aliasing from the blend stage, assuming the caller
// will handle it. Same fixed-offset key layout trick as override_clip_mask.
static ALWAYS_INLINE void override_aa() {
  blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE);
}
    409 
// Restores anti-aliasing to the blend stage, assuming it was previously
// overridden.
static ALWAYS_INLINE void restore_aa() {
  blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key);
}
    415 
// Blends a chunk of up to 4 source pixels against the packed destination
// pixels according to the global blend_key, returning the result in wide
// (16-bit per component) form for the caller to pack and store. MASK_ and
// AA_ key variants first modulate the source by the clip mask and/or edge
// coverage, then fall through to the shared blend math for the base key.
static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
                                            WideRGBA8 src, int span = 4) {
  WideRGBA8 dst = unpack(pdst);
  // Lane masks for isolating the color or alpha components of interleaved
  // wide pixels, and an all-opaque alpha constant.
  const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0,      0xFFFF, 0xFFFF,
                              0xFFFF, 0,      0xFFFF, 0xFFFF, 0xFFFF, 0,
                              0xFFFF, 0xFFFF, 0xFFFF, 0};
  const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
                                0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
  const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
                                  0, 0, 0, 255, 0, 0, 0, 255};

  // clang-format off
  // Computes AA for the given pixel based on the offset of the pixel within
  // destination row. Given the initial coverage offsets for the left and right
  // edges, the offset is scaled by the slope and accumulated to find the
  // minimum coverage value for the pixel. A final weight is generated that
  // can be used to scale the source pixel.
#define DO_AA(format, body)                                   \
  do {                                                        \
    int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \
    if (uint32_t(offset) >= swgl_OpaqueSize) {                \
      Float delta = swgl_AASlope * float(offset);             \
      Float dist = clamp(min(swgl_LeftAADist + delta.x,       \
                             swgl_RightAADist + delta.y),     \
                         0.0f, 256.0f);                       \
      auto aa = pack_pixels_##format(dist, 1.0f);             \
      body;                                                   \
    }                                                         \
  } while (0)

  // Each blend case is preceded by the MASK_ variant. The MASK_ case first
  // loads the mask values and multiplies the source value by them. After, it
  // falls through to the normal blending case using the masked source. The
  // AA_ variations may further precede the blend cases, in which case the
  // source value is further modified before use.
#define BLEND_CASE_KEY(key)                          \
  case AA_##key:                                     \
    DO_AA(RGBA8, src = muldiv256(src, aa));          \
    goto key;                                        \
  case AA_MASK_##key:                                \
    DO_AA(RGBA8, src = muldiv256(src, aa));          \
    FALLTHROUGH;                                     \
  case MASK_##key:                                   \
    src = muldiv255(src, load_clip_mask(buf, span)); \
    FALLTHROUGH;                                     \
  case key: key

#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))

  switch (blend_key) {
  BLEND_CASE(GL_ONE, GL_ZERO):
    return src;
  BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE,
                  GL_ONE_MINUS_SRC_ALPHA):
    // dst + src.a*(src.rgb1 - dst)
    // use addlow for signed overflow
    return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst));
  BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
    return src + dst - muldiv255(dst, alphas(src));
  BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
    return dst - muldiv255(dst, src);
  BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
    return dst - (muldiv255(dst, src) & RGB_MASK);
  BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
    return dst - muldiv255(dst, alphas(src));
  BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
    return muldiv255(src, dst);
  BLEND_CASE(GL_ONE, GL_ONE):
    return src + dst;
  BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
    return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
  BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
    // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
    return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
  BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
    // src*k + (1-src)*dst = src*k + dst -
    // src*dst = dst + src*(k - dst) use addlow
    // for signed overflow
    return addlow(
        dst, muldiv255(src, repeat2(ctx->blendcolor) - dst));

  // We must explicitly handle the masked/anti-aliased secondary blend case.
  // The secondary color as well as the source must be multiplied by the
  // weights.
  case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    return src + dst - secondary;
  }
  case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    WideRGBA8 mask = load_clip_mask(buf, span);
    return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
  }
  case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    DO_AA(RGBA8, {
      src = muldiv256(src, aa);
      secondary = muldiv256(secondary, aa);
    });
    return src + dst - secondary;
  }
  case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    WideRGBA8 mask = load_clip_mask(buf, span);
    DO_AA(RGBA8, mask = muldiv256(mask, aa));
    return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
  }

  BLEND_CASE(GL_MIN):
    return min(src, dst);
  BLEND_CASE(GL_MAX):
    return max(src, dst);

  // The KHR_blend_equation_advanced spec describes the blend equations such
  // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to
  // the result:
  //     Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
  //     Ar = As*Ad + As*(1-Ad) + Ad*(1-As)
  // However, working with unpremultiplied values requires expensive math to
  // unpremultiply and premultiply again during blending. We can use the fact
  // that premultiplied value P = C*A and simplify the equations such that no
  // unpremultiplied colors are necessary, allowing us to stay with integer
  // math that avoids floating-point conversions in the common case. Some of
  // the blend modes require division or sqrt, in which case we do convert
  // to (possibly transposed/unpacked) floating-point to implement the mode.
  // However, most common modes can still use cheaper premultiplied integer
  // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified
  // to:
  //     Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
  //     .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As
  //     Ar = As*Ad + As - As*Ad + Ad - Ad*As
  //     .. Ar = As + Ad - As*Ad
  // Note that the alpha equation is the same for all blend equations, such
  // that so long as the implementation results in As + Ad - As*Ad, we can
  // avoid using separate instructions to compute the alpha result, which is
  // dependent on the math used to implement each blend mode. The exact
  // reductions used to get the final math for every blend mode are too
  // involved to show here in comments, but mostly follows from replacing
  // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms
  // as possible.

  BLEND_CASE(GL_MULTIPLY_KHR): {
    WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK),
                               alphas(dst) - (dst & RGB_MASK));
    return src + dst + (diff & RGB_MASK) - alphas(diff);
  }
  BLEND_CASE(GL_SCREEN_KHR):
    return src + dst - muldiv255(src, dst);
  BLEND_CASE(GL_OVERLAY_KHR): {
    WideRGBA8 srcA = alphas(src);
    WideRGBA8 dstA = alphas(dst);
    WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
    return src + dst +
           if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff),
                        -diff);
  }
  BLEND_CASE(GL_DARKEN_KHR):
    return src + dst -
           max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
  BLEND_CASE(GL_LIGHTEN_KHR):
    return src + dst -
           min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));

  BLEND_CASE(GL_COLORDODGE_KHR): {
    // Color-dodge and color-burn require division, so we convert to FP math
    // here, but avoid transposing to a vec4.
    WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
    WideRGBA32F srcA = alphas(srcF);
    WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
    WideRGBA32F dstA = alphas(dstF);
    return pack_pixels_RGBA8(
        srcA * set_alphas(
                   min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)),
                   dstF) +
            srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
        1.0f / 255.0f);
  }
  BLEND_CASE(GL_COLORBURN_KHR): {
    WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
    WideRGBA32F srcA = alphas(srcF);
    WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
    WideRGBA32F dstA = alphas(dstF);
    return pack_pixels_RGBA8(
        srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA *
                                                recip_or(srcF, 255.0f))),
                          dstF) +
            srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
        1.0f / 255.0f);
  }
  BLEND_CASE(GL_HARDLIGHT_KHR): {
    WideRGBA8 srcA = alphas(src);
    WideRGBA8 dstA = alphas(dst);
    WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
    return src + dst +
           if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff),
                        -diff);
  }

  BLEND_CASE(GL_SOFTLIGHT_KHR): {
    // Soft-light requires an unpremultiply that can't be factored out as
    // well as a sqrt, so we convert to FP math here, but avoid transposing
    // to a vec4.
    WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
    WideRGBA32F srcA = alphas(srcF);
    WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
    WideRGBA32F dstA = alphas(dstF);
    WideRGBA32F dstU = unpremultiply(dstF);
    WideRGBA32F scale = srcF + srcF - srcA;
    return pack_pixels_RGBA8(
        dstF * (255.0f +
                set_alphas(
                    scale *
                        if_then_else(scale < 0.0f, 1.0f - dstU,
                                     min((16.0f * dstU - 12.0f) * dstU + 3.0f,
                                         inversesqrt(dstU) - 1.0f)),
                    WideRGBA32F(0.0f))) +
            srcF * (255.0f - dstA),
        1.0f / 255.0f);
  }
  BLEND_CASE(GL_DIFFERENCE_KHR): {
    WideRGBA8 diff =
        min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst)));
    return src + dst - diff - (diff & RGB_MASK);
  }
  BLEND_CASE(GL_EXCLUSION_KHR): {
    WideRGBA8 diff = muldiv255(src, dst);
    return src + dst - diff - (diff & RGB_MASK);
  }

  // The HSL blend modes are non-separable and require complicated use of
  // division. It is advantageous to convert to FP and transpose to vec4
  // math to more easily manipulate the individual color components.
#define DO_HSL(rgb)                                                            \
  do {                                                                         \
    vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));                           \
    vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));                           \
    Float srcA = srcV.w * (1.0f / 255.0f);                                     \
    Float dstA = dstV.w * (1.0f / 255.0f);                                     \
    Float srcDstA = srcV.w * dstA;                                             \
    vec3 srcC = vec3(srcV) * dstA;                                             \
    vec3 dstC = vec3(dstV) * srcA;                                             \
    return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \
                                  srcV.w + dstV.w - srcDstA),                  \
                             1.0f);                                            \
  } while (0)

  BLEND_CASE(GL_HSL_HUE_KHR):
    DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA));
  BLEND_CASE(GL_HSL_SATURATION_KHR):
    DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA));
  BLEND_CASE(GL_HSL_COLOR_KHR):
    DO_HSL(set_lum(srcC, dstC, srcDstA));
  BLEND_CASE(GL_HSL_LUMINOSITY_KHR):
    DO_HSL(set_lum(dstC, srcC, srcDstA));

  // SWGL-specific extended blend modes.
  BLEND_CASE(SWGL_BLEND_DROP_SHADOW): {
    // Premultiplied alpha over blend, but with source color set to source alpha
    // modulated with a constant color.
    WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8);
    return color + dst - muldiv255(dst, alphas(color));
  }

  BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT):
    // Premultiplied alpha over blend, but treats the source as a subpixel mask
    // modulated with a constant color.
    return applyColor(src, swgl_BlendColorRGBA8) + dst -
           muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8));

  default:
    UNREACHABLE;
    // return src;
  }

#undef BLEND_CASE
#undef BLEND_CASE_KEY
  // clang-format on
}
    702 
// Blends a span of single-channel R8 pixels against the destination buffer
// according to the global blend_key. Only the small set of blend-factor
// combinations below is supported for R8 targets; any other key is a bug
// (UNREACHABLE). buf points at the destination pixels and is used to load
// the clip mask for masked variants; dst and src are the unpacked
// destination and source spans; span is the number of valid pixels
// (defaults to a full chunk of 4).
//
// BLEND_CASE_KEY expands one blend key into four switch arms sharing a
// common tail: the AA_* arms first attenuate src by antialiasing coverage
// (DO_AA), the *MASK* arms then modulate src by the clip mask loaded from
// buf, and all arms end up in the plain `case key:` body. Note the trailing
// `case key: key` — it emits both the switch case and a same-named goto
// label so the AA-only arm can jump over the mask step directly to it.
static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src,
                                        int span = 4) {
 // clang-format off
#define BLEND_CASE_KEY(key)                          \
 case AA_##key:                                     \
   DO_AA(R8, src = muldiv256(src, aa));             \
   goto key;                                        \
 case AA_MASK_##key:                                \
   DO_AA(R8, src = muldiv256(src, aa));             \
   FALLTHROUGH;                                     \
 case MASK_##key:                                   \
   src = muldiv255(src, load_clip_mask(buf, span)); \
   FALLTHROUGH;                                     \
 case key: key

#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))

 switch (blend_key) {
 BLEND_CASE(GL_ONE, GL_ZERO):
   // No blending: source replaces the destination.
   return src;
 BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
   // Multiplicative blend: destination scaled by source.
   return muldiv255(src, dst);
 BLEND_CASE(GL_ONE, GL_ONE):
   // Additive blend.
   return src + dst;
 default:
   UNREACHABLE;
   // return src;
 }

#undef BLEND_CASE
#undef BLEND_CASE_KEY
 // clang-format on
}
    736 
    737 static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) {
    738  unaligned_store(buf, pack(r));
    739 }
    740 
    741 static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) {
    742  partial_store_span(buf, pack(r), len);
    743 }
    744 
    745 static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) {
    746  return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r);
    747 }
    748 
    749 static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) {
    750  return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len);
    751 }
    752 
    753 static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) {
    754  unaligned_store(buf, r);
    755 }
    756 
    757 static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) {
    758  partial_store_span(buf, r, len);
    759 }
    760 
    761 static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) {
    762  return pack(blend_span(buf, unpack(r)));
    763 }
    764 
    765 static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r,
    766                                            int len) {
    767  return pack(blend_span(buf, unpack(r), len));
    768 }
    769 
    770 static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) {
    771  unaligned_store(buf, pack(r));
    772 }
    773 
    774 static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) {
    775  partial_store_span(buf, pack(r), len);
    776 }
    777 
    778 static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) {
    779  return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r);
    780 }
    781 
    782 static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) {
    783  return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r,
    784                      len);
    785 }
    786 
    787 static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) {
    788  unaligned_store(buf, r);
    789 }
    790 
    791 static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) {
    792  partial_store_span(buf, r, len);
    793 }
    794 
    795 static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) {
    796  return pack(blend_span(buf, unpack(r)));
    797 }
    798 
    799 static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) {
    800  return pack(blend_span(buf, unpack(r), len));
    801 }
    802 
    803 template <bool BLEND, typename P, typename R>
    804 static ALWAYS_INLINE void commit_blend_span(P* buf, R r) {
    805  if (BLEND) {
    806    commit_span(buf, blend_span(buf, r));
    807  } else {
    808    commit_span(buf, r);
    809  }
    810 }
    811 
    812 template <bool BLEND, typename P, typename R>
    813 static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) {
    814  if (BLEND) {
    815    commit_span(buf, blend_span(buf, r, len), len);
    816  } else {
    817    commit_span(buf, r, len);
    818  }
    819 }
    820 
    821 template <typename P, typename R>
    822 static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) {
    823  for (P* end = &buf[len & ~3]; buf < end; buf += 4) {
    824    commit_span(buf, blend_span(buf, r));
    825  }
    826  len &= 3;
    827  if (len > 0) {
    828    partial_store_span(buf, pack(blend_span(buf, r, len)), len);
    829  }
    830 }
    831 
    832 template <bool BLEND>
    833 static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) {
    834  commit_blend_solid_span(buf, r, len);
    835 }
    836 
    837 template <>
    838 ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r,
    839                                            int len) {
    840  fill_n(buf, len, bit_cast<U32>(pack(r)).x);
    841 }
    842 
    843 template <bool BLEND>
    844 static void commit_solid_span(uint8_t* buf, WideR8 r, int len) {
    845  commit_blend_solid_span(buf, r, len);
    846 }
    847 
    848 template <>
    849 ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) {
    850  PackedR8 p = pack(r);
    851  if (uintptr_t(buf) & 3) {
    852    int align = 4 - (uintptr_t(buf) & 3);
    853    align = min(align, len);
    854    partial_store_span(buf, p, align);
    855    buf += align;
    856    len -= align;
    857  }
    858  fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p));
    859  buf += len & ~3;
    860  len &= 3;
    861  if (len > 0) {
    862    partial_store_span(buf, p, len);
    863  }
    864 }