tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

swgl_ext.h (133516B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 // When using a solid color with clip masking, the cost of loading the clip mask
      6 // in the blend stage exceeds the cost of processing the color. Here we handle
      7 // the entire span of clip mask texture before the blend stage to more
      8 // efficiently process it and modulate it with color without incurring blend
      9 // stage overheads.
     10 #include <cstdint>
     11 
     12 template <typename P, typename C>
     13 static void commit_masked_solid_span(P* buf, C color, int len) {
     14  override_clip_mask();
     15  uint8_t* mask = get_clip_mask(buf);
     16  for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) {
     17    commit_span(
     18        buf,
     19        blend_span(
     20            buf,
     21            applyColor(expand_mask(buf, unpack(unaligned_load<PackedR8>(mask))),
     22                       color)));
     23  }
     24  restore_clip_mask();
     25 }
     26 
     27 // When using a solid color with anti-aliasing, most of the solid span will not
     28 // benefit from anti-aliasing in the opaque region. We only want to apply the AA
     29 // blend stage in the non-opaque start and end of the span where AA is needed.
     30 template <typename P, typename R>
     31 static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) {
     32  if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) {
     33    commit_solid_span<true>(buf, r, start);
     34    buf += start;
     35    len -= start;
     36  }
     37  if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) {
     38    override_aa();
     39    commit_solid_span<true>(buf, r, opaque);
     40    restore_aa();
     41    buf += opaque;
     42    len -= opaque;
     43  }
     44  if (len > 0) {
     45    commit_solid_span<true>(buf, r, len);
     46  }
     47 }
     48 
     49 // Forces a value with vector run-class to have scalar run-class.
     50 template <typename T>
     51 static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
     52  return force_scalar(v);
     53 }
     54 
// Advance all varying interpolants by a single chunk so that subsequent reads
// of interpolated inputs observe the next chunk's values.
#define swgl_stepInterp() step_interp_inputs()

// Pseudo-intrinsic that accesses the interpolation step for a given varying,
// i.e. the per-chunk delta that swgl_stepInterp() applies to it.
#define swgl_interpStep(v) (interp_step.v)
     60 
// Commit an entire span of a solid color. This dispatches to clip-masked and
// anti-aliased fast-paths as appropriate: when blending is enabled, a clip
// mask selects the masked path, AA selects the AA path, and otherwise a plain
// blended solid span is used; with no blend key the span is written directly.
// The output pointer and remaining span length are advanced past the
// committed pixels.
#define swgl_commitSolid(format, v, n)                                   \
  do {                                                                   \
    int len = (n);                                                       \
    if (blend_key) {                                                     \
      if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {                        \
        commit_masked_solid_span(swgl_Out##format,                       \
                                 packColor(swgl_Out##format, (v)), len); \
      } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) {                   \
        commit_aa_solid_span(swgl_Out##format,                           \
                             pack_span(swgl_Out##format, (v)), len);     \
      } else {                                                           \
        commit_solid_span<true>(swgl_Out##format,                        \
                                pack_span(swgl_Out##format, (v)), len);  \
      }                                                                  \
    } else {                                                             \
      commit_solid_span<false>(swgl_Out##format,                         \
                               pack_span(swgl_Out##format, (v)), len);   \
    }                                                                    \
    swgl_Out##format += len;                                             \
    swgl_SpanLength -= len;                                              \
  } while (0)
// Full-span variants: consume the whole remaining span length.
#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength)
#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength)
// Partial variants: commit at most `len` pixels, clamped to what remains.
#define swgl_commitPartialSolidRGBA8(len, v) \
  swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength))
#define swgl_commitPartialSolidR8(len, v) \
  swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength))
     90 
// Commit a single chunk (swgl_StepSize pixels) of already-packed pixel data,
// applying the blend stage only when a blend key is active, then advance the
// output pointer and remaining span length by one step.
#define swgl_commitChunk(format, chunk)                 \
  do {                                                  \
    auto r = chunk;                                     \
    if (blend_key) r = blend_span(swgl_Out##format, r); \
    commit_span(swgl_Out##format, r);                   \
    swgl_Out##format += swgl_StepSize;                  \
    swgl_SpanLength -= swgl_StepSize;                   \
  } while (0)

// Commit a single chunk of a color, packing it to the destination format
// first.
#define swgl_commitColor(format, color) \
  swgl_commitChunk(format, pack_pixels_##format(color))
#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color)
#define swgl_commitColorR8(color) swgl_commitColor(R8, color)
    105 
    106 template <typename S>
    107 static ALWAYS_INLINE bool swgl_isTextureLinear(S s) {
    108  return s->filter == TextureFilter::LINEAR;
    109 }
    110 
    111 template <typename S>
    112 static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) {
    113  return s->format == TextureFormat::RGBA8;
    114 }
    115 
    116 template <typename S>
    117 static ALWAYS_INLINE bool swgl_isTextureR8(S s) {
    118  return s->format == TextureFormat::R8;
    119 }
    120 
// Use the default linear quantization scale of 128. This gives 7 bits of
// fractional precision, which when multiplied with a signed 9 bit value
// still fits in a 16 bit integer. The linear-filter helpers below rely on
// this value matching their hard-coded ">> 7" fraction shifts.
const int swgl_LinearQuantizeScale = 128;
    125 
    126 // Quantizes UVs for access into a linear texture.
    127 template <typename S, typename T>
    128 static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) {
    129  return linearQuantize(p, swgl_LinearQuantizeScale, s);
    130 }
    131 
    132 // Quantizes an interpolation step for UVs for access into a linear texture.
    133 template <typename S, typename T>
    134 static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) {
    135  return samplerScale(s, p) * swgl_LinearQuantizeScale;
    136 }
    137 
    138 template <typename S>
    139 static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf,
    140                                                     S sampler, ivec2 i) {
    141  return textureLinearUnpackedRGBA8(sampler, i);
    142 }
    143 
    144 template <typename S>
    145 static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf,
    146                                                  S sampler, ivec2 i) {
    147  return textureLinearUnpackedR8(sampler, i);
    148 }
    149 
    150 template <typename S>
    151 static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) {
    152  return swgl_isTextureRGBA8(s);
    153 }
    154 
    155 template <typename S>
    156 static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
    157  return swgl_isTextureR8(s);
    158 }
    159 
// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets
// for linear sampling. Note this macro introduces three locals into the
// calling scope: the per-chunk step `uv_step` and the clamped sampling bounds
// `min_uv`/`max_uv` derived from `uv_rect`; `uv` itself is rescaled in place.
#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv)     \
  uv = swgl_linearQuantize(sampler, uv);                                      \
  vec2_scalar uv_step =                                                       \
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};   \
  vec2_scalar min_uv = max(                                                   \
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \
  vec2_scalar max_uv =                                                        \
      max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}),    \
          min_uv);
    171 
    172 // Implements the fallback linear filter that can deal with clamping and
    173 // arbitrary scales.
    174 template <bool BLEND, typename S, typename C, typename P>
    175 static P* blendTextureLinearFallback(S sampler, vec2 uv, int span,
    176                                     vec2_scalar uv_step, vec2_scalar min_uv,
    177                                     vec2_scalar max_uv, C color, P* buf) {
    178  for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
    179    commit_blend_span<BLEND>(
    180        buf, applyColor(textureLinearUnpacked(buf, sampler,
    181                                              ivec2(clamp(uv, min_uv, max_uv))),
    182                        color));
    183  }
    184  return buf;
    185 }
    186 
    187 static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) {
    188  return bit_cast<U64>(r);
    189 }
    190 static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) {
    191  return bit_cast<U16>(r);
    192 }
    193 
    194 static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) {
    195  return r * fracx.xxxxyyyyzzzzwwww;
    196 }
    197 static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) {
    198  return r * fracx;
    199 }
    200 
// Implements a faster linear filter that works with axis-aligned constant Y but
// scales less than 1, i.e. upscaling. In this case we can optimize for the
// constant Y fraction as well as load all chunks from memory in a single tap
// for each row.
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
                                      vec2_scalar uv_step, vec2_scalar min_uv,
                                      vec2_scalar max_uv, C color, P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  // Quantized UVs carry 7 fractional bits (swgl_LinearQuantizeScale == 128);
  // split them into integer texel coordinates and fractional filter weights.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  // Y is constant across the span, so both source rows can be resolved once.
  P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x));
  P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
  I16 fracx = computeFracX(sampler, i, frac);
  int16_t fracy = computeFracY(frac).x;
  // Pre-blend the two rows vertically with the constant Y fraction.
  auto src0 =
      CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type);
  auto src1 =
      CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type);
  auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));

  // We attempt to sample ahead by one chunk and interpolate it with the current
  // one. However, due to the complication of upscaling, we may not necessarily
  // shift in all the next set of samples.
  for (P* end = buf + span; buf < end; buf += 4) {
    uv.x += uv_step.x;
    I32 ixn = cast(uv.x);
    I16 fracn = computeFracNoClamp(ixn);
    ixn >>= 7;
    auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]),
                         signed_unpacked_type);
    auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]),
                         signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));

    // Since we're upscaling, we know that a source pixel has a larger footprint
    // than the destination pixel, and thus all the source pixels needed for
    // this chunk will fall within a single chunk of texture data. However,
    // since the source pixels don't map 1:1 with destination pixels, we need to
    // shift the source pixels over based on their offset from the start of the
    // chunk. This could conceivably be optimized better with usage of PSHUFB or
    // VTBL instructions. However, since PSHUFB requires SSSE3, instead we
    // resort to masking in the correct pixels to avoid having to index into
    // memory. For the last sample to interpolate with, we need to potentially
    // shift in a sample from the next chunk over in the case the samples fill
    // out an entire chunk.
    auto shuf = src;
    auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
    // Duplicate lanes wherever adjacent destination pixels map to the same
    // source texel (the hallmark of upscaling).
    if (i.x.y == i.x.x) {
      shuf = shuf.xxyz;
      shufn = shufn.xxyz;
    }
    if (i.x.z == i.x.y) {
      shuf = shuf.xyyz;
      shufn = shufn.xyyz;
    }
    if (i.x.w == i.x.z) {
      shuf = shuf.xyzz;
      shufn = shufn.xyzz;
    }

    // Convert back to a signed unpacked type so that we can interpolate the
    // final result.
    auto interp = bit_cast<signed_unpacked_type>(shuf);
    auto interpn = bit_cast<signed_unpacked_type>(shufn);
    interp += applyFracX(interpn - interp, fracx) >> 7;

    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));

    // Carry this chunk's lookahead state into the next iteration.
    i.x = ixn;
    fracx = fracn;
    src = srcn;
  }
}
    280 
// This is the fastest variant of the linear filter that still provides
// filtering. In cases where there is no scaling required, but we have a
// subpixel offset that forces us to blend in neighboring pixels, we can
// optimize away most of the memory loads and shuffling that is required by the
// fallback filter.
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearFast(S sampler, vec2 uv, int span,
                                   vec2_scalar min_uv, vec2_scalar max_uv,
                                   C color, P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  // Split the quantized UVs (7 fractional bits) into integer texel
  // coordinates and fractional weights. With no scaling, both the X and Y
  // fractions stay constant for the whole span.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
  P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
  int16_t fracx = computeFracX(sampler, i, frac).x;
  int16_t fracy = computeFracY(frac).x;
  // Vertically pre-blend the first chunk of both rows with the constant Y
  // fraction.
  auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
  auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
  auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));

  // Since there is no scaling, we sample ahead by one chunk and interpolate it
  // with the current one. We can then reuse this value on the next iteration.
  for (P* end = buf + span; buf < end; buf += 4) {
    row0 += 4;
    row1 += 4;
    auto src0n =
        CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto src1n =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));

    // For the last sample to interpolate with, we need to potentially shift in
    // a sample from the next chunk over since the samples fill out an entire
    // chunk.
    auto interp = bit_cast<signed_unpacked_type>(src);
    auto interpn =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4));
    interp += ((interpn - interp) * fracx) >> 7;

    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));

    // The lookahead chunk becomes the current chunk on the next iteration.
    src = srcn;
  }
}
    330 
// Implements a faster linear filter that works with axis-aligned constant Y but
// downscaling the texture by half. In this case we can optimize for the
// constant X/Y fractions and reduction factor while minimizing shuffling.
template <bool BLEND, typename S, typename C, typename P>
static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span,
                                                  vec2_scalar min_uv,
                                                  vec2_scalar max_uv, C color,
                                                  P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  // Split the quantized UVs (7 fractional bits) into integer texel
  // coordinates and constant fractional weights; Y is fixed for the span.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
  P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
  int16_t fracx = computeFracX(sampler, i, frac).x;
  int16_t fracy = computeFracY(frac).x;

  // Each output chunk of 4 pixels consumes two source chunks (2x reduction).
  for (P* end = buf + span; buf < end; buf += 4) {
    // Vertically blend the first source chunk of both rows.
    auto src0 =
        CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto src1 =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
    row0 += 4;
    row1 += 4;
    // Vertically blend the second source chunk.
    auto src0n =
        CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto src1n =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
    row0 += 4;
    row1 += 4;

    // Deinterleave even/odd source pixels across both chunks, then blend the
    // pairs horizontally with the constant X fraction.
    auto interp =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6));
    auto interpn =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7));
    interp += ((interpn - interp) * fracx) >> 7;

    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));
  }
}
    377 
// Selects which specialized linear-filter implementation should process a
// span; chosen by needsTextureLinear() and consumed by the dispatchers below.
enum LinearFilter {
  // No linear filter is needed.
  LINEAR_FILTER_NEAREST = 0,
  // The most general linear filter that handles clamping and varying scales.
  LINEAR_FILTER_FALLBACK,
  // A linear filter optimized for axis-aligned upscaling.
  LINEAR_FILTER_UPSCALE,
  // A linear filter with no scaling but with subpixel offset.
  LINEAR_FILTER_FAST,
  // A linear filter optimized for 2x axis-aligned downscaling.
  LINEAR_FILTER_DOWNSCALE
};
    390 
// Dispatches to an appropriate linear filter depending on the selected filter.
// Splits the span into a clamped prologue, an unclamped interior handled by
// the fast/upscale/downscale specializations, and a clamped epilogue, each
// delegated to the matching blend routine. Returns the advanced destination
// pointer.
template <bool BLEND, typename S, typename C, typename P>
static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span,
                                     vec2_scalar uv_step, vec2_scalar min_uv,
                                     vec2_scalar max_uv, C color, P* buf,
                                     LinearFilter filter) {
  P* end = buf + span;
  if (filter != LINEAR_FILTER_FALLBACK) {
    // If we're not using the fallback, then Y is constant across the entire
    // row. We just need to ensure that we handle any samples that might pull
    // data from before the start of the row and require clamping.
    float beforeDist = max(0.0f, min_uv.x) - uv.x.x;
    if (beforeDist > 0) {
      // Round up to whole chunks and let the fallback handle the clamped head.
      int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0,
                         int(end - buf));
      buf = blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step,
                                              min_uv, max_uv, color, buf);
      uv.x += (before / swgl_StepSize) * uv_step.x;
    }
    // We need to check how many samples we can take from inside the row without
    // requiring clamping. In case the filter oversamples the row by a step, we
    // subtract off a step from the width to leave some room.
    float insideDist =
        min(max_uv.x, float((int(sampler->width) - swgl_StepSize) *
                            swgl_LinearQuantizeScale)) -
        uv.x.x;
    if (uv_step.x > 0.0f && insideDist >= uv_step.x) {
      int32_t inside = int(end - buf);
      if (filter == LINEAR_FILTER_DOWNSCALE) {
        // 2x downscale consumes two source pixels per output pixel, hence the
        // 0.5f factor when converting distance to output pixels.
        inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) &
                         ~(swgl_StepSize - 1),
                     inside);
        if (inside > 0) {
          blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv,
                                             max_uv, color, buf);
          buf += inside;
          uv.x += (inside / swgl_StepSize) * uv_step.x;
        }
      } else if (filter == LINEAR_FILTER_UPSCALE) {
        inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside);
        if (inside > 0) {
          blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv,
                                           max_uv, color, buf);
          buf += inside;
          uv.x += (inside / swgl_StepSize) * uv_step.x;
        }
      } else {
        // LINEAR_FILTER_FAST: 1:1 stepping, one source pixel per output pixel.
        inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) &
                         ~(swgl_StepSize - 1),
                     inside);
        if (inside > 0) {
          blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv,
                                        color, buf);
          buf += inside;
          uv.x += (inside / swgl_StepSize) * uv_step.x;
        }
      }
    }
  }
  // If the fallback filter was requested, or if there are any samples left that
  // may be outside the row and require clamping, then handle that here.
  if (buf < end) {
    buf = blendTextureLinearFallback<BLEND>(
        sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf);
  }
  return buf;
}
    458 
// Helper function to quantize UVs for linear filtering before dispatch.
// Returns the number of pixels drawn: 0 when the sampler's format does not
// match the destination buffer, otherwise the full span.
template <bool BLEND, typename S, typename C, typename P>
static inline int blendTextureLinear(S sampler, vec2 uv, int span,
                                     const vec4_scalar& uv_rect, C color,
                                     P* buf, LinearFilter filter) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  // Declares uv_step/min_uv/max_uv locals and rescales uv in place.
  LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
  blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv, max_uv,
                                    color, buf, filter);
  return span;
}
    472 
// Samples an axis-aligned span on a single row of a texture using 1:1
// nearest filtering. Sampling is constrained to only fall within the given UV
// bounds. This requires a pointer to the destination buffer. An optional color
// modulus can be supplied. Returns the number of pixels drawn: 0 when the
// sampler's format does not match the destination, otherwise the full span.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureNearestFast(S sampler, vec2 uv, int span,
                                   const vec4_scalar& uv_rect, C color,
                                   P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }

  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;

  // Convert the start UV and the uv_rect corners into integer texel
  // coordinates in the sampler's scale.
  ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv)));
  ivec2_scalar minUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}));
  ivec2_scalar maxUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}));

  // Calculate the row pointer within the buffer, clamping to within valid row
  // bounds.
  P* row =
      &((P*)sampler
            ->buf)[clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) *
                   sampler->stride];
  // Find clamped X bounds within the row.
  int minX = clamp(minUV.x, 0, sampler->width - 1);
  int maxX = clamp(maxUV.x, minX, sampler->width - 1);
  int curX = i.x;
  int endX = i.x + span;
  // If we need to start sampling below the valid sample bounds, then we need to
  // fill this section with a constant clamped sample.
  if (curX < minX) {
    int n = min(minX, endX) - curX;
    auto src =
        applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color);
    commit_solid_span<BLEND>(buf, src, n);
    buf += n;
    curX += n;
  }
  // Here we only deal with valid samples within the sample bounds. No clamping
  // should occur here within these inner loops.
  int n = max(min(maxX + 1, endX) - curX, 0);
  // Try to process as many chunks as possible with full loads and stores.
  for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
    auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color);
    commit_blend_span<BLEND>(buf, src);
  }
  n &= 3;
  // If we have any leftover samples after processing chunks, use partial loads
  // and stores.
  if (n > 0) {
    auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color);
    commit_blend_span<BLEND>(buf, src, n);
    buf += n;
    curX += n;
  }
  // If we still have samples left above the valid sample bounds, then we again
  // need to fill this section with a constant clamped sample.
  if (curX < endX) {
    auto src =
        applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color);
    commit_solid_span<BLEND>(buf, src, endX - curX);
  }
  return span;
}
    540 
    541 // We need to verify that the pixel step reasonably approximates stepping by a
    542 // single texel for every pixel we need to reproduce. Try to ensure that the
    543 // margin of error is no more than approximately 2^-7. Also, we check here if
    544 // the scaling can be quantized for acceleration.
    545 template <typename T>
    546 static ALWAYS_INLINE int spanNeedsScale(int span, T P) {
    547  span &= ~(128 - 1);
    548  span += 128;
    549  int scaled = round((P.x.y - P.x.x) * span);
    550  return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0;
    551 }
    552 
    553 // Helper function to decide whether we can safely apply 1:1 nearest filtering
    554 // without diverging too much from the linear filter.
    555 template <typename S, typename T>
    556 static inline LinearFilter needsTextureLinear(S sampler, T P, int span) {
    557  // If each row is not wide enough for linear filtering, then just use nearest
    558  // filtering.
    559  if (sampler->width < 2) {
    560    return LINEAR_FILTER_NEAREST;
    561  }
    562  // First verify if the row Y doesn't change across samples
    563  if (P.y.x != P.y.y) {
    564    return LINEAR_FILTER_FALLBACK;
    565  }
    566  P = samplerScale(sampler, P);
    567  if (int scale = spanNeedsScale(span, P)) {
    568    // If the source region is not flipped and smaller than the destination,
    569    // then we can use the upscaling filter since row Y is constant.
    570    return P.x.x < P.x.y && P.x.y - P.x.x <= 1
    571               ? LINEAR_FILTER_UPSCALE
    572               : (scale == 2 ? LINEAR_FILTER_DOWNSCALE
    573                             : LINEAR_FILTER_FALLBACK);
    574  }
    575  // Also verify that we're reasonably close to the center of a texel
    576  // so that it doesn't look that much different than if a linear filter
    577  // was used.
    578  if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 ||
    579      (int(P.y.x * 4.0f + 0.5f) & 3) != 2) {
    580    // The source and destination regions are the same, but there is a
    581    // significant subpixel offset. We can use a faster linear filter to deal
    582    // with the offset in this case.
    583    return LINEAR_FILTER_FAST;
    584  }
    585  // Otherwise, we have a constant 1:1 step and we're stepping reasonably close
    586  // to the center of each pixel, so it's safe to disable the linear filter and
    587  // use nearest.
    588  return LINEAR_FILTER_NEAREST;
    589 }
    590 
// Commit an entire span with linear filtering. Chooses between the linear
// dispatch path and the 1:1 nearest fast path via needsTextureLinear(), with
// blended and unblended instantiations selected by blend_key. Advances the
// output pointer and remaining span length by the number of pixels drawn
// (which is 0 when the sampler format does not match the destination).
#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n)              \
  do {                                                                         \
    auto packed_color = packColor(swgl_Out##format, color);                    \
    int len = (n);                                                             \
    int drawn = 0;                                                             \
    if (LinearFilter filter = needsTextureLinear(s, p, len)) {                 \
      if (blend_key) {                                                         \
        drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color,     \
                                         swgl_Out##format, filter);            \
      } else {                                                                 \
        drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color,    \
                                          swgl_Out##format, filter);           \
      }                                                                        \
    } else if (blend_key) {                                                    \
      drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color,  \
                                            swgl_Out##format);                 \
    } else {                                                                   \
      drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \
                                             swgl_Out##format);                \
    }                                                                          \
    swgl_Out##format += drawn;                                                 \
    swgl_SpanLength -= drawn;                                                  \
  } while (0)
#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \
  swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength)
#define swgl_commitTextureLinearR8(s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength)

// Commit a partial span with linear filtering, optionally inverting the color
#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(),      \
                           min(int(len), swgl_SpanLength))
#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(),        \
                           min(int(len), swgl_SpanLength))

// Commit an entire span with linear filtering that is scaled by a color
#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \
  swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength)
#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength)
    633 
    634 // Helper function that samples from an R8 texture while expanding it to support
    635 // a differing framebuffer format.
    636 template <bool BLEND, typename S, typename C, typename P>
    637 static inline int blendTextureLinearR8(S sampler, vec2 uv, int span,
    638                                       const vec4_scalar& uv_rect, C color,
    639                                       P* buf) {
    640  if (!swgl_isTextureR8(sampler) || sampler->width < 2) {
    641    return 0;
    642  }
    643  LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
    644  for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
    645    commit_blend_span<BLEND>(
    646        buf, applyColor(expand_mask(buf, textureLinearUnpackedR8(
    647                                             sampler,
    648                                             ivec2(clamp(uv, min_uv, max_uv)))),
    649                        color));
    650  }
    651  return span;
    652 }
    653 
// Commit an entire span with linear filtering while expanding from R8 to RGBA8.
// blend_key selects between the blended and opaque span variants. The helper
// returns how many pixels it drew (0 if the sampler was unsupported), and the
// output pointer/remaining span are advanced by that amount so a fallback can
// handle any remainder.
#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color)      \
  do {                                                                    \
    auto packed_color = packColor(swgl_OutRGBA8, color);                  \
    int drawn = 0;                                                        \
    if (blend_key) {                                                      \
      drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect,  \
                                         packed_color, swgl_OutRGBA8);    \
    } else {                                                              \
      drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \
                                          packed_color, swgl_OutRGBA8);   \
    }                                                                     \
    swgl_OutRGBA8 += drawn;                                               \
    swgl_SpanLength -= drawn;                                             \
  } while (0)
#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \
  swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor())
    671 
    672 // Compute repeating UVs, possibly constrained by tile repeat limits
    673 static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) {
    674  if (tile_repeat.x > 0.0f) {
    675    // Clamp to a number slightly less than the tile repeat limit so that
    676    // it results in a number close to but not equal to 1 after fract().
    677    // This avoids fract() yielding 0 if the limit was left as whole integer.
    678    uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f);
    679  }
    680  return fract(uv);
    681 }
    682 
    683 // Compute the number of non-repeating steps before we need to potentially
    684 // repeat the UVs.
    685 static inline int computeNoRepeatSteps(Float uv, float uv_step,
    686                                       float tile_repeat, int steps) {
    687  if (uv.w < uv.x) {
    688    // Ensure the UV taps are ordered low to high.
    689    uv = uv.wzyx;
    690  }
    691  // Check if the samples cross the boundary of the next whole integer or the
    692  // tile repeat limit, whichever is lower.
    693  float limit = floor(uv.x) + 1.0f;
    694  if (tile_repeat > 0.0f) {
    695    limit = min(limit, tile_repeat);
    696  }
    697  return uv.x >= 0.0f && uv.w < limit
    698             ? (uv_step != 0.0f
    699                    ? int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps)))
    700                    : steps)
    701             : 0;
    702 }
    703 
// Blends an entire span of texture with linear filtering and repeating UVs.
// UVs are stepped in unnormalized repeat space and wrapped with fract(); the
// uv_repeat rect maps wrapped UVs into the texture, and uv_rect clamps the
// final sample coordinates. Returns span on success, 0 if the sampler does
// not match the framebuffer format.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureLinearRepeat(S sampler, vec2 uv, int span,
                                    const vec2_scalar& tile_repeat,
                                    const vec4_scalar& uv_repeat,
                                    const vec4_scalar& uv_rect, C color,
                                    P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  // Linear map from the wrapped [0,1) UV domain into the repeat rect.
  vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y};
  vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y};
  // Choose a linear filter to use for no-repeat sub-spans
  LinearFilter filter =
      needsTextureLinear(sampler, uv * uv_scale + uv_offset, span);
  // We need to step UVs unscaled and unquantized so that we can modulo them
  // with fract. We use uv_scale and uv_offset to map them into the correct
  // range.
  vec2_scalar uv_step =
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
  uv_scale = swgl_linearQuantizeStep(sampler, uv_scale);
  uv_offset = swgl_linearQuantize(sampler, uv_offset);
  vec2_scalar min_uv = max(
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f);
  vec2_scalar max_uv = max(
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv);
  for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
    int steps = int(end - buf) / swgl_StepSize;
    // Find the sub-span before UVs repeat to avoid expensive repeat math
    steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
    if (steps > 0) {
      steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
      if (steps > 0) {
        // Both axes are repeat-free for `steps` chunks; hand the sub-span to
        // the fast dispatch path with a single fract() applied up front.
        buf = blendTextureLinearDispatch<BLEND>(
            sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize,
            uv_step * uv_scale, min_uv, max_uv, color, buf, filter);
        if (buf >= end) {
          break;
        }
        uv += steps * uv_step;
      }
    }
    // UVs might repeat within this step, so explicitly compute repeated UVs
    vec2 repeated_uv = clamp(
        tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv);
    commit_blend_span<BLEND>(
        buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)),
                        color));
  }
  return span;
}
    755 
// Commit an entire span with linear filtering and repeating UVs. blend_key
// selects between the blended and opaque variants; the output pointer and
// remaining span are advanced by however many pixels were actually drawn
// (0 if the sampler/framebuffer formats did not match).
#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat,   \
                                       uv_rect, color)                         \
  do {                                                                         \
    auto packed_color = packColor(swgl_Out##format, color);                    \
    int drawn = 0;                                                             \
    if (blend_key) {                                                           \
      drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength,            \
                                             tile_repeat, uv_repeat, uv_rect,  \
                                             packed_color, swgl_Out##format);  \
    } else {                                                                   \
      drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength,           \
                                              tile_repeat, uv_repeat, uv_rect, \
                                              packed_color, swgl_Out##format); \
    }                                                                          \
    swgl_Out##format += drawn;                                                 \
    swgl_SpanLength -= drawn;                                                  \
  } while (0)
#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat,      \
                                            uv_rect)                           \
  swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
                                 NoColor())
#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \
                                                 uv_rect, color)               \
  swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
                                 color)
    782 
    783 template <typename S>
    784 static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf,
    785                                                      S sampler, ivec2 i) {
    786  return textureNearestPackedRGBA8(sampler, i);
    787 }
    788 
// Blends an entire span of texture with nearest filtering and either
// repeated or clamped UVs (selected by the REPEAT template parameter).
// Returns span on success, 0 if the sampler does not match the framebuffer
// format. When the whole span resolves to a single texel, it is committed as
// a solid span instead of sampling per step.
template <bool BLEND, bool REPEAT, typename S, typename C, typename P>
static int blendTextureNearestRepeat(S sampler, vec2 uv, int span,
                                     const vec2_scalar& tile_repeat,
                                     const vec4_scalar& uv_rect, C color,
                                     P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  if (!REPEAT) {
    // If clamping, then we step pre-scaled to the sampler. For repeat modes,
    // this will be accomplished via uv_scale instead.
    uv = samplerScale(sampler, uv);
  }
  // Per-chunk UV increment derived from the first two sample taps.
  vec2_scalar uv_step =
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
  vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y});
  vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w});
  vec2_scalar uv_scale = max_uv - min_uv;
  // If the effective sampling area of this texture is only a single pixel, then
  // treat it as a solid span. For repeat modes, the bounds are specified on
  // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so
  // the test varies depending on which. If the sample range on an axis is
  // greater than one pixel, we can still check if we don't move far enough from
  // the pixel center on that axis to hit the next pixel.
  if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) ||
       (abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) &&
      (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) ||
       (abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) {
    // Single-texel fast path: sample once and flood the whole span.
    vec2 repeated_uv = REPEAT
                           ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
                           : clamp(uv, min_uv, max_uv);
    commit_solid_span<BLEND>(buf,
                             applyColor(unpack(textureNearestPacked(
                                            buf, sampler, ivec2(repeated_uv))),
                                        color),
                             span);
  } else {
    for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
      if (REPEAT) {
        int steps = int(end - buf) / swgl_StepSize;
        // Find the sub-span before UVs repeat to avoid expensive repeat math
        steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
        if (steps > 0) {
          steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
          if (steps > 0) {
            // Both axes stay repeat-free for `steps` chunks: wrap once with
            // fract() and step linearly inside the repeat rect.
            vec2 inside_uv = fract(uv) * uv_scale + min_uv;
            vec2 inside_step = uv_step * uv_scale;
            for (P* outside = &buf[steps * swgl_StepSize]; buf < outside;
                 buf += swgl_StepSize, inside_uv += inside_step) {
              commit_blend_span<BLEND>(
                  buf, applyColor(
                           textureNearestPacked(buf, sampler, ivec2(inside_uv)),
                           color));
            }
            if (buf >= end) {
              break;
            }
            uv += steps * uv_step;
          }
        }
      }

      // UVs might repeat within this step, so explicitly compute repeated UVs
      vec2 repeated_uv = REPEAT
                             ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
                             : clamp(uv, min_uv, max_uv);
      commit_blend_span<BLEND>(
          buf,
          applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)),
                     color));
    }
  }
  return span;
}
    865 
    866 // Determine if we can use the fast nearest filter for the given nearest mode.
    867 // If the Y coordinate varies more than half a pixel over
    868 // the span (which might cause the texel to alias to the next one), or the span
    869 // needs X scaling, then we have to use the fallback.
    870 template <typename S, typename T>
    871 static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) {
    872  P = samplerScale(sampler, P);
    873  return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P);
    874 }
    875 
// Commit an entire span with nearest filtering and either clamped or repeating
// UVs. The fast path is used unless the span needs scaling or risks row
// aliasing, in which case the clamped (REPEAT=false) fallback handles it.
// Output pointer and remaining span advance by the number of pixels drawn.
#define swgl_commitTextureNearest(format, s, p, uv_rect, color)               \
  do {                                                                        \
    auto packed_color = packColor(swgl_Out##format, color);                   \
    int drawn = 0;                                                            \
    if (needsNearestFallback(s, p, swgl_SpanLength)) {                        \
      if (blend_key) {                                                        \
        drawn = blendTextureNearestRepeat<true, false>(                       \
            s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color,               \
            swgl_Out##format);                                                \
      } else {                                                                \
        drawn = blendTextureNearestRepeat<false, false>(                      \
            s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color,               \
            swgl_Out##format);                                                \
      }                                                                       \
    } else if (blend_key) {                                                   \
      drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect,   \
                                            packed_color, swgl_Out##format);  \
    } else {                                                                  \
      drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect,  \
                                             packed_color, swgl_Out##format); \
    }                                                                         \
    swgl_Out##format += drawn;                                                \
    swgl_SpanLength -= drawn;                                                 \
  } while (0)
#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \
  swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor())
#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \
  swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color)
    906 
// Commit an entire span with nearest filtering and repeating UVs. The uv_rect
// argument here supplies the sampling bounds passed through to
// blendTextureNearestRepeat.
#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \
                                        color)                              \
  do {                                                                      \
    auto packed_color = packColor(swgl_Out##format, color);                 \
    int drawn = 0;                                                          \
    if (blend_key) {                                                        \
      drawn = blendTextureNearestRepeat<true, true>(                        \
          s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color,        \
          swgl_Out##format);                                                \
    } else {                                                                \
      drawn = blendTextureNearestRepeat<false, true>(                       \
          s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color,        \
          swgl_Out##format);                                                \
    }                                                                       \
    swgl_Out##format += drawn;                                              \
    swgl_SpanLength -= drawn;                                               \
  } while (0)
// NOTE(review): these wrappers accept an uv_rect parameter but do not forward
// it — uv_repeat is passed as the sampling bounds instead, since for repeat
// mode the repeat rect defines the sample area. Presumably intentional, but
// verify against the callers before relying on uv_rect here.
#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \
                                             uv_rect)                      \
  swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat,     \
                                  NoColor())
#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat,         \
                                                  uv_repeat, uv_rect, color) \
  swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color)
    931 
// Commit an entire span of texture with filtering determined by sampler state.
// Dispatches to the Linear or Nearest variant of the named commit macro based
// on the sampler's filter setting.
#define swgl_commitTexture(format, s, ...)               \
  do {                                                   \
    if (s->filter == TextureFilter::LINEAR) {            \
      swgl_commitTextureLinear##format(s, __VA_ARGS__);  \
    } else {                                             \
      swgl_commitTextureNearest##format(s, __VA_ARGS__); \
    }                                                    \
  } while (0)
#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__)
#define swgl_commitTextureColorRGBA8(...) \
  swgl_commitTexture(ColorRGBA8, __VA_ARGS__)
#define swgl_commitTextureRepeatRGBA8(...) \
  swgl_commitTexture(RepeatRGBA8, __VA_ARGS__)
#define swgl_commitTextureRepeatColorRGBA8(...) \
  swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__)
    948 
// Commit an entire span of a separable pass of a Gaussian blur that falls
// within the given radius scaled by supplied coefficients, clamped to uv_rect
// bounds. Processes whole swgl_StepSize chunks only and returns the number of
// pixels actually drawn, which may be less than span (the caller handles any
// remainder).
template <bool BLEND, typename S, typename P>
static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect,
                             P* buf, int span, bool hori, int radius,
                             vec2_scalar coeffs) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  // Convert normalized UVs and bounds into integer texel coordinates.
  vec2_scalar size = {float(sampler->width), float(sampler->height)};
  ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size);
  ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size));
  int startX = curUV.x;
  // Stop at the bounds, the requested span, or the texture edge, whichever
  // comes first.
  int endX = min(min(bounds.z, curUV.x + span), int(size.x));
  if (hori) {
    for (; curUV.x + swgl_StepSize <= endX;
         buf += swgl_StepSize, curUV.x += swgl_StepSize) {
      commit_blend_span<BLEND>(
          buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z,
                                         radius, coeffs.x, coeffs.y));
    }
  } else {
    for (; curUV.x + swgl_StepSize <= endX;
         buf += swgl_StepSize, curUV.x += swgl_StepSize) {
      commit_blend_span<BLEND>(
          buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w,
                                       radius, coeffs.x, coeffs.y));
    }
  }
  // Pixels drawn equals how far the X coordinate advanced.
  return curUV.x - startX;
}
    981 
// Commit a separable Gaussian blur pass. blend_key selects the blended or
// opaque variant; drawn may be less than the span, leaving a remainder for
// the caller's fallback path.
#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs)   \
  do {                                                                         \
    int drawn = 0;                                                             \
    if (blend_key) {                                                           \
      drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format,         \
                                      swgl_SpanLength, hori, radius, coeffs);  \
    } else {                                                                   \
      drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format,        \
                                       swgl_SpanLength, hori, radius, coeffs); \
    }                                                                          \
    swgl_Out##format += drawn;                                                 \
    swgl_SpanLength -= drawn;                                                  \
  } while (0)
#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \
  swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs)
#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \
  swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs)
    999 
   1000 // Convert and pack planar YUV samples to RGB output using a color space
   1001 static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr,
   1002                                            U16 y, U16 u, U16 v) {
   1003  auto yy = V8<int16_t>(zip(y, y));
   1004  auto uv = V8<int16_t>(zip(u, v));
   1005  return rgb_from_ycbcr.convert(yy, uv);
   1006 }
   1007 
   1008 // Helper functions to sample from planar YUV textures before converting to RGB
   1009 template <typename S0>
   1010 static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0,
   1011                                           const YUVMatrix& rgb_from_ycbcr,
   1012                                           UNUSED int rescaleFactor) {
   1013  switch (sampler0->format) {
   1014    case TextureFormat::RGBA8: {
   1015      auto planar = textureLinearPlanarRGBA8(sampler0, uv0);
   1016      return convertYUV(rgb_from_ycbcr, highHalf(planar.rg), lowHalf(planar.rg),
   1017                        lowHalf(planar.ba));
   1018    }
   1019    case TextureFormat::YUY2: {
   1020      auto planar = textureLinearPlanarYUY2(sampler0, uv0);
   1021      return convertYUV(rgb_from_ycbcr, planar.y, planar.u, planar.v);
   1022    }
   1023    default:
   1024      assert(false);
   1025      return PackedRGBA8(0);
   1026  }
   1027 }
   1028 
// Blends a span sampled from a single interleaved YUV texture, converting to
// RGB via the supplied matrix and optionally modulating by color. Returns
// span on success, 0 if the sampler does not support linear sampling.
template <bool BLEND, typename S0, typename P, typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  if (!swgl_isTextureLinear(sampler0)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  // Fold the bias into the conversion matrix once, outside the loop.
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  auto c = packColor(buf, color);
  auto* end = buf + span;
  for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  rgb_from_ycbcr, rescaleFactor),
                        c));
  }
  return span;
}
   1050 
// Helper to sample from a two-plane YUV layout (separate luma plane plus an
// interleaved chroma plane) before converting to RGB. The chroma sampler's
// format selects the layout; an unrecognized format asserts and yields
// transparent black.
template <typename S0, typename S1>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
                                           ivec2 uv1,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           int rescaleFactor) {
  switch (sampler1->format) {
    case TextureFormat::RG8: {
      // NV12-style: 8-bit luma with interleaved 8-bit U/V pairs.
      assert(sampler0->format == TextureFormat::R8);
      auto y = textureLinearUnpackedR8(sampler0, uv0);
      auto planar = textureLinearPlanarRG8(sampler1, uv1);
      return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.rg),
                        highHalf(planar.rg));
    }
    case TextureFormat::RGBA8: {
      assert(sampler0->format == TextureFormat::R8);
      auto y = textureLinearUnpackedR8(sampler0, uv0);
      auto planar = textureLinearPlanarRGBA8(sampler1, uv1);
      return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.ba),
                        highHalf(planar.rg));
    }
    case TextureFormat::RG16: {
      assert(sampler0->format == TextureFormat::R16);
      // The rescaling factor represents how many bits to add to renormalize the
      // texture to 16 bits, and so the color depth is actually 16 minus the
      // rescaling factor.
      // Need to right shift the sample by the amount of bits over 8 it
      // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
      // of precision at the low end already, hence 1 is subtracted from the
      // color depth.
      int colorDepth = 16 - rescaleFactor;
      int rescaleBits = (colorDepth - 1) - 8;
      auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
      auto uv = textureLinearUnpackedRG16(sampler1, uv1) >> rescaleBits;
      return rgb_from_ycbcr.convert(zip(y, y), uv);
    }
    default:
      assert(false);
      return PackedRGBA8(0);
  }
}
   1091 
// Blends a span sampled from a two-plane YUV layout (luma plane plus
// interleaved chroma plane), converting to RGB and optionally modulating by
// color. Returns span on success, 0 if either sampler does not support
// linear sampling.
template <bool BLEND, typename S0, typename S1, typename P,
          typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
                    const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) {
    return 0;
  }
  // Each plane gets its own quantized UVs, step, and clamp bounds.
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  // Fold the bias into the conversion matrix once, outside the loop.
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  auto c = packColor(buf, color);
  auto* end = buf + span;
  for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                                  rgb_from_ycbcr, rescaleFactor),
                        c));
  }
  return span;
}
   1117 
// Helper to sample from a fully planar YUV layout (separate Y, U, and V
// planes) before converting to RGB. All three planes must share the same
// format (R8 or R16); anything else asserts and yields transparent black.
template <typename S0, typename S1, typename S2>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
                                           ivec2 uv1, S2 sampler2, ivec2 uv2,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           int rescaleFactor) {
  assert(sampler0->format == sampler1->format &&
         sampler0->format == sampler2->format);
  switch (sampler0->format) {
    case TextureFormat::R8: {
      auto y = textureLinearUnpackedR8(sampler0, uv0);
      auto u = textureLinearUnpackedR8(sampler1, uv1);
      auto v = textureLinearUnpackedR8(sampler2, uv2);
      return convertYUV(rgb_from_ycbcr, y, u, v);
    }
    case TextureFormat::R16: {
      // The rescaling factor represents how many bits to add to renormalize the
      // texture to 16 bits, and so the color depth is actually 16 minus the
      // rescaling factor.
      // Need to right shift the sample by the amount of bits over 8 it
      // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
      // of precision at the low end already, hence 1 is subtracted from the
      // color depth.
      int colorDepth = 16 - rescaleFactor;
      int rescaleBits = (colorDepth - 1) - 8;
      auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
      auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits;
      auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits;
      return convertYUV(rgb_from_ycbcr, U16(y), U16(u), U16(v));
    }
    default:
      assert(false);
      return PackedRGBA8(0);
  }
}
   1152 
// Fallback helper for when we can't specifically accelerate YUV with
// composition. Steps all three planes' UVs in lockstep, sampling, converting,
// and blending one chunk per iteration. The UVs, steps, and bounds are
// expected to be pre-quantized by the caller.
template <bool BLEND, typename S0, typename S1, typename S2, typename P,
          typename C>
static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0,
                             vec2_scalar uv_step0, vec2_scalar min_uv0,
                             vec2_scalar max_uv0, S1 sampler1, vec2 uv1,
                             vec2_scalar uv_step1, vec2_scalar min_uv1,
                             vec2_scalar max_uv1, S2 sampler2, vec2 uv2,
                             vec2_scalar uv_step2, vec2_scalar min_uv2,
                             vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias,
                             const mat3_scalar& rgb_from_debiased_ycbcr,
                             int rescaleFactor, C color) {
  // Fold the bias into the conversion matrix once, outside the loop.
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0,
             uv1 += uv_step1, uv2 += uv_step2) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                                  sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)),
                                  rgb_from_ycbcr, rescaleFactor),
                        color));
  }
}
   1178 
   1179 template <bool BLEND, typename S0, typename S1, typename S2, typename P,
   1180          typename C = NoColor>
   1181 static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
   1182                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
   1183                    const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2,
   1184                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
   1185                    const mat3_scalar& rgb_from_debiased_ycbcr,
   1186                    int rescaleFactor, C color = C()) {
   1187  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
   1188      !swgl_isTextureLinear(sampler2)) {
   1189    return 0;
   1190  }
   1191  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
   1192  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
   1193  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
   1194  auto c = packColor(buf, color);
   1195  blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0,
   1196                          sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2,
   1197                          uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
   1198                          rgb_from_debiased_ycbcr, rescaleFactor, c);
   1199  return span;
   1200 }
   1201 
// A variant of the blendYUV that attempts to reuse the inner loops from the
// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on
// the source data, which in turn allows it to be much faster than blendYUV.
// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer
// and that no color scaling is applied, which we can accomplish via template
// specialization. We need to further validate inside that texture formats
// and dimensions are sane for video and that the video is axis-aligned before
// acceleration can proceed.
template <bool BLEND>
static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, sampler2DRect sampler1,
                    vec2 uv1, const vec4_scalar& uv_rect1,
                    sampler2DRect sampler2, vec2 uv2,
                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, NoColor noColor = NoColor()) {
  // As with the generic variant, all planes must be linearly sampled or the
  // caller must take the slow path.
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
      !swgl_isTextureLinear(sampler2)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
  auto* end = buf + span;
  // CompositeYUV imposes further restrictions on the source textures, such
  // that the Y/U/V samplers must all have a matching format, the U/V samplers
  // must have matching sizes and sample coordinates, and there must be no
  // change in row across the entire span.
  if (sampler0->format == sampler1->format &&
      sampler1->format == sampler2->format &&
      sampler1->width == sampler2->width &&
      sampler1->height == sampler2->height && uv_step0.y == 0 &&
      uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 &&
      uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) {
    // CompositeYUV does not support a clamp rect, so we must take care to
    // advance till we're inside the bounds of the clamp rect.
    int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x,
                                   (min_uv1.x - uv1.x.x) / uv_step1.x))),
                      (end - buf) / swgl_StepSize);
    if (outside > 0) {
      // Process the chunks before the clamp rect with the generic fallback.
      blendYUVFallback<BLEND>(buf, outside * swgl_StepSize, sampler0, uv0,
                              uv_step0, min_uv0, max_uv0, sampler1, uv1,
                              uv_step1, min_uv1, max_uv1, sampler2, uv2,
                              uv_step2, min_uv2, max_uv2, ycbcr_bias,
                              rgb_from_debiased_ycbcr, rescaleFactor, noColor);
      buf += outside * swgl_StepSize;
      uv0.x += outside * uv_step0.x;
      uv1.x += outside * uv_step1.x;
      uv2.x += outside * uv_step2.x;
    }
    // Find the amount of chunks inside the clamp rect before we hit the
    // maximum. If there are any chunks inside, we can finally dispatch to
    // CompositeYUV.
    int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x,
                             (max_uv1.x - uv1.x.x) / uv_step1.x)),
                     (end - buf) / swgl_StepSize);
    if (inside > 0) {
      // We need the color depth, which is relative to the texture format and
      // rescale factor.
      int colorDepth =
          (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor;
      // Finally, call the inner loop of CompositeYUV.
      const auto rgb_from_ycbcr =
          YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
      linear_row_yuv<BLEND>(
          buf, inside * swgl_StepSize, sampler0, force_scalar(uv0),
          uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1),
          uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr);
      // Now that we're done, advance past the processed inside portion.
      buf += inside * swgl_StepSize;
      uv0.x += inside * uv_step0.x;
      uv1.x += inside * uv_step1.x;
      uv2.x += inside * uv_step2.x;
    }
  }
  // We either got here because we have some samples outside the clamp rect, or
  // because some of the preconditions were not satisfied. Process whatever is
  // left of the span.
  blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0,
                          max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1,
                          sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
                          rgb_from_debiased_ycbcr, rescaleFactor, noColor);
  return span;
}
   1286 
// Commit a single chunk of a YUV surface represented by multiple planar
// textures. This requires a color space specifier selecting how to convert
// from YUV to RGB output. In the case of HDR formats, a rescaling factor
// selects how many bits of precision must be utilized on conversion. See the
// sampleYUV dispatcher functions for the various supported plane
// configurations this intrinsic accepts.
#define swgl_commitTextureLinearYUV(...)                                    \
  do {                                                                      \
    int drawn = 0;                                                          \
    if (blend_key) {                                                        \
      drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__);  \
    } else {                                                                \
      drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
    }                                                                       \
    swgl_OutRGBA8 += drawn;                                                 \
    swgl_SpanLength -= drawn;                                               \
  } while (0)

// Commit a single chunk of a YUV surface scaled by a color. blendYUV accepts
// the color via its trailing argument, so this forwards unchanged.
#define swgl_commitTextureLinearColorYUV(...) \
  swgl_commitTextureLinearYUV(__VA_ARGS__)
   1308 
// Each gradient stops entry is a pair of RGBA32F start color and end step.
struct GradientStops {
  // Color at the start of this table entry, one float per channel.
  Float startColor;
  // Color delta applied across the entry. stepData aliases the same bytes as
  // a scalar vector so adjacent entries can be compared cheaply in can_merge.
  union {
    Float stepColor;
    vec4_scalar stepData;
  };

  // Whether this gradient entry can be merged with an adjacent entry. The
  // step will be equal with the adjacent step if and only if they can be
  // merged, or rather, that the stops are actually part of a single larger
  // gradient.
  bool can_merge(const GradientStops& next) const {
    return stepData == next.stepData;
  }

  // Get the interpolated color within the entry based on the offset from its
  // start.
  Float interpolate(float offset) const {
    return startColor + stepColor * offset;
  }

  // Get the end color of the entry where interpolation stops.
  Float end_color() const { return startColor + stepColor; }
};
   1334 
   1335 // Checks if a gradient table of the specified size exists at the UV coords of
   1336 // the address within an RGBA32F texture. If so, a linear address within the
   1337 // texture is returned that may be used to sample the gradient table later. If
   1338 // the address doesn't describe a valid gradient, then a negative value is
   1339 // returned.
   1340 static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address,
   1341                                        int entries) {
   1342  return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
   1343                 address.y < int(sampler->height) && address.x >= 0 &&
   1344                 address.x < int(sampler->width) && entries > 0 &&
   1345                 address.x +
   1346                         int(sizeof(GradientStops) / sizeof(Float)) * entries <=
   1347                     int(sampler->width)
   1348             ? address.y * sampler->stride + address.x * 4
   1349             : -1;
   1350 }
   1351 
   1352 static inline int swgl_validateGradientFromStops(sampler2D sampler,
   1353                                                 ivec2_scalar address,
   1354                                                 int entries) {
   1355  // 1px (4 floats per color stop).
   1356  int colors_size = entries;
   1357  // 4 stop offsets (4 floats) per px.
   1358  int stops_size = ((entries + 3) & ~3) / 4;
   1359  return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
   1360                 address.y < int(sampler->height) && address.x >= 0 &&
   1361                 address.x < int(sampler->width) && entries > 0 &&
   1362                 address.x + colors_size + stops_size <= int(sampler->width)
   1363             ? address.y * sampler->stride + address.x * 4
   1364             : -1;
   1365 }
   1366 
// Samples four gradient table entries (one per SIMD lane) at fractional entry
// indices, blending each pair of entry colors and packing the result to BGRA8.
static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
                                       Float entry) {
  assert(sampler->format == TextureFormat::RGBA32F);
  assert(address >= 0 && address < int(sampler->height * sampler->stride));
  // Get the integer portion of the entry index to find the entry colors.
  I32 index = cast(entry);
  // Use the fractional portion of the entry index to control blending between
  // entry colors.
  Float offset = entry - cast(index);
  // Every entry is a pair of colors blended by the fractional offset.
  assert(test_all(index >= 0 &&
                  index * int(sizeof(GradientStops) / sizeof(Float)) <
                      int(sampler->width)));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // Blend between the colors for each SIMD lane, then pack them to RGBA8
  // result. Since the layout of the RGBA8 framebuffer is actually BGRA while
  // the gradient table has RGBA colors, swizzling (.zyxw) is required.
  return combine(
      packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw),
                round_pixel(stops[index.y].interpolate(offset.y).zyxw)),
      packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw),
                round_pixel(stops[index.w].interpolate(offset.w).zyxw)));
}
   1390 
// Samples a gradient entry from the gradient at the provided linearized
// address. The integer portion of the entry index is used to find the entry
// within the table whereas the fractional portion is used to blend between
// adjacent table entries.
#define swgl_commitGradientRGBA8(sampler, address, entry) \
  swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry))

// Variant that allows specifying a color multiplier of the gradient result.
#define swgl_commitGradientColorRGBA8(sampler, address, entry, color)         \
  swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \
                                     packColor(swgl_OutRGBA, color)))
   1402 
// Precomputed noise for adding directly to four horizontally contiguous pixels
// TODO: These should be updated for parity with the shader dither
// implementation once something more final exists there. Right now, these are
// very close but slightly off.
// Indexed as an 8x8 grid of chunks: getDitherNoise selects a row of 8 entries
// by (fragCoordY & 7), and dither selects within it by (fragCoordX & 7). Each
// entry holds noise for 4 pixels of 4 channels; alpha noise is constant 128.
static const WideRGBA8 ditherNoise[64] = {
    {2, 2, 2, 128, 194, 194, 194, 128, 50, 50, 50, 128, 242, 242, 242, 128},
    {194, 194, 194, 128, 50, 50, 50, 128, 242, 242, 242, 128, 14, 14, 14, 128},
    {50, 50, 50, 128, 242, 242, 242, 128, 14, 14, 14, 128, 206, 206, 206, 128},
    {242, 242, 242, 128, 14, 14, 14, 128, 206, 206, 206, 128, 62, 62, 62, 128},
    {14, 14, 14, 128, 206, 206, 206, 128, 62, 62, 62, 128, 254, 254, 254, 128},
    {206, 206, 206, 128, 62, 62, 62, 128, 254, 254, 254, 128, 130, 130, 130,
     128},
    {62, 62, 62, 128, 254, 254, 254, 128, 130, 130, 130, 128, 66, 66, 66, 128},
    {254, 254, 254, 128, 130, 130, 130, 128, 66, 66, 66, 128, 178, 178, 178,
     128},
    {130, 130, 130, 128, 66, 66, 66, 128, 178, 178, 178, 128, 114, 114, 114,
     128},
    {66, 66, 66, 128, 178, 178, 178, 128, 114, 114, 114, 128, 142, 142, 142,
     128},
    {178, 178, 178, 128, 114, 114, 114, 128, 142, 142, 142, 128, 78, 78, 78,
     128},
    {114, 114, 114, 128, 142, 142, 142, 128, 78, 78, 78, 128, 190, 190, 190,
     128},
    {142, 142, 142, 128, 78, 78, 78, 128, 190, 190, 190, 128, 126, 126, 126,
     128},
    {78, 78, 78, 128, 190, 190, 190, 128, 126, 126, 126, 128, 34, 34, 34, 128},
    {190, 190, 190, 128, 126, 126, 126, 128, 34, 34, 34, 128, 226, 226, 226,
     128},
    {126, 126, 126, 128, 34, 34, 34, 128, 226, 226, 226, 128, 18, 18, 18, 128},
    {34, 34, 34, 128, 226, 226, 226, 128, 18, 18, 18, 128, 210, 210, 210, 128},
    {226, 226, 226, 128, 18, 18, 18, 128, 210, 210, 210, 128, 46, 46, 46, 128},
    {18, 18, 18, 128, 210, 210, 210, 128, 46, 46, 46, 128, 238, 238, 238, 128},
    {210, 210, 210, 128, 46, 46, 46, 128, 238, 238, 238, 128, 30, 30, 30, 128},
    {46, 46, 46, 128, 238, 238, 238, 128, 30, 30, 30, 128, 222, 222, 222, 128},
    {238, 238, 238, 128, 30, 30, 30, 128, 222, 222, 222, 128, 162, 162, 162,
     128},
    {30, 30, 30, 128, 222, 222, 222, 128, 162, 162, 162, 128, 98, 98, 98, 128},
    {222, 222, 222, 128, 162, 162, 162, 128, 98, 98, 98, 128, 146, 146, 146,
     128},
    {162, 162, 162, 128, 98, 98, 98, 128, 146, 146, 146, 128, 82, 82, 82, 128},
    {98, 98, 98, 128, 146, 146, 146, 128, 82, 82, 82, 128, 174, 174, 174, 128},
    {146, 146, 146, 128, 82, 82, 82, 128, 174, 174, 174, 128, 110, 110, 110,
     128},
    {82, 82, 82, 128, 174, 174, 174, 128, 110, 110, 110, 128, 158, 158, 158,
     128},
    {174, 174, 174, 128, 110, 110, 110, 128, 158, 158, 158, 128, 94, 94, 94,
     128},
    {110, 110, 110, 128, 158, 158, 158, 128, 94, 94, 94, 128, 10, 10, 10, 128},
    {158, 158, 158, 128, 94, 94, 94, 128, 10, 10, 10, 128, 202, 202, 202, 128},
    {94, 94, 94, 128, 10, 10, 10, 128, 202, 202, 202, 128, 58, 58, 58, 128},
    {10, 10, 10, 128, 202, 202, 202, 128, 58, 58, 58, 128, 250, 250, 250, 128},
    {202, 202, 202, 128, 58, 58, 58, 128, 250, 250, 250, 128, 6, 6, 6, 128},
    {58, 58, 58, 128, 250, 250, 250, 128, 6, 6, 6, 128, 198, 198, 198, 128},
    {250, 250, 250, 128, 6, 6, 6, 128, 198, 198, 198, 128, 54, 54, 54, 128},
    {6, 6, 6, 128, 198, 198, 198, 128, 54, 54, 54, 128, 246, 246, 246, 128},
    {198, 198, 198, 128, 54, 54, 54, 128, 246, 246, 246, 128, 138, 138, 138,
     128},
    {54, 54, 54, 128, 246, 246, 246, 128, 138, 138, 138, 128, 74, 74, 74, 128},
    {246, 246, 246, 128, 138, 138, 138, 128, 74, 74, 74, 128, 186, 186, 186,
     128},
    {138, 138, 138, 128, 74, 74, 74, 128, 186, 186, 186, 128, 122, 122, 122,
     128},
    {74, 74, 74, 128, 186, 186, 186, 128, 122, 122, 122, 128, 134, 134, 134,
     128},
    {186, 186, 186, 128, 122, 122, 122, 128, 134, 134, 134, 128, 70, 70, 70,
     128},
    {122, 122, 122, 128, 134, 134, 134, 128, 70, 70, 70, 128, 182, 182, 182,
     128},
    {134, 134, 134, 128, 70, 70, 70, 128, 182, 182, 182, 128, 118, 118, 118,
     128},
    {70, 70, 70, 128, 182, 182, 182, 128, 118, 118, 118, 128, 42, 42, 42, 128},
    {182, 182, 182, 128, 118, 118, 118, 128, 42, 42, 42, 128, 234, 234, 234,
     128},
    {118, 118, 118, 128, 42, 42, 42, 128, 234, 234, 234, 128, 26, 26, 26, 128},
    {42, 42, 42, 128, 234, 234, 234, 128, 26, 26, 26, 128, 218, 218, 218, 128},
    {234, 234, 234, 128, 26, 26, 26, 128, 218, 218, 218, 128, 38, 38, 38, 128},
    {26, 26, 26, 128, 218, 218, 218, 128, 38, 38, 38, 128, 230, 230, 230, 128},
    {218, 218, 218, 128, 38, 38, 38, 128, 230, 230, 230, 128, 22, 22, 22, 128},
    {38, 38, 38, 128, 230, 230, 230, 128, 22, 22, 22, 128, 214, 214, 214, 128},
    {230, 230, 230, 128, 22, 22, 22, 128, 214, 214, 214, 128, 170, 170, 170,
     128},
    {22, 22, 22, 128, 214, 214, 214, 128, 170, 170, 170, 128, 106, 106, 106,
     128},
    {214, 214, 214, 128, 170, 170, 170, 128, 106, 106, 106, 128, 154, 154, 154,
     128},
    {170, 170, 170, 128, 106, 106, 106, 128, 154, 154, 154, 128, 90, 90, 90,
     128},
    {106, 106, 106, 128, 154, 154, 154, 128, 90, 90, 90, 128, 166, 166, 166,
     128},
    {154, 154, 154, 128, 90, 90, 90, 128, 166, 166, 166, 128, 102, 102, 102,
     128},
    {90, 90, 90, 128, 166, 166, 166, 128, 102, 102, 102, 128, 150, 150, 150,
     128},
    {166, 166, 166, 128, 102, 102, 102, 128, 150, 150, 150, 128, 86, 86, 86,
     128},
    {102, 102, 102, 128, 150, 150, 150, 128, 86, 86, 86, 128, 2, 2, 2, 128},
    {150, 150, 150, 128, 86, 86, 86, 128, 2, 2, 2, 128, 194, 194, 194, 128},
    {86, 86, 86, 128, 2, 2, 2, 128, 194, 194, 194, 128, 50, 50, 50, 128}};
   1501 
   1502 static ALWAYS_INLINE const WideRGBA8* getDitherNoise(int32_t fragCoordY) {
   1503  return &ditherNoise[(fragCoordY & 7) * 8];
   1504 }
   1505 
   1506 // Values in color should be in the 0..0xFF00 range so that dithering has
   1507 // enough overhead to avoid overflow and underflow.
   1508 static ALWAYS_INLINE WideRGBA8 dither(WideRGBA8 color, int32_t fragCoordX,
   1509                                      const WideRGBA8* ditherNoiseYIndexed) {
   1510  return color + ditherNoiseYIndexed[fragCoordX & 7];
   1511 }
   1512 
   1513 /// Find the gradient stops pair affecting the current offset by searching
   1514 /// into gradient stop offsets organized in a tree structure.
   1515 ///
   1516 /// This is ported from sample_gradient_stops_tree in ps_quad_gradient.glsl.
   1517 /// The tree structure is explained in the documentation of
   1518 /// write_gpu_gradient_stops_tree in prim_store/gradient/mod.rs
   1519 static int32_t findGradientStopPair(float offset, float* stops,
   1520                                    int32_t numStops,
   1521                                    float& prevOffset,
   1522                                    float& nextOffset) {
   1523    int32_t levelBaseAddr = 0;
   1524    // Number of blocks of 4 indices for the current level.
   1525    // At the root, a single block is stored. Each level stores
   1526    // 5 times more blocks than the previous one.
   1527    int32_t levelStride = 1;
   1528    // Relative address within the current level.
   1529    int32_t offsetInLevel = 0;
   1530    // By the end of this function, this will contain the index of the
   1531    // second stop of the pair we are looking for.
   1532    int32_t index = 0;
   1533 
   1534    // The index distance between consecutive stop offsets at
   1535    // the current level. At the last level, the stride is 1.
   1536    // each has a 5 times more stride than the next (so the
   1537    // index stride starts high and is divided by 5 at each
   1538    // iteration).
   1539    int32_t indexStride = 1;
   1540    while (indexStride * 5 <= numStops) {
   1541        indexStride *= 5;
   1542    }
   1543 
   1544 
   1545    // We take advantage of the fact that stop offsets are normalized from
   1546    // 0 to 1 which means that the first offset is always 0 and the last is
   1547    // always 1.
   1548    // This is important because in the loop, we won't be setting prevOffset
   1549    // if offset is < 0.0 and won't be setting nextOffset if offset > 1.0,
   1550    // so initializing them this way here handles those cases.
   1551    prevOffset = 0.0;
   1552    nextOffset = 1.0;
   1553 
   1554    while (true) {
   1555        int32_t addr = (levelBaseAddr + offsetInLevel) * 4;
   1556        float currentStops0 = stops[addr];
   1557        float currentStops1 = stops[addr + 1];
   1558        float currentStops2 = stops[addr + 2];
   1559        float currentStops3 = stops[addr + 3];
   1560 
   1561        // Determine which of the five partitions (sub-trees)
   1562        // to take next.
   1563        int32_t nextPartition = 4;
   1564        if (currentStops0 > offset) {
   1565            nextPartition = 0;
   1566            nextOffset = currentStops0;
   1567        } else if (currentStops1 > offset) {
   1568            nextPartition = 1;
   1569            prevOffset = currentStops0;
   1570            nextOffset = currentStops1;
   1571        } else if (currentStops2 > offset) {
   1572            nextPartition = 2;
   1573            prevOffset = currentStops1;
   1574            nextOffset = currentStops2;
   1575        } else if (currentStops3 > offset) {
   1576            nextPartition = 3;
   1577            prevOffset = currentStops2;
   1578            nextOffset = currentStops3;
   1579        } else {
   1580            prevOffset = currentStops3;
   1581        }
   1582 
   1583        index += nextPartition * indexStride;
   1584 
   1585        if (indexStride == 1) {
   1586            // If the index stride is 1, we visited a leaf,
   1587            // we are done.
   1588            break;
   1589        }
   1590 
   1591        indexStride /= 5;
   1592        levelBaseAddr += levelStride;
   1593        levelStride *= 5;
   1594        offsetInLevel = offsetInLevel * 5 + nextPartition;
   1595    }
   1596 
   1597    // clamp the index to [1..numStops]
   1598    if (index < 1) {
   1599        index = 1;
   1600    } else if (index > numStops - 1) {
   1601        index = numStops - 1;
   1602    }
   1603 
   1604    return index - 1;
   1605 }
   1606 
   1607 
   1608 // Samples an entire span of a linear gradient by crawling the gradient table
   1609 // and looking for consecutive stops that can be merged into a single larger
   1610 // gradient, then interpolating between those larger gradients within the span.
   1611 template <bool BLEND, bool DITHER>
   1612 static bool commitLinearGradient(sampler2D sampler, int address, float size,
   1613                                 bool tileRepeat, bool gradientRepeat, vec2 pos,
   1614                                 const vec2_scalar& scaleDir, float startOffset,
   1615                                 uint32_t* buf, int span,
   1616                                 vec4 fragCoord = vec4()) {
   1617  assert(sampler->format == TextureFormat::RGBA32F);
   1618  assert(address >= 0 && address < int(sampler->height * sampler->stride));
   1619  GradientStops* stops = (GradientStops*)&sampler->buf[address];
   1620  // Get the chunk delta from the difference in offset steps. This represents
   1621  // how far within the gradient table we advance for every step in output,
   1622  // normalized to gradient table size.
   1623  vec2_scalar posStep = dFdx(pos) * 4.0f;
   1624  float delta = dot(posStep, scaleDir);
   1625  if (!isfinite(delta)) {
   1626    return false;
   1627  }
   1628 
   1629  // Only incremented in the case of dithering
   1630  int32_t currentFragCoordX = int32_t(fragCoord.x.x);
   1631  const auto* ditherNoiseYIndexed =
   1632      DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;
   1633 
   1634  // If we have a repeating brush, then the position will be modulo the [0,1)
   1635  // interval. Compute coefficients that can be used to quickly evaluate the
   1636  // distance to the interval boundary where the offset will wrap.
   1637  vec2_scalar distCoeffsX = {0.25f * span, 0.0f};
   1638  vec2_scalar distCoeffsY = distCoeffsX;
   1639  if (tileRepeat) {
   1640    if (posStep.x != 0.0f) {
   1641      distCoeffsX = vec2_scalar{step(0.0f, posStep.x), 1.0f} * recip(posStep.x);
   1642    }
   1643    if (posStep.y != 0.0f) {
   1644      distCoeffsY = vec2_scalar{step(0.0f, posStep.y), 1.0f} * recip(posStep.y);
   1645    }
   1646  }
   1647 
   1648  for (; span > 0;) {
   1649    // Try to process as many chunks as are within the span if possible.
   1650    float chunks = 0.25f * span;
   1651    vec2 repeatPos = pos;
   1652    if (tileRepeat) {
   1653      // If this is a repeating brush, then limit the chunks to not cross the
   1654      // interval boundaries.
   1655      repeatPos = fract(pos);
   1656      chunks = min(chunks, distCoeffsX.x - repeatPos.x.x * distCoeffsX.y);
   1657      chunks = min(chunks, distCoeffsY.x - repeatPos.y.x * distCoeffsY.y);
   1658    }
   1659    // Compute the gradient offset from the position.
   1660    Float offset =
   1661        repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
   1662    // If repeat is desired, we need to limit the offset to a fractional value.
   1663    if (gradientRepeat) {
   1664      offset = fract(offset);
   1665    }
   1666    // To properly handle both clamping and repeating of the table offset, we
   1667    // need to ensure we don't run past the 0 and 1 points. Here we compute the
   1668    // intercept points depending on whether advancing forwards or backwards in
   1669    // the gradient table to ensure the chunk count is limited by the amount
   1670    // before intersection. If there is no delta, then we compute no intercept.
   1671    float startEntry;
   1672    int minIndex, maxIndex;
   1673    if (offset.x < 0) {
   1674      // If we're below the gradient table, use the first color stop. We can
   1675      // only intercept the table if walking forward.
   1676      startEntry = 0;
   1677      minIndex = int(startEntry);
   1678      maxIndex = minIndex;
   1679      if (delta > 0) {
   1680        chunks = min(chunks, -offset.x / delta);
   1681      }
   1682    } else if (offset.x < 1) {
   1683      // Otherwise, we're inside the gradient table. Depending on the direction
    // we're walking the table, we may intersect either the 0 or 1 offset.
   1685      // Compute the start entry based on our initial offset, and compute the
   1686      // end entry based on the available chunks limited by intercepts. Clamp
   1687      // them into the valid range of the table.
   1688      startEntry = 1.0f + offset.x * size;
   1689      if (delta < 0) {
   1690        chunks = min(chunks, -offset.x / delta);
   1691      } else if (delta > 0) {
   1692        chunks = min(chunks, (1 - offset.x) / delta);
   1693      }
   1694      float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size,
   1695                             0.0f, 1.0f + size);
   1696      // Now that we know the range of entries we need to sample, we want to
   1697      // find the largest possible merged gradient within that range. Depending
   1698      // on which direction we are advancing in the table, we either walk up or
   1699      // down the table trying to merge the current entry with the adjacent
   1700      // entry. We finally limit the chunks to only sample from this merged
   1701      // gradient.
   1702      minIndex = int(startEntry);
   1703      maxIndex = minIndex;
   1704      if (delta > 0) {
   1705        while (maxIndex + 1 < endEntry &&
   1706               stops[maxIndex].can_merge(stops[maxIndex + 1])) {
   1707          maxIndex++;
   1708        }
   1709        chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size));
   1710      } else if (delta < 0) {
   1711        while (minIndex - 1 > endEntry &&
   1712               stops[minIndex - 1].can_merge(stops[minIndex])) {
   1713          minIndex--;
   1714        }
   1715        chunks = min(chunks, (minIndex - startEntry) / (delta * size));
   1716      }
   1717    } else {
   1718      // If we're above the gradient table, use the last color stop. We can
   1719      // only intercept the table if walking backward.
   1720      startEntry = 1.0f + size;
   1721      minIndex = int(startEntry);
   1722      maxIndex = minIndex;
   1723      if (delta < 0) {
   1724        chunks = min(chunks, (1 - offset.x) / delta);
   1725      }
   1726    }
   1727    // If there are any amount of whole chunks of a merged gradient found,
   1728    // then we want to process that as a single gradient span with the start
   1729    // and end colors from the min and max entries.
   1730    if (chunks >= 1.0f) {
   1731      int inside = int(chunks);
   1732      // Sample the start color from the min entry and the end color from the
   1733      // max entry of the merged gradient. These are scaled to a range of
   1734      // 0..0xFF00, as that is the largest shifted value that can fit in a U16.
   1735      // For dithering, this allows room to avoid overflow and underflow
   1736      // when applying the dither pattern. Since we are only doing addition with
   1737      // the step value, we can still represent negative step values without
   1738      // having to use an explicit sign bit, as the result will still come out
   1739      // the same, allowing us to gain an extra bit of precision. We will later
   1740      // shift these into 8 bit output range while committing the span, but
   1741      // stepping with higher precision to avoid banding. We convert from RGBA
   1742      // to BGRA here to avoid doing this in the inner loop.
   1743      auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00);
   1744      auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00);
   1745      // Get the color range of the merged gradient, normalized to its size.
   1746      auto colorRangeF =
   1747          (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex));
   1748      // Compute the actual starting color of the current start offset within
   1749      // the merged gradient. The value 0.5 is added to the low bits (0x80) so
   1750      // that the color will effectively round to the nearest increment below.
   1751      auto colorF =
   1752          minColorF + colorRangeF * (startEntry - minIndex) + float(0x80);
   1753      // Compute the portion of the color range that we advance on each chunk.
   1754      Float deltaColorF = colorRangeF * (delta * size);
   1755      // Quantize the color delta and current color. These have already been
   1756      // scaled to the 0..0xFF00 range, so we just need to round them to U16.
   1757      auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
   1758      for (int remaining = inside;;) {
   1759        auto color =
   1760            combine(CONVERT(round_pixel(colorF, 1), U16),
   1761                    CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
   1762                    CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
   1763                    CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
   1764        // Finally, step the current color through the output chunks, shifting
   1765        // it into 8 bit range and outputting as we go. Only process a segment
   1766        // at a time to avoid overflowing 8-bit precision due to rounding of
   1767        // deltas.
   1768        int segment = min(remaining, 256 / 4);
   1769        for (auto* end = buf + segment * 4; buf < end; buf += 4) {
   1770          if (DITHER) {
   1771            commit_blend_span<BLEND>(
   1772                buf,
   1773                dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
   1774            currentFragCoordX += 4;
   1775          } else {
   1776            commit_blend_span<BLEND>(buf, color >> 8);
   1777          }
   1778          color += deltaColor;
   1779        }
   1780        remaining -= segment;
   1781        if (remaining <= 0) {
   1782          break;
   1783        }
   1784        colorF += deltaColorF * segment;
   1785      }
   1786      // Deduct the number of chunks inside the gradient from the remaining
   1787      // overall span. If we exhausted the span, bail out.
   1788      span -= inside * 4;
   1789      if (span <= 0) {
   1790        break;
   1791      }
   1792      // Otherwise, assume we're in a transitional section of the gradient that
   1793      // will probably require per-sample table lookups, so fall through below.
   1794      // We need to re-evaluate the position and offset first, though.
   1795      pos += posStep * float(inside);
   1796      repeatPos = tileRepeat ? fract(pos) : pos;
   1797      offset =
   1798          repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
   1799      if (gradientRepeat) {
   1800        offset = fract(offset);
   1801      }
   1802    }
   1803    // If we get here, there were no whole chunks of a merged gradient found
   1804    // that we could process, but we still have a non-zero amount of span left.
   1805    // That means we have segments of gradient that begin or end at the current
   1806    // entry we're on. For this case, we just fall back to sampleGradient which
   1807    // will calculate a table entry for each sample, assuming the samples may
   1808    // have different table entries.
   1809    Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
   1810    if (DITHER) {
   1811      auto gradientSample = sampleGradient(sampler, address, entry) << 8;
   1812      commit_blend_span<BLEND>(
   1813          buf,
   1814          dither(gradientSample, currentFragCoordX, ditherNoiseYIndexed) >> 8);
   1815      currentFragCoordX += 4;
   1816    } else {
   1817      commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
   1818    }
   1819    span -= 4;
   1820    buf += 4;
   1821    pos += posStep;
   1822  }
   1823  return true;
   1824 }
   1825 
// Samples an entire span of a linear gradient directly from arrays of stop
// offsets and stop colors, with no intermediate gradient table. For each run
// of pixels that falls between the same pair of stops, colors are
// interpolated in 16-bit fixed point a chunk (4 pixels) at a time.
//   sampler         - RGBA32F texture holding the stop colors and offsets.
//   offsetsAddress  - texel address of the packed stop offsets.
//   colorsAddress   - texel address of the stop colors (stored just before
//                     the offsets).
//   stopCount       - number of gradient stops.
//   gradientRepeat  - whether the gradient offset wraps (fract) outside 0..1.
//   pos/scaleDir/startOffset - map a fragment position to a gradient offset
//                     via dot(pos, scaleDir) - startOffset.
//   buf/span        - destination pixel run.
//   fragCoord       - only consulted when DITHER, for the noise lookup.
// Returns false if the per-pixel offset delta is not finite.
template <bool BLEND, bool DITHER>
static bool commitLinearGradientFromStops(sampler2D sampler, int offsetsAddress,
                                         int colorsAddress, float stopCount,
                                         bool gradientRepeat, vec2 pos,
                                         const vec2_scalar& scaleDir,
                                         float startOffset, uint32_t* buf,
                                         int span, vec4 fragCoord = vec4()) {
 assert(sampler->format == TextureFormat::RGBA32F);
 // Stop offsets are expected to be stored just after the colors.
 assert(colorsAddress >= 0 && colorsAddress < offsetsAddress);
 assert(offsetsAddress >= 0 && offsetsAddress + (stopCount + 3) / 4 <
                                   int(sampler->height * sampler->stride));
 float* stopOffsets = (float*)&sampler->buf[offsetsAddress];
 Float* stopColors = (Float*)&sampler->buf[colorsAddress];

 // Number of pixels per chunks.
 const float CHUNK_SIZE = 4.0f;

 // Only incremented in the case of dithering
 int32_t currentFragCoordX = int32_t(fragCoord.x.x);
 const auto* ditherNoiseYIndexed =
     DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;

 // Get the pixel delta from the difference in offset steps. This represents
 // how far within the gradient offset range we advance for every step in
 // output.
 vec2_scalar posStep = dFdx(pos);
 float delta = dot(posStep, scaleDir);
 if (!isfinite(delta)) {
   return false;
 }

 for (; span > 0;) {
   // The number of pixels that are affected by the current gradient stop pair.
   float subSpan = span;

   // Compute the gradient offset from the position.
   Float offset = pos.x * scaleDir.x + pos.y * scaleDir.y - startOffset;
   // If repeat is desired, we need to limit the offset to a fractional value.
   if (gradientRepeat) {
     offset = fract(offset);
   }

   int32_t stopIndex = 0;
   float prevOffset = 0.0;
   float nextOffset = 0.0;
   if (offset.x < 0) {
     // If before the start of the gradient stop range, then use the first
     // stop.
     if (delta > 0) {
       subSpan = min(subSpan, -offset.x / delta);
     }
   } else if (offset.x >= 1) {
     // If beyond the end of the gradient stop range, then use the last
     // stop.
     stopIndex = stopCount - 1;
     if (delta < 0) {
       subSpan = min(subSpan, (1.0f - offset.x) / delta);
     }
   } else {
     // Otherwise, we're inside the gradient stop range. Find the pair
     // that affect the start of the current block and how many blocks
     // are affected by the same pair.
     stopIndex =
         findGradientStopPair(offset.x, stopOffsets, stopCount,
                              prevOffset, nextOffset);
     float offsetRange =
         delta > 0.0f ? nextOffset - offset.x : prevOffset - offset.x;
     // NOTE(review): if delta is exactly 0 this divides by zero (the earlier
     // isfinite check only rejects non-finite deltas) — presumably callers
     // never supply a degenerate zero gradient direction; confirm.
     subSpan = min(subSpan, offsetRange / delta);
   }

   // Ensure that we advance by at least a pixel.
   subSpan = max(ceil(subSpan), 1.0f);

   // Sample the start colors of the gradient stop pair. These are scaled to
   // a range of 0..0xFF00, as that is the largest shifted value that can fit
   // in a U16.  Since we are only doing addition with the step value, we can
   // still represent negative step values without having to use an explicit
   // sign bit, as the result will still come out the same, allowing us to gain
   // an extra bit of precision. We will later shift these into 8 bit output
   // range while committing the span, but stepping with higher precision to
   // avoid banding. We convert from RGBA to BGRA here to avoid doing this in
   // the inner loop.
   // The 256 factor is a leftover from a previous version of this code that
   // uses a 256 pixels gradient table. The math could be simplified to avoid
   // it but this change requires careful consideration of its interactions
   // with the dithering code.
   auto colorScale = (DITHER ? float(0xFF00) : 255.0f) * 256.0f;
   auto minColorF = stopColors[stopIndex].zyxw * colorScale;
   auto maxColorF = stopColors[stopIndex + 1].zyxw * colorScale;
   auto deltaOffset = nextOffset - prevOffset;
   // Get the color range of the merged gradient, normalized to its size.
   // A zero-width stop pair (deltaOffset == 0) yields a constant color.
   Float colorRangeF = deltaOffset == 0.0f
                           ? Float(0.0f)
                           : (maxColorF - minColorF) * (1.0 / deltaOffset);

   // Compute the actual starting color of the current start offset within
   // the merged gradient. The value 0.5 is added to the low bits (0x80) so
   // that the color will effectively round to the nearest increment below.
   auto colorF =
       minColorF + colorRangeF * (offset.x - prevOffset) + float(0x80);

   // Compute the portion of the color range that we advance on each chunk.
   Float deltaColorF = colorRangeF * delta * CHUNK_SIZE;
   // Quantize the color delta and current color. These have already been
   // scaled to the 0..0xFF00 range, so we just need to round them to U16.
   auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
   // If there are any amount of whole chunks of a merged gradient found,
   // then we want to process that as a single gradient span.
   int chunks = int(subSpan) / 4;
   if (chunks > 0) {
     for (int remaining = chunks;;) {
       auto color =
           combine(CONVERT(round_pixel(colorF, 1), U16),
                   CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
                   CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
                   CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
       // Finally, step the current color through the output chunks, shifting
       // it into 8 bit range and outputting as we go. Only process a segment
       // at a time to avoid overflowing 8-bit precision due to rounding of
       // deltas.
       int segment = min(remaining, 256 / 4);
       for (auto* end = buf + segment * 4; buf < end; buf += 4) {
         if (DITHER) {
           commit_blend_span<BLEND>(
               buf,
               dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
           currentFragCoordX += 4;
         } else {
           commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8));
         }
         color += deltaColor;
       }
       remaining -= segment;
       // Advance the float accumulator before checking for exit so the
       // partial-chunk remainder below starts from the right color.
       colorF += deltaColorF * segment;
       if (remaining <= 0) {
         break;
       }
     }
     span -= chunks * 4;
     pos += posStep * float(chunks) * CHUNK_SIZE;
   }

   // We may have a partial chunk to write.
   int remainder = int(subSpan - chunks * 4);
   if (remainder > 0) {
     assert(remainder < 4);
     // The logic here is similar to the full chunks loop above, but we do a
     // partial write instead of a pushing a full chunk.
     auto color =
         combine(CONVERT(round_pixel(colorF, 1), U16),
                 CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
                 CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
                 CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
     if (DITHER) {
       // Comma operator: dither first, then advance the dither x coordinate.
       color = dither(color, currentFragCoordX, ditherNoiseYIndexed),
       currentFragCoordX += remainder;
     }
     commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8), remainder);

     buf += remainder;
     span -= remainder;
     pos += posStep * float(remainder);
   }
 }
 return true;
}
   1995 
// Commits an entire span of a linear gradient, given the address of a table
// previously resolved with swgl_validateGradient. The size of the inner portion
// of the table is given, assuming the table start and ends with a single entry
// each to deal with clamping. Repeating will be handled if necessary. The
// initial offset within the table is used to designate where to start the span
// and how to step through the gradient table.
// On success the output pointer is advanced past the span and the span length
// is zeroed; otherwise both are left untouched so the caller can fall back to
// per-fragment shading. Dispatches on blend_key to pick the blended or
// non-blended template instantiation.
#define swgl_commitLinearGradientRGBA8(sampler, address, size, tileRepeat,   \
                                      gradientRepeat, pos, scaleDir,        \
                                      startOffset)                          \
 do {                                                                       \
   bool drawn = false;                                                      \
   if (blend_key) {                                                         \
     drawn = commitLinearGradient<true, false>(                             \
         sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
         startOffset, swgl_OutRGBA8, swgl_SpanLength);                      \
   } else {                                                                 \
     drawn = commitLinearGradient<false, false>(                            \
         sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
         startOffset, swgl_OutRGBA8, swgl_SpanLength);                      \
   }                                                                        \
   if (drawn) {                                                             \
     swgl_OutRGBA8 += swgl_SpanLength;                                      \
     swgl_SpanLength = 0;                                                   \
   }                                                                        \
 } while (0)
   2021 
// Dithered variant of swgl_commitLinearGradientRGBA8: instantiates the
// DITHER=true template and forwards gl_FragCoord so the gradient code can
// index the dither noise pattern. Same success protocol — advances
// swgl_OutRGBA8 and zeroes swgl_SpanLength only when the span was drawn.
#define swgl_commitDitheredLinearGradientRGBA8(sampler, address, size,       \
                                              tileRepeat, gradientRepeat,   \
                                              pos, scaleDir, startOffset)   \
 do {                                                                       \
   bool drawn = false;                                                      \
   if (blend_key) {                                                         \
     drawn = commitLinearGradient<true, true>(                              \
         sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
         startOffset, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);        \
   } else {                                                                 \
     drawn = commitLinearGradient<false, true>(                             \
         sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
         startOffset, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);        \
   }                                                                        \
   if (drawn) {                                                             \
     swgl_OutRGBA8 += swgl_SpanLength;                                      \
     swgl_SpanLength = 0;                                                   \
   }                                                                        \
 } while (0)
   2041 
// Commits a span of a linear gradient sampled directly from stop offset and
// color arrays (no gradient table), dispatching on blend_key. On success the
// output pointer is advanced and the span length zeroed; otherwise both are
// left untouched so the caller can fall back to per-fragment shading.
#define swgl_commitLinearGradientFromStopsRGBA8(                             \
   sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,       \
   scaleDir, startOffset)                                                   \
 do {                                                                       \
   bool drawn = false;                                                      \
   if (blend_key) {                                                         \
     drawn = commitLinearGradientFromStops<true, false>(                    \
         sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos, \
         scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength);            \
   } else {                                                                 \
     drawn = commitLinearGradientFromStops<false, false>(                   \
         sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos, \
         scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength);            \
   }                                                                        \
   if (drawn) {                                                             \
     swgl_OutRGBA8 += swgl_SpanLength;                                      \
     swgl_SpanLength = 0;                                                   \
   }                                                                        \
 } while (0)
   2061 
// Dithered variant of swgl_commitLinearGradientFromStopsRGBA8.
// NOTE(review): the tileRepeat parameter is accepted but never forwarded to
// commitLinearGradientFromStops — presumably kept for call-site symmetry
// with the table-based macros; confirm.
// NOTE(review): this expands `fragCoord` unqualified, unlike
// swgl_commitDitheredLinearGradientRGBA8 which passes gl_FragCoord, so a
// variable named fragCoord must be in scope at every expansion site —
// verify against callers.
#define swgl_commitDitheredLinearGradientFromStopsRGBA8(                      \
   sampler, offsetsAddress, colorsAddress, size, tileRepeat, gradientRepeat, \
   pos, scaleDir, startOffset)                                               \
 do {                                                                        \
   bool drawn = false;                                                       \
   if (blend_key) {                                                          \
     drawn = commitLinearGradientFromStops<true, true>(                      \
         sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,  \
         scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength, fragCoord);  \
   } else {                                                                  \
     drawn = commitLinearGradientFromStops<false, true>(                     \
         sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,  \
         scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength, fragCoord);  \
   }                                                                         \
   if (drawn) {                                                              \
     swgl_OutRGBA8 += swgl_SpanLength;                                       \
     swgl_SpanLength = 0;                                                    \
   }                                                                         \
 } while (0)
   2081 
   2082 template <bool CLAMP, typename V>
   2083 static ALWAYS_INLINE V fastSqrt(V v) {
   2084  if (CLAMP) {
   2085    // Clamp to avoid zero or negative.
   2086    v = max(v, V(1.0e-12f));
   2087  }
   2088 #if USE_SSE2 || USE_NEON
   2089  return v * inversesqrt(v);
   2090 #else
   2091  return sqrt(v);
   2092 #endif
   2093 }
   2094 
   2095 template <bool CLAMP, typename V>
   2096 static ALWAYS_INLINE auto fastLength(V v) {
   2097  return fastSqrt<CLAMP>(dot(v, v));
   2098 }
   2099 
// Samples an entire span of a radial gradient by crawling the gradient table
// and looking for consecutive stops that can be merged into a single larger
// gradient, then interpolating between those larger gradients within the span
// based on the computed position relative to a radius.
//   sampler/address - RGBA32F gradient table resolved elsewhere.
//   size            - inner size of the gradient table.
//   repeat          - whether the gradient offset wraps (fract) outside 0..1.
//   pos             - fragment positions relative to the gradient center.
//   radius          - start radius subtracted from the distance to center.
//   buf/span        - destination pixel run.
//   fragCoord       - only consulted when DITHER, for the noise lookup.
// Returns false if the position delta or radius is not finite.
template <bool BLEND, bool DITHER>
static bool commitRadialGradient(sampler2D sampler, int address, float size,
                                bool repeat, vec2 pos, float radius,
                                uint32_t* buf, int span,
                                vec4 fragCoord = vec4()) {
 assert(sampler->format == TextureFormat::RGBA32F);
 assert(address >= 0 && address < int(sampler->height * sampler->stride));
 GradientStops* stops = (GradientStops*)&sampler->buf[address];
 // clang-format off
 // Given position p, delta d, and radius r, we need to repeatedly solve the
 // following quadratic for the pixel offset t:
 //    length(p + t*d) = r
 //    (px + t*dx)^2 + (py + t*dy)^2 = r^2
 // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is:
 //    t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0
 //    t^2*d.d + t*2*d.p + (p.p-r^2) = 0
 // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to:
 //    t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d)
 // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so
 // we cache them below for faster computation.
 //
 // The quadratic has two solutions, representing the span intersecting the
 // given radius of gradient, which can occur at two offsets. If there is only
 // one solution (where b^2-4ac = 0), this represents the point at which the
 // span runs tangent to the radius. This middle point is significant in that
 // before it, we walk down the gradient ramp, and after it, we walk up the
 // ramp.
 // clang-format on
 vec2_scalar pos0 = {pos.x.x, pos.y.x};
 vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x};
 float deltaDelta = dot(delta, delta);
 if (!isfinite(deltaDelta) || !isfinite(radius)) {
   return false;
 }

 // Only incremented in the case of dithering
 int32_t currentFragCoordX = int32_t(fragCoord.x.x);
 const auto* ditherNoiseYIndexed =
     DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;

 float invDelta, middleT, middleB;
 if (deltaDelta > 0) {
   invDelta = 1.0f / deltaDelta;
   middleT = -dot(delta, pos0) * invDelta;
   middleB = middleT * middleT - dot(pos0, pos0) * invDelta;
 } else {
   // If position is invariant, just set the coefficients so the quadratic
   // always reduces to the end of the span.
   invDelta = 0.0f;
   middleT = float(span);
   middleB = 0.0f;
 }
 // We only want search for merged gradients up to the minimum of either the
 // mid-point or the span length. Cache those offsets here as they don't vary
 // in the inner loop.
 Float middleEndRadius = fastLength<true>(
     pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f});
 float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x;
 float endRadius = middleEndRadius.y;
 // Convert delta to change in position per chunk.
 delta *= 4;
 deltaDelta *= 4 * 4;
 // clang-format off
 // Given current position p and delta d, we reduce:
 //    length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p))
 // where dot(p+d,p+d) can be accumulated as:
 //    (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2)
 //                      = p.p + 2p.d + d.d
 // Since p increases by d every loop iteration, p.d increases by d.d, and thus
 // we can accumulate d.d to calculate 2p.d, then allowing us to get the next
 // dot-product by adding it to dot-product p.p of the prior iteration. This
 // saves us some multiplications and an expensive sqrt inside the inner loop.
 // clang-format on
 Float dotPos = dot(pos, pos);
 Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta;
 float deltaDelta2 = 2.0f * deltaDelta;
 // t tracks how many pixels of the span have been emitted so far.
 for (int t = 0; t < span;) {
   // Compute the gradient table offset from the current position.
   Float offset = fastSqrt<true>(dotPos) - radius;
   float startRadius = radius;
   // If repeat is desired, we need to limit the offset to a fractional value.
   if (repeat) {
     // The non-repeating radius at which the gradient table actually starts,
     // radius + floor(offset) = radius + (offset - fract(offset)).
     startRadius += offset.x;
     offset = fract(offset);
     startRadius -= offset.x;
   }
   // We need to find the min/max index in the table of the gradient we want to
   // use as well as the intercept point where we leave this gradient.
   float intercept = -1;
   int minIndex = 0;
   int maxIndex = int(1.0f + size);
   if (offset.x < 0) {
     // If inside the inner radius of the gradient table, then use the first
     // stop. Set the intercept to advance forward to the start of the gradient
     // table.
     maxIndex = minIndex;
     if (t >= middleT) {
       intercept = radius;
     }
   } else if (offset.x < 1) {
     // Otherwise, we're inside the valid part of the gradient table.
     minIndex = int(1.0f + offset.x * size);
     maxIndex = minIndex;
     // Find the offset in the gradient that corresponds to the search limit.
     // We only search up to the minimum of either the mid-point or the span
     // length. Get the table index that corresponds to this offset, clamped so
     // that we avoid hitting the beginning (0) or end (1 + size) of the table.
     float searchOffset =
         (t >= middleT ? endRadius : middleRadius) - startRadius;
     int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size));
     // If we are past the mid-point, walk up the gradient table trying to
     // merge stops. If we're below the mid-point, we need to walk down the
     // table. We note the table index at which we need to look for an
     // intercept to determine a valid span.
     if (t >= middleT) {
       while (maxIndex + 1 <= searchIndex &&
              stops[maxIndex].can_merge(stops[maxIndex + 1])) {
         maxIndex++;
       }
       intercept = maxIndex + 1;
     } else {
       while (minIndex - 1 >= searchIndex &&
              stops[minIndex - 1].can_merge(stops[minIndex])) {
         minIndex--;
       }
       intercept = minIndex;
     }
     // Convert from a table index into units of radius from the center of the
     // gradient.
     intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius;
   } else {
     // If outside the outer radius of the gradient table, then use the last
     // stop. Set the intercept to advance toward the valid part of the
     // gradient table if going in, or just run to the end of the span if going
     // away from the gradient.
     minIndex = maxIndex;
     if (t < middleT) {
       intercept = radius + 1;
     }
   }
   // Solve the quadratic for t to find where the merged gradient ends. If no
   // intercept is found, just go to the middle or end of the span.
   float endT = t >= middleT ? span : min(span, int(middleT));
   if (intercept >= 0) {
     float b = middleB + intercept * intercept * invDelta;
     if (b > 0) {
       b = fastSqrt<false>(b);
       endT = min(endT, t >= middleT ? middleT + b : middleT - b);
     } else {
       // Due to the imprecision of fastSqrt in offset calculations, solving
       // the quadratic may fail. However, if the discriminant is still close
       // to 0, then just assume it is 0.
       endT = min(endT, middleT);
     }
   }
   // Figure out how many chunks are actually inside the merged gradient.
   if (t + 4.0f <= endT) {
     // Round down to a whole number of 4-pixel chunks.
     int inside = int(endT - t) & ~3;
     // Convert start and end colors to BGRA and scale to 0..0xFF00 range
     // (for dithered) or 0..255 (for non-dithered) later.
     auto minColorF =
         stops[minIndex].startColor.zyxw * (DITHER ? float(0xFF00) : 255.0f);
     auto maxColorF =
         stops[maxIndex].end_color().zyxw * (DITHER ? float(0xFF00) : 255.0f);

     // Compute the change in color per change in gradient offset.
     auto deltaColorF =
         (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex));
     // Subtract off the color difference of the beginning of the current span
     // from the beginning of the gradient.
     Float colorF =
         minColorF - deltaColorF * (startRadius + (minIndex - 1) / size);
     // Finally, walk over the span accumulating the position dot product and
     // getting its sqrt as an offset into the color ramp. At this point we
     // just need to round to an integer and pack down to pixel format.
     for (auto* end = buf + inside; buf < end; buf += 4) {
       Float offsetG = fastSqrt<false>(dotPos);
       if (DITHER) {
         auto color = combine(
             CONVERT(round_pixel(colorF + deltaColorF * offsetG.x, 1), U16),
             CONVERT(round_pixel(colorF + deltaColorF * offsetG.y, 1), U16),
             CONVERT(round_pixel(colorF + deltaColorF * offsetG.z, 1), U16),
             CONVERT(round_pixel(colorF + deltaColorF * offsetG.w, 1), U16));
         commit_blend_span<BLEND>(
             buf, dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
         currentFragCoordX += 4;
       } else {
         auto color = combine(
             packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
                       round_pixel(colorF + deltaColorF * offsetG.y, 1)),
             packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
                       round_pixel(colorF + deltaColorF * offsetG.w, 1)));
         commit_blend_span<BLEND>(buf, color);
       }

       dotPos += dotPosDelta;
       dotPosDelta += deltaDelta2;
     }
     // Advance past the portion of gradient we just processed.
     t += inside;
     // If we hit the end of the span, exit out now.
     if (t >= span) {
       break;
     }
     // Otherwise, we are most likely in a transitional section of the gradient
     // between stops that will likely require doing per-sample table lookups.
     // Rather than having to redo all the searching above to figure that out,
     // just assume that to be the case and fall through below to doing the
     // table lookups to hopefully avoid an iteration.
     offset = fastSqrt<true>(dotPos) - radius;
     if (repeat) {
       offset = fract(offset);
     }
   }
   // If we got here, that means we still have span left to process but did not
   // have any whole chunks that fell within a merged gradient. Just fall back
   // to doing a table lookup for each sample.
   // NOTE(review): unlike the linear-gradient fallback, this path does not
   // apply dithering even when DITHER is set, nor advance currentFragCoordX —
   // presumably acceptable for short transitional runs; confirm intent.
   Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
   commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
   buf += 4;
   t += 4;
   dotPos += dotPosDelta;
   dotPosDelta += deltaDelta2;
 }
 return true;
}
   2332 
// Samples an entire span of a radial gradient directly from the gradient's
// stop offsets and stop colors stored in the sampler, rather than from a
// pre-rendered gradient table. The stop colors (RGBA32F) start at
// colorsAddress and the stop offsets are stored after them at offsetsAddress;
// stopCount gives the number of stops. pos is the varying 2D position in
// gradient space, startRadius is the radius at which the gradient begins, and
// buf/span describe the destination span. fragCoord is only consulted when
// DITHER is enabled, to index the dither noise. Returns false (without
// drawing) if the inputs are non-finite so the caller can fall back to
// per-fragment processing.
template <bool BLEND, bool DITHER>
static bool commitRadialGradientFromStops(sampler2D sampler, int offsetsAddress,
                                         int colorsAddress, float stopCount,
                                         bool repeat, vec2 pos,
                                         float startRadius, uint32_t* buf,
                                         int span, vec4 fragCoord = vec4()) {
 assert(sampler->format == TextureFormat::RGBA32F);
 // Stop offsets are expected to be stored just after the colors.
 assert(colorsAddress >= 0 && colorsAddress < offsetsAddress);
 assert(offsetsAddress >= 0 && offsetsAddress + (stopCount + 3) / 4 <
                                   int(sampler->height * sampler->stride));
 float* stopOffsets = (float*)&sampler->buf[offsetsAddress];
 Float* stopColors = (Float*)&sampler->buf[colorsAddress];
 // clang-format off
 // Given position p, delta d, and radius r, we need to repeatedly solve the
 // following quadratic for the pixel offset t:
 //    length(p + t*d) = r
 //    (px + t*dx)^2 + (py + t*dy)^2 = r^2
 // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is:
 //    t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0
 //    t^2*d.d + t*2*d.p + (p.p-r^2) = 0
 // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to:
 //    t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d)
 // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so
 // we cache them below for faster computation.
 //
 // The quadratic has two solutions, representing the span intersecting the
 // given radius of gradient, which can occur at two offsets. If there is only
 // one solution (where b^2-4ac = 0), this represents the point at which the
 // span runs tangent to the radius. This middle point is significant in that
 // before it, we walk down the gradient ramp, and after it, we walk up the
 // ramp.
 // clang-format on
 vec2_scalar pos0 = {pos.x.x, pos.y.x};
 vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x};
 float deltaDelta = dot(delta, delta);
 if (!isfinite(deltaDelta) || !isfinite(startRadius)) {
   return false;
 }

 // Only incremented in the case of dithering
 int32_t currentFragCoordX = int32_t(fragCoord.x.x);
 const auto* ditherNoiseYIndexed =
     DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;

 // Cached quadratic coefficients: middleT is the span offset of the tangent
 // ("middle") point, middleB the constant part of the discriminant.
 float invDelta, middleT, middleB;
 if (deltaDelta > 0) {
   invDelta = 1.0f / deltaDelta;
   middleT = -dot(delta, pos0) * invDelta;
   middleB = middleT * middleT - dot(pos0, pos0) * invDelta;
 } else {
   // If position is invariant, just set the coefficients so the quadratic
   // always reduces to the end of the span.
   invDelta = 0.0f;
   middleT = float(span);
   middleB = 0.0f;
 }

 // Convert delta to change in position per chunk.
 delta *= 4;
 deltaDelta *= 4 * 4;
 // clang-format off
 // Given current position p and delta d, we reduce:
 //    length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p))
 // where dot(p+d,p+d) can be accumulated as:
 //    (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2)
 //                      = p.p + 2p.d + d.d
 // Since p increases by d every loop iteration, p.d increases by d.d, and thus
 // we can accumulate d.d to calculate 2p.d, then allowing us to get the next
 // dot-product by adding it to dot-product p.p of the prior iteration. This
 // saves us some multiplications and an expensive sqrt inside the inner loop.
 // clang-format on
 Float dotPos = dot(pos, pos);
 Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta;
 float deltaDelta2 = 2.0f * deltaDelta;

 for (int t = 0; t < span;) {
   // Compute the gradient table offset from the current position.
   Float offset = fastSqrt<true>(dotPos) - startRadius;
   float adjustedStartRadius = startRadius;
   // If repeat is desired, we need to limit the offset to a fractional value.
   if (repeat) {
     // The non-repeating radius at which the gradient table actually starts,
     // startRadius + floor(offset) = startRadius + (offset - fract(offset)).
     adjustedStartRadius += offset.x;
     offset = fract(offset);
     adjustedStartRadius -= offset.x;
   }

   // We need to find the pair of gradient stops that affect the current
   // portion of the span as well as the intercept point where we leave this
   // gradient.
   float intercept = -1;
   int32_t stopIndex = 0;
   float prevOffset = 0.0f;
   float nextOffset = 0.0f;
   if (offset.x < 0) {
     // If inside the inner radius of the gradient table, then use the first
     // stop. Set the intercept to advance forward to the start of the gradient
     // table.
     if (t >= middleT) {
       intercept = startRadius;
     }
   } else if (offset.x >= 1) {
     // If outside the outer radius of the gradient table, then use the last
     // stop. Set the intercept to advance toward the valid part of the
     // gradient table if going in, or just run to the end of the span if going
     // away from the gradient.
     stopIndex = stopCount - 1;
     if (t < middleT) {
       intercept = startRadius + 1;
     }
   } else {
     // Otherwise, we're inside the valid part of the gradient table.

     // Look up the pair of stops bracketing the current offset; prevOffset
     // and nextOffset receive the bracketing stop offsets.
     stopIndex =
         findGradientStopPair(offset.x, stopOffsets, stopCount,
                                  prevOffset, nextOffset);
     if (t >= middleT) {
       intercept = adjustedStartRadius + nextOffset;
     } else {
       intercept = adjustedStartRadius + prevOffset;
     }
   }
   // Solve the quadratic for t to find where the current stop pair ends. If no
   // intercept is found, just go to the middle or end of the span.
   float endT = t >= middleT ? span : min(span, int(middleT));
   if (intercept >= 0) {
     float b = middleB + intercept * intercept * invDelta;
     if (b > 0) {
       b = fastSqrt<false>(b);
       endT = min(endT, t >= middleT ? middleT + b : middleT - b);
     } else {
       // Due to the imprecision of fastSqrt in offset calculations, solving
       // the quadratic may fail. However, if the discriminant is still close
       // to 0, then just assume it is 0.
       endT = min(endT, middleT);
     }
   }
   // Ensure that we are advancing by at least one pixel at each iteration.
   endT = max(ceil(endT), t + 1.0f);

   // Figure out how many pixels belonging to whole chunks are inside the
   // gradient stop pair.
   int inside = int(endT - t) & ~3;
   // Convert start and end colors to BGRA and scale to 0..0xFF00 range
   // (for dithered) and 0..255 range (for non-dithered).
   auto minColorF =
       stopColors[stopIndex].zyxw * (DITHER ? float(0xFF00) : 255.0f);
   auto maxColorF =
       stopColors[stopIndex + 1].zyxw * (DITHER ? float(0xFF00) : 255.0f);

   // Compute the change in color per change in gradient offset.
   auto deltaOffset = nextOffset - prevOffset;
   Float deltaColorF =
       deltaOffset == 0.0f
           ?
           // Note: If we take this branch, we know that we are going to fill
           // some pixels with a solid color (we are in or out of the range of
           // gradient stops). We could leverage that to skip the offset
           // calculation.
           Float(0.0f)
           : (maxColorF - minColorF) / deltaOffset;
   // Subtract off the color difference of the beginning of the current span
   // from the beginning of the gradient.
   Float colorF = minColorF - deltaColorF * (adjustedStartRadius + prevOffset);
   // Finally, walk over the span accumulating the position dot product and
   // getting its sqrt as an offset into the color ramp. At this point we just
   // need to round to an integer and pack down to pixel format.
   for (auto* end = buf + inside; buf < end; buf += 4) {
     Float offsetG = fastSqrt<false>(dotPos);
     if (DITHER) {
       auto color = combine(
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.x, 1), U16),
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.y, 1), U16),
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.z, 1), U16),
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.w, 1), U16));
       commit_blend_span<BLEND>(
           buf, dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
       currentFragCoordX += 4;
     } else {
       auto color = combine(
           packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
                     round_pixel(colorF + deltaColorF * offsetG.y, 1)),
           packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
                     round_pixel(colorF + deltaColorF * offsetG.w, 1)));
       commit_blend_span<BLEND>(buf, color);
     }
     dotPos += dotPosDelta;
     dotPosDelta += deltaDelta2;
   }
   // Advance past the portion of gradient we just processed.
   t += inside;

   // If we hit the end of the span, exit out now.
   if (t >= span) {
     break;
   }

   // Otherwise we may have a partial chunk to write.
   int remainder = endT - t;
   if (remainder > 0) {
     assert(remainder < 4);
     // The logic here is similar to the full chunks loop above, but we do a
     // partial write instead of pushing a full chunk.
     Float offsetG = fastSqrt<false>(dotPos);
     if (DITHER) {
       auto color = combine(
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.x, 1), U16),
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.y, 1), U16),
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.z, 1), U16),
           CONVERT(round_pixel(colorF + deltaColorF * offsetG.w, 1), U16));
       commit_blend_span<BLEND>(
           buf, dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8,
           remainder);
       currentFragCoordX += 4;
     } else {
       auto color = combine(
           packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
                     round_pixel(colorF + deltaColorF * offsetG.y, 1)),
           packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
                     round_pixel(colorF + deltaColorF * offsetG.w, 1)));
       commit_blend_span<BLEND>(buf, color, remainder);
     }
     buf += remainder;
     t += remainder;

     // dotPosDelta's members are monotonically increasing, so adjusting the
     // step only requires undoing the factor of 4 and multiplying with the
     // actual number of remainder pixels.
     float partialDeltaDelta2 = deltaDelta2 * 0.25f * float(remainder);
     dotPosDelta += partialDeltaDelta2;

     // For dotPos, however, there is a compounding effect that makes the math
     // trickier. For simplicity's sake we are just computing the
     // parameters for a single-pixel step and applying it remainder times.

     // The deltaDelta2 for a single-pixel step (undoing the 4*4 factor we did
     // earlier when making deltaDelta2 work for 4-pixels chunks).
     float singlePxDeltaDelta2 = deltaDelta2 * 0.0625f;
     // The first single-pixel delta for dotPos (The difference between
     // dotPos's first two lanes).
     float dotPosDeltaFirst = dotPos.y - dotPos.x;
     // For each 1-pixel step the delta is applied and monotonically increased
     // by singleDeltaDelta2.
     Float pxOffsets = {0.0f, 1.0f, 2.0f, 3.0f};
     Float partialDotPosDelta =
         pxOffsets * singlePxDeltaDelta2 + dotPosDeltaFirst;

     // Apply each single-pixel step.
     for (int i = 0; i < remainder; ++i) {
       dotPos += partialDotPosDelta;
       partialDotPosDelta += singlePxDeltaDelta2;
     }
   }
 }
 return true;
}
   2592 
// Commits an entire span of a radial gradient similar to
// swgl_commitLinearGradient, but given a varying 2D position scaled to
// gradient-space and a radius at which the distance from the origin maps to the
// start of the gradient table. Dispatches to the blended or non-blended
// instantiation based on blend_key, and only consumes the span when the helper
// reports that it actually drew it.
#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \
                                      radius)                              \
 do {                                                                      \
   bool drawn = false;                                                     \
   if (blend_key) {                                                        \
     drawn = commitRadialGradient<true, false>(                            \
         sampler, address, size, repeat, pos, radius, swgl_OutRGBA8,       \
         swgl_SpanLength);                                                 \
   } else {                                                                \
     drawn = commitRadialGradient<false, false>(                           \
         sampler, address, size, repeat, pos, radius, swgl_OutRGBA8,       \
         swgl_SpanLength);                                                 \
   }                                                                       \
   if (drawn) {                                                            \
     swgl_OutRGBA8 += swgl_SpanLength;                                     \
     swgl_SpanLength = 0;                                                  \
   }                                                                       \
 } while (0)
   2615 
// Dithered variant of swgl_commitRadialGradientRGBA8. Additionally passes
// gl_FragCoord so the gradient helper can index the dither noise per pixel.
#define swgl_commitDitheredRadialGradientRGBA8(sampler, address, size, repeat, \
                                              pos, radius)                    \
 do {                                                                         \
   bool drawn = false;                                                        \
   if (blend_key) {                                                           \
     drawn = commitRadialGradient<true, true>(sampler, address, size, repeat, \
                                              pos, radius, swgl_OutRGBA8,     \
                                              swgl_SpanLength, gl_FragCoord); \
   } else {                                                                   \
     drawn = commitRadialGradient<false, true>(                               \
         sampler, address, size, repeat, pos, radius, swgl_OutRGBA8,          \
         swgl_SpanLength, gl_FragCoord);                                      \
   }                                                                          \
   if (drawn) {                                                               \
     swgl_OutRGBA8 += swgl_SpanLength;                                        \
     swgl_SpanLength = 0;                                                     \
   }                                                                          \
 } while (0)
   2634 
// Commits an entire span of a radial gradient similar to
// swgl_commitLinearGradient, but given a varying 2D position scaled to
// gradient-space and a radius at which the distance from the origin maps to
// the start of the gradient. Unlike swgl_commitRadialGradientRGBA8, this
// samples directly from stop offsets/colors stored in the sampler rather than
// from a pre-rendered gradient table.
#define swgl_commitRadialGradientFromStopsRGBA8(                            \
   sampler, offsetsAddress, colorsAddress, size, repeat, pos, startRadius) \
 do {                                                                      \
   bool drawn = false;                                                     \
   if (blend_key) {                                                        \
     drawn = commitRadialGradientFromStops<true, false>(                   \
         sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
         startRadius, swgl_OutRGBA8, swgl_SpanLength);                     \
   } else {                                                                \
     drawn = commitRadialGradientFromStops<false, false>(                  \
         sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
         startRadius, swgl_OutRGBA8, swgl_SpanLength);                     \
   }                                                                       \
   if (drawn) {                                                            \
     swgl_OutRGBA8 += swgl_SpanLength;                                     \
     swgl_SpanLength = 0;                                                  \
   }                                                                       \
 } while (0)
   2657 
// Dithered variant of swgl_commitRadialGradientFromStopsRGBA8. Additionally
// passes gl_FragCoord so the dither noise can be indexed per pixel.
#define swgl_commitDitheredRadialGradientFromStopsRGBA8(                    \
   sampler, offsetsAddress, colorsAddress, size, repeat, pos, startRadius) \
 do {                                                                      \
   bool drawn = false;                                                     \
   if (blend_key) {                                                        \
     drawn = commitRadialGradientFromStops<true, true>(                    \
         sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
         startRadius, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);       \
   } else {                                                                \
     drawn = commitRadialGradientFromStops<false, true>(                   \
         sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
         startRadius, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);       \
   }                                                                       \
   if (drawn) {                                                            \
     swgl_OutRGBA8 += swgl_SpanLength;                                     \
     swgl_SpanLength = 0;                                                  \
   }                                                                       \
 } while (0)
   2676 
// Extension to set a clip mask image to be sampled during blending. The offset
// specifies the positioning of the clip mask image relative to the viewport
// origin. The bounding box specifies the rectangle relative to the clip mask's
// origin that constrains sampling within the clip mask. Blending must be
// enabled for this to work. A zero-size bounding box leaves the mask disabled.
static sampler2D swgl_ClipMask = nullptr;           // Current clip mask texture.
static IntPoint swgl_ClipMaskOffset = {0, 0};       // Mask position in viewport.
static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0};  // Valid sampling rect.
#define swgl_clipMask(mask, offset, bb_origin, bb_size)        \
 do {                                                         \
   if (bb_size != vec2_scalar(0.0f, 0.0f)) {                  \
     swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK;                   \
     swgl_ClipMask = mask;                                    \
     swgl_ClipMaskOffset = make_ivec2(offset);                \
     swgl_ClipMaskBounds =                                    \
         IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \
   }                                                          \
 } while (0)
   2695 
// Extension to enable anti-aliasing for the given edges of a quad.
// Blending must be enabled for this to work.
// Bits 0..3 select which of the quad's four edges receive AA (see the
// calcAAEdgeMask overloads below for how the bits are assigned).
static int swgl_AAEdgeMask = 0;
   2699 
   2700 static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; }
// An explicit integer edge bitmask is passed through unchanged.
static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; }
   2702 static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) {
   2703  return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) |
   2704         (mask.w ? 8 : 0);
   2705 }
   2706 
// Sets the AA edge mask from the given edge specification (bool, int bitmask,
// or bvec4_scalar — see the calcAAEdgeMask overloads) and raises the AA clip
// flag whenever any edge is enabled.
#define swgl_antiAlias(edges)                \
 do {                                       \
   swgl_AAEdgeMask = calcAAEdgeMask(edges); \
   if (swgl_AAEdgeMask) {                   \
     swgl_ClipFlags |= SWGL_CLIP_FLAG_AA;   \
   }                                        \
 } while (0)
   2714 
// Overrides the blend stage with the drop-shadow blend mode, caching the given
// color (packed to RGBA8) for use by the blender.
#define swgl_blendDropShadow(color)                         \
 do {                                                      \
   swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE;        \
   swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \
   swgl_BlendColorRGBA8 = packColor<uint32_t>(color);      \
 } while (0)
   2721 
// Overrides the blend stage with the subpixel-text blend mode, caching both
// the packed text color and the alphas derived from it for the blender.
#define swgl_blendSubpixelText(color)                         \
 do {                                                        \
   swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE;          \
   swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \
   swgl_BlendColorRGBA8 = packColor<uint32_t>(color);        \
   swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8);      \
 } while (0)
   2729 
// Dispatch helper used by the GLSL translator to swgl_drawSpan functions.
// The number of pixels committed is tracked by checking for the difference in
// swgl_SpanLength. Any varying interpolants used will be advanced past the
// committed part of the span in case the fragment shader must be executed for
// any remaining pixels that were not committed by the span shader.
// Evaluates to a return of the number of pixels actually drawn.
#define DISPATCH_DRAW_SPAN(self, format)        \
 do {                                          \
   int total = self->swgl_SpanLength;          \
   self->swgl_drawSpan##format();              \
   int drawn = total - self->swgl_SpanLength;  \
   if (drawn) self->step_interp_inputs(drawn); \
   return drawn;                               \
 } while (0)