swgl_ext.h (133516B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// When using a solid color with clip masking, the cost of loading the clip mask
// in the blend stage exceeds the cost of processing the color. Here we handle
// the entire span of clip mask texture before the blend stage to more
// efficiently process it and modulate it with color without incurring blend
// stage overheads.
#include <cstdint>

// Writes `len` pixels of the packed color `color` to `buf`, modulating each
// chunk by the clip mask that covers `buf`. The clip mask is temporarily
// overridden so that the blend stage does not apply it a second time.
// P is the packed pixel type (e.g. uint32_t for RGBA8); C is a packed color.
// NOTE(review): assumes `len` spans are processed in chunks of 4 pixels and
// that the mask advances one byte per pixel — confirm against get_clip_mask.
template <typename P, typename C>
static void commit_masked_solid_span(P* buf, C color, int len) {
  override_clip_mask();
  uint8_t* mask = get_clip_mask(buf);
  // Process 4 pixels (one chunk) and 4 mask bytes per iteration.
  for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) {
    commit_span(
        buf,
        blend_span(
            buf,
            applyColor(expand_mask(buf, unpack(unaligned_load<PackedR8>(mask))),
                       color)));
  }
  restore_clip_mask();
}

// When using a solid color with anti-aliasing, most of the solid span will not
// benefit from anti-aliasing in the opaque region. We only want to apply the AA
// blend stage in the non-opaque start and end of the span where AA is needed.
// `r` is the pre-packed span value to commit; `len` is the span length in
// pixels. The opaque-region boundaries are rounded up to chunk (4-pixel)
// boundaries via (x + 3) & ~3 so each sub-span stays chunk-aligned.
template <typename P, typename R>
static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) {
  // Leading non-opaque section: AA blending still applies here.
  if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) {
    commit_solid_span<true>(buf, r, start);
    buf += start;
    len -= start;
  }
  // Fully-opaque middle section: disable AA while committing it.
  if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) {
    override_aa();
    commit_solid_span<true>(buf, r, opaque);
    restore_aa();
    buf += opaque;
    len -= opaque;
  }
  // Trailing non-opaque section, if any remains.
  if (len > 0) {
    commit_solid_span<true>(buf, r, len);
  }
}

// Forces a value with vector run-class to have scalar run-class.
// Reduces a vector-run value to its scalar equivalent (first lane).
template <typename T>
static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
  return force_scalar(v);
}

// Advance all varying interpolants by a single chunk
#define swgl_stepInterp() step_interp_inputs()

// Pseudo-intrinsic that accesses the interpolation step for a given varying
#define swgl_interpStep(v) (interp_step.v)

// Commit an entire span of a solid color. This dispatches to clip-masked and
// anti-aliased fast-paths as appropriate. `format` selects the output buffer
// (RGBA8 or R8), `v` is the unpacked color, `n` the number of pixels. The
// output pointer and remaining span length are advanced by `len` afterwards.
#define swgl_commitSolid(format, v, n)                                   \
  do {                                                                   \
    int len = (n);                                                       \
    if (blend_key) {                                                     \
      if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {                        \
        commit_masked_solid_span(swgl_Out##format,                       \
                                 packColor(swgl_Out##format, (v)), len); \
      } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) {                   \
        commit_aa_solid_span(swgl_Out##format,                           \
                             pack_span(swgl_Out##format, (v)), len);     \
      } else {                                                           \
        commit_solid_span<true>(swgl_Out##format,                        \
                                pack_span(swgl_Out##format, (v)), len);  \
      }                                                                  \
    } else {                                                             \
      commit_solid_span<false>(swgl_Out##format,                         \
                               pack_span(swgl_Out##format, (v)), len);   \
    }                                                                    \
    swgl_Out##format += len;                                             \
    swgl_SpanLength -= len;                                              \
  } while (0)
#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength)
#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength)
// Partial variants clamp the requested length to the remaining span.
#define swgl_commitPartialSolidRGBA8(len, v) \
  swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength))
#define swgl_commitPartialSolidR8(len, v) \
  swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength))

// Commits one step-sized chunk, blending it against the destination only when
// a blend mode (blend_key) is active.
#define swgl_commitChunk(format, chunk)                 \
  do {                                                  \
    auto r = chunk;                                     \
    if (blend_key) r = blend_span(swgl_Out##format, r); \
    commit_span(swgl_Out##format, r);                   \
    swgl_Out##format += swgl_StepSize;                  \
    swgl_SpanLength -= swgl_StepSize;                   \
  } while (0)

// Commit a single chunk of a color
#define swgl_commitColor(format, color) \
  swgl_commitChunk(format, pack_pixels_##format(color))
#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color)
#define swgl_commitColorR8(color) swgl_commitColor(R8, color)

// Returns true if the sampler uses linear filtering.
template <typename S>
static ALWAYS_INLINE bool swgl_isTextureLinear(S s) {
  return s->filter == TextureFilter::LINEAR;
}

// Returns true if the sampler's texture is in RGBA8 format.
template <typename S>
static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) {
  return s->format == TextureFormat::RGBA8;
}

// Returns true if the sampler's texture is in R8 format.
template <typename S>
static ALWAYS_INLINE bool swgl_isTextureR8(S s) {
  return s->format == TextureFormat::R8;
}

// Use the default linear quantization scale of 128. This gives 7 bits of
// fractional precision, which when multiplied with a signed 9 bit value
// still fits in a 16 bit integer.
const int swgl_LinearQuantizeScale = 128;

// Quantizes UVs for access into a linear texture.
template <typename S, typename T>
static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) {
  return linearQuantize(p, swgl_LinearQuantizeScale, s);
}

// Quantizes an interpolation step for UVs for access into a linear texture.
// Scales a UV step to sampler space and then to the linear quantization scale.
// Unlike swgl_linearQuantize, no offset is applied since this is a delta.
template <typename S, typename T>
static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) {
  return samplerScale(s, p) * swgl_LinearQuantizeScale;
}

// Overload dispatch on the destination pixel type: a uint32_t buffer selects
// the RGBA8 linear sampler.
template <typename S>
static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf,
                                                     S sampler, ivec2 i) {
  return textureLinearUnpackedRGBA8(sampler, i);
}

// ... and a uint8_t buffer selects the R8 linear sampler.
template <typename S>
static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf,
                                                  S sampler, ivec2 i) {
  return textureLinearUnpackedR8(sampler, i);
}

// Verifies the sampler's texture format matches the destination pixel type:
// uint32_t destinations require RGBA8 textures...
template <typename S>
static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) {
  return swgl_isTextureRGBA8(s);
}

// ... and uint8_t destinations require R8 textures.
template <typename S>
static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
  return swgl_isTextureR8(s);
}

// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets
// for linear sampling. Declares uv_step (per-chunk UV delta), min_uv and
// max_uv (quantized clamp bounds derived from uv_rect) in the calling scope.
#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv)     \
  uv = swgl_linearQuantize(sampler, uv);                                      \
  vec2_scalar uv_step =                                                       \
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};   \
  vec2_scalar min_uv = max(                                                   \
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \
  vec2_scalar max_uv =                                                        \
      max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}),    \
          min_uv);

// Implements the fallback linear filter that can deal with clamping and
// arbitrary scales.
174 template <bool BLEND, typename S, typename C, typename P> 175 static P* blendTextureLinearFallback(S sampler, vec2 uv, int span, 176 vec2_scalar uv_step, vec2_scalar min_uv, 177 vec2_scalar max_uv, C color, P* buf) { 178 for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { 179 commit_blend_span<BLEND>( 180 buf, applyColor(textureLinearUnpacked(buf, sampler, 181 ivec2(clamp(uv, min_uv, max_uv))), 182 color)); 183 } 184 return buf; 185 } 186 187 static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) { 188 return bit_cast<U64>(r); 189 } 190 static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) { 191 return bit_cast<U16>(r); 192 } 193 194 static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) { 195 return r * fracx.xxxxyyyyzzzzwwww; 196 } 197 static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) { 198 return r * fracx; 199 } 200 201 // Implements a faster linear filter that works with axis-aligned constant Y but 202 // scales less than 1, i.e. upscaling. In this case we can optimize for the 203 // constant Y fraction as well as load all chunks from memory in a single tap 204 // for each row. 
// Axis-aligned upscale path: row Y (and thus fracy) is constant across the
// span, and each destination chunk's source texels fit within one loaded
// chunk, so both rows are fetched with a single tap per chunk and interpolated
// with shuffles rather than per-pixel gathers.
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
                                      vec2_scalar uv_step, vec2_scalar min_uv,
                                      vec2_scalar max_uv, C color, P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  // Quantized UVs: low 7 bits are the fraction, the rest the texel index.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x));
  P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
  I16 fracx = computeFracX(sampler, i, frac);
  int16_t fracy = computeFracY(frac).x;
  // Pre-blend the two rows vertically with the constant Y fraction.
  auto src0 =
      CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type);
  auto src1 =
      CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type);
  auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));

  // We attempt to sample ahead by one chunk and interpolate it with the current
  // one. However, due to the complication of upscaling, we may not necessarily
  // shift in all the next set of samples.
  for (P* end = buf + span; buf < end; buf += 4) {
    uv.x += uv_step.x;
    I32 ixn = cast(uv.x);
    I16 fracn = computeFracNoClamp(ixn);
    ixn >>= 7;
    auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]),
                         signed_unpacked_type);
    auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]),
                         signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));

    // Since we're upscaling, we know that a source pixel has a larger footprint
    // than the destination pixel, and thus all the source pixels needed for
    // this chunk will fall within a single chunk of texture data. However,
    // since the source pixels don't map 1:1 with destination pixels, we need to
    // shift the source pixels over based on their offset from the start of the
    // chunk. This could conceivably be optimized better with usage of PSHUFB or
    // VTBL instructions However, since PSHUFB requires SSSE3, instead we resort
    // to masking in the correct pixels to avoid having to index into memory.
    // For the last sample to interpolate with, we need to potentially shift in
    // a sample from the next chunk over in the case the samples fill out an
    // entire chunk.
    auto shuf = src;
    auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
    // Duplicate lanes wherever adjacent destination pixels map to the same
    // source texel (the upscale case).
    if (i.x.y == i.x.x) {
      shuf = shuf.xxyz;
      shufn = shufn.xxyz;
    }
    if (i.x.z == i.x.y) {
      shuf = shuf.xyyz;
      shufn = shufn.xyyz;
    }
    if (i.x.w == i.x.z) {
      shuf = shuf.xyzz;
      shufn = shufn.xyzz;
    }

    // Convert back to a signed unpacked type so that we can interpolate the
    // final result.
    auto interp = bit_cast<signed_unpacked_type>(shuf);
    auto interpn = bit_cast<signed_unpacked_type>(shufn);
    interp += applyFracX(interpn - interp, fracx) >> 7;

    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));

    // Carry the look-ahead chunk into the next iteration.
    i.x = ixn;
    fracx = fracn;
    src = srcn;
  }
}

// This is the fastest variant of the linear filter that still provides
// filtering. In cases where there is no scaling required, but we have a
// subpixel offset that forces us to blend in neighboring pixels, we can
// optimize away most of the memory loads and shuffling that is required by the
// fallback filter.
// 1:1 scale with subpixel offset: X and Y fractions are constant across the
// span, so each iteration only needs one load per row and a single shuffle to
// borrow the neighboring sample.
template <bool BLEND, typename S, typename C, typename P>
static void blendTextureLinearFast(S sampler, vec2 uv, int span,
                                   vec2_scalar min_uv, vec2_scalar max_uv,
                                   C color, P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  // Quantized UVs: low 7 bits are the fraction, the rest the texel index.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
  P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
  int16_t fracx = computeFracX(sampler, i, frac).x;
  int16_t fracy = computeFracY(frac).x;
  // Pre-blend the two rows vertically with the constant Y fraction.
  auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
  auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
  auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));

  // Since there is no scaling, we sample ahead by one chunk and interpolate it
  // with the current one. We can then reuse this value on the next iteration.
  for (P* end = buf + span; buf < end; buf += 4) {
    row0 += 4;
    row1 += 4;
    auto src0n =
        CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto src1n =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));

    // For the last sample to interpolate with, we need to potentially shift in
    // a sample from the next chunk over since the samples fill out an entire
    // chunk.
    auto interp = bit_cast<signed_unpacked_type>(src);
    auto interpn =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4));
    interp += ((interpn - interp) * fracx) >> 7;

    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));

    src = srcn;
  }
}

// Implements a faster linear filter that works with axis-aligned constant Y but
// downscaling the texture by half. In this case we can optimize for the
// constant X/Y fractions and reduction factor while minimizing shuffling.
template <bool BLEND, typename S, typename C, typename P>
static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span,
                                                  vec2_scalar min_uv,
                                                  vec2_scalar max_uv, C color,
                                                  P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  // Quantized UVs: low 7 bits are the fraction, the rest the texel index.
  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
  P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
  int16_t fracx = computeFracX(sampler, i, frac).x;
  int16_t fracy = computeFracY(frac).x;

  // Each destination chunk consumes two source chunks (2x reduction): blend
  // rows vertically, then interleave even/odd lanes and blend horizontally.
  for (P* end = buf + span; buf < end; buf += 4) {
    auto src0 =
        CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto src1 =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
    row0 += 4;
    row1 += 4;
    auto src0n =
        CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
    auto src1n =
        CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
    row0 += 4;
    row1 += 4;

    // Even lanes are the left taps, odd lanes the right taps of each pair.
    auto interp =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6));
    auto interpn =
        bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7));
    interp += ((interpn - interp) * fracx) >> 7;

    commit_blend_span<BLEND>(
        buf, applyColor(bit_cast<unpacked_type>(interp), color));
  }
}

enum LinearFilter {
  // No linear filter is needed.
  LINEAR_FILTER_NEAREST = 0,
  // The most general linear filter that handles clamping and varying scales.
  LINEAR_FILTER_FALLBACK,
  // A linear filter optimized for axis-aligned upscaling.
  LINEAR_FILTER_UPSCALE,
  // A linear filter with no scaling but with subpixel offset.
  LINEAR_FILTER_FAST,
  // A linear filter optimized for 2x axis-aligned downscaling.
  LINEAR_FILTER_DOWNSCALE
};

// Dispatches to an appropriate linear filter depending on the selected filter.
// Splits the span into a clamped prefix (fallback), an unclamped interior
// (specialized filter), and a clamped suffix (fallback). Returns the advanced
// destination pointer.
template <bool BLEND, typename S, typename C, typename P>
static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span,
                                     vec2_scalar uv_step, vec2_scalar min_uv,
                                     vec2_scalar max_uv, C color, P* buf,
                                     LinearFilter filter) {
  P* end = buf + span;
  if (filter != LINEAR_FILTER_FALLBACK) {
    // If we're not using the fallback, then Y is constant across the entire
    // row. We just need to ensure that we handle any samples that might pull
    // data from before the start of the row and require clamping.
    float beforeDist = max(0.0f, min_uv.x) - uv.x.x;
    if (beforeDist > 0) {
      // Round the clamped prefix up to whole chunks and run it through the
      // fallback filter, which handles clamping.
      int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0,
                         int(end - buf));
      buf = blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step,
                                              min_uv, max_uv, color, buf);
      uv.x += (before / swgl_StepSize) * uv_step.x;
    }
    // We need to check how many samples we can take from inside the row without
    // requiring clamping. In case the filter oversamples the row by a step, we
    // subtract off a step from the width to leave some room.
    float insideDist =
        min(max_uv.x, float((int(sampler->width) - swgl_StepSize) *
                            swgl_LinearQuantizeScale)) -
        uv.x.x;
    if (uv_step.x > 0.0f && insideDist >= uv_step.x) {
      int32_t inside = int(end - buf);
      if (filter == LINEAR_FILTER_DOWNSCALE) {
        // 2x downscale consumes two source texels per destination pixel.
        inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) &
                         ~(swgl_StepSize - 1),
                     inside);
        if (inside > 0) {
          blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv,
                                             max_uv, color, buf);
          buf += inside;
          uv.x += (inside / swgl_StepSize) * uv_step.x;
        }
      } else if (filter == LINEAR_FILTER_UPSCALE) {
        inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside);
        if (inside > 0) {
          blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv,
                                           max_uv, color, buf);
          buf += inside;
          uv.x += (inside / swgl_StepSize) * uv_step.x;
        }
      } else {
        // 1:1 fast filter: one source texel per destination pixel.
        inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) &
                         ~(swgl_StepSize - 1),
                     inside);
        if (inside > 0) {
          blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv,
                                        color, buf);
          buf += inside;
          uv.x += (inside / swgl_StepSize) * uv_step.x;
        }
      }
    }
  }
  // If the fallback filter was requested, or if there are any samples left that
  // may be outside the row and require clamping, then handle that with here.
  if (buf < end) {
    buf = blendTextureLinearFallback<BLEND>(
        sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf);
  }
  return buf;
}

// Helper function to quantize UVs for linear filtering before dispatch.
// Returns the number of pixels drawn (the whole span), or 0 if the sampler's
// format does not match the destination buffer.
template <bool BLEND, typename S, typename C, typename P>
static inline int blendTextureLinear(S sampler, vec2 uv, int span,
                                     const vec4_scalar& uv_rect, C color,
                                     P* buf, LinearFilter filter) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
  blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv, max_uv,
                                    color, buf, filter);
  return span;
}

// Samples an axis-aligned span of on a single row of a texture using 1:1
// nearest filtering. Sampling is constrained to only fall within the given UV
// bounds. This requires a pointer to the destination buffer. An optional color
// modulus can be supplied. Returns the number of pixels drawn (the whole
// span), or 0 on a sampler/destination format mismatch.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureNearestFast(S sampler, vec2 uv, int span,
                                   const vec4_scalar& uv_rect, C color,
                                   P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }

  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;

  ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv)));
  ivec2_scalar minUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}));
  ivec2_scalar maxUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}));

  // Calculate the row pointer within the buffer, clamping to within valid row
  // bounds.
  P* row =
      &((P*)sampler
            ->buf)[clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) *
                   sampler->stride];
  // Find clamped X bounds within the row.
  int minX = clamp(minUV.x, 0, sampler->width - 1);
  int maxX = clamp(maxUV.x, minX, sampler->width - 1);
  int curX = i.x;
  int endX = i.x + span;
  // If we need to start sampling below the valid sample bounds, then we need to
  // fill this section with a constant clamped sample.
  if (curX < minX) {
    int n = min(minX, endX) - curX;
    auto src =
        applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color);
    commit_solid_span<BLEND>(buf, src, n);
    buf += n;
    curX += n;
  }
  // Here we only deal with valid samples within the sample bounds. No clamping
  // should occur here within these inner loops.
  int n = max(min(maxX + 1, endX) - curX, 0);
  // Try to process as many chunks as possible with full loads and stores.
  for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
    auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color);
    commit_blend_span<BLEND>(buf, src);
  }
  n &= 3;
  // If we have any leftover samples after processing chunks, use partial loads
  // and stores.
  if (n > 0) {
    auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color);
    commit_blend_span<BLEND>(buf, src, n);
    buf += n;
    curX += n;
  }
  // If we still have samples left above the valid sample bounds, then we again
  // need to fill this section with a constant clamped sample.
  if (curX < endX) {
    auto src =
        applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color);
    commit_solid_span<BLEND>(buf, src, endX - curX);
  }
  return span;
}

// We need to verify that the pixel step reasonably approximates stepping by a
// single texel for every pixel we need to reproduce. Try to ensure that the
// margin of error is no more than approximately 2^-7. Also, we check here if
// the scaling can be quantized for acceleration.
// Returns 0 if the span steps ~1 texel per pixel (no scaling), 2 if it steps
// ~2 texels per pixel (2x downscale), or 1 for any other scale. The span is
// rounded up to a multiple of 128 so the comparison tolerates ~2^-7 error.
template <typename T>
static ALWAYS_INLINE int spanNeedsScale(int span, T P) {
  span &= ~(128 - 1);
  span += 128;
  int scaled = round((P.x.y - P.x.x) * span);
  return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0;
}

// Helper function to decide whether we can safely apply 1:1 nearest filtering
// without diverging too much from the linear filter. Selects which linear
// fast-path (if any) to use for the span.
template <typename S, typename T>
static inline LinearFilter needsTextureLinear(S sampler, T P, int span) {
  // If each row is not wide enough for linear filtering, then just use nearest
  // filtering.
  if (sampler->width < 2) {
    return LINEAR_FILTER_NEAREST;
  }
  // First verify if the row Y doesn't change across samples
  if (P.y.x != P.y.y) {
    return LINEAR_FILTER_FALLBACK;
  }
  P = samplerScale(sampler, P);
  if (int scale = spanNeedsScale(span, P)) {
    // If the source region is not flipped and smaller than the destination,
    // then we can use the upscaling filter since row Y is constant.
    return P.x.x < P.x.y && P.x.y - P.x.x <= 1
               ? LINEAR_FILTER_UPSCALE
               : (scale == 2 ? LINEAR_FILTER_DOWNSCALE
                             : LINEAR_FILTER_FALLBACK);
  }
  // Also verify that we're reasonably close to the center of a texel
  // so that it doesn't look that much different than if a linear filter
  // was used.
  if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 ||
      (int(P.y.x * 4.0f + 0.5f) & 3) != 2) {
    // The source and destination regions are the same, but there is a
    // significant subpixel offset. We can use a faster linear filter to deal
    // with the offset in this case.
    return LINEAR_FILTER_FAST;
  }
  // Otherwise, we have a constant 1:1 step and we're stepping reasonably close
  // to the center of each pixel, so it's safe to disable the linear filter and
  // use nearest.
  return LINEAR_FILTER_NEAREST;
}

// Commit an entire span with linear filtering. Picks a linear filter (or the
// nearest fast-path) for the span, then advances the output pointer and
// remaining span length by the number of pixels actually drawn.
#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n)            \
  do {                                                                       \
    auto packed_color = packColor(swgl_Out##format, color);                  \
    int len = (n);                                                           \
    int drawn = 0;                                                           \
    if (LinearFilter filter = needsTextureLinear(s, p, len)) {               \
      if (blend_key) {                                                       \
        drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color,   \
                                         swgl_Out##format, filter);          \
      } else {                                                               \
        drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color,  \
                                          swgl_Out##format, filter);         \
      }                                                                      \
    } else if (blend_key) {                                                  \
      drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color, \
                                            swgl_Out##format);               \
    } else {                                                                 \
      drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \
                                             swgl_Out##format);              \
    }                                                                        \
    swgl_Out##format += drawn;                                               \
    swgl_SpanLength -= drawn;                                                \
  } while (0)
#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \
  swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength)
#define swgl_commitTextureLinearR8(s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength)

// Commit a partial span with linear filtering, optionally inverting the color
#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(),      \
                           min(int(len), swgl_SpanLength))
#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(),        \
                           min(int(len), swgl_SpanLength))

// Commit an entire span with linear filtering that is scaled by a color
#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \
  swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength)
#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \
  swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength)

// Helper function that samples from an R8 texture while expanding it to support
// a differing framebuffer format. Returns the number of pixels drawn (the
// whole span), or 0 if the sampler is not R8 or is too narrow to filter.
template <bool BLEND, typename S, typename C, typename P>
static inline int blendTextureLinearR8(S sampler, vec2 uv, int span,
                                       const vec4_scalar& uv_rect, C color,
                                       P* buf) {
  if (!swgl_isTextureR8(sampler) || sampler->width < 2) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
  for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
    commit_blend_span<BLEND>(
        buf, applyColor(expand_mask(buf, textureLinearUnpackedR8(
                                             sampler,
                                             ivec2(clamp(uv, min_uv, max_uv)))),
                        color));
  }
  return span;
}

// Commit an entire span with linear filtering while expanding from R8 to RGBA8
#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color)   \
  do {                                                                 \
    auto packed_color = packColor(swgl_OutRGBA8, color);               \
    int drawn = 0;                                                     \
    if (blend_key) {                                                   \
      drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect, \
                                         packed_color, swgl_OutRGBA8); \
    } else {                                                           \
      drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \
                                          packed_color, swgl_OutRGBA8); \
    }                                                                  \
    swgl_OutRGBA8 += drawn;                                            \
    swgl_SpanLength -= drawn;                                          \
  } while (0)
#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \
  swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor())

// Compute repeating UVs, possibly constrained by tile repeat limits
static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) {
  if (tile_repeat.x > 0.0f) {
    // Clamp to a number slightly less than the tile repeat limit so that
    // it results in a number close to but not equal to 1 after fract().
    // This avoids fract() yielding 0 if the limit was left as whole integer.
    uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f);
  }
  return fract(uv);
}

// Compute the number of non-repeating steps before we need to potentially
// repeat the UVs. Returns a step count in [0, steps].
static inline int computeNoRepeatSteps(Float uv, float uv_step,
                                       float tile_repeat, int steps) {
  if (uv.w < uv.x) {
    // Ensure the UV taps are ordered low to high.
    uv = uv.wzyx;
  }
  // Check if the samples cross the boundary of the next whole integer or the
  // tile repeat limit, whichever is lower.
  float limit = floor(uv.x) + 1.0f;
  if (tile_repeat > 0.0f) {
    limit = min(limit, tile_repeat);
  }
  return uv.x >= 0.0f && uv.w < limit
             ? (uv_step != 0.0f
                    ? int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps)))
                    : steps)
             : 0;
}

// Blends an entire span of texture with linear filtering and repeating UVs.
// UVs are stepped unquantized so fract() can be applied; uv_scale/uv_offset
// map the fractional UVs into the quantized uv_repeat rect. Returns the number
// of pixels drawn (the whole span), or 0 on a format mismatch.
template <bool BLEND, typename S, typename C, typename P>
static int blendTextureLinearRepeat(S sampler, vec2 uv, int span,
                                    const vec2_scalar& tile_repeat,
                                    const vec4_scalar& uv_repeat,
                                    const vec4_scalar& uv_rect, C color,
                                    P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y};
  vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y};
  // Choose a linear filter to use for no-repeat sub-spans
  LinearFilter filter =
      needsTextureLinear(sampler, uv * uv_scale + uv_offset, span);
  // We need to step UVs unscaled and unquantized so that we can modulo them
  // with fract. We use uv_scale and uv_offset to map them into the correct
  // range.
  vec2_scalar uv_step =
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
  uv_scale = swgl_linearQuantizeStep(sampler, uv_scale);
  uv_offset = swgl_linearQuantize(sampler, uv_offset);
  vec2_scalar min_uv = max(
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f);
  vec2_scalar max_uv = max(
      swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv);
  for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
    int steps = int(end - buf) / swgl_StepSize;
    // Find the sub-span before UVs repeat to avoid expensive repeat math
    steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
    if (steps > 0) {
      steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
      if (steps > 0) {
        buf = blendTextureLinearDispatch<BLEND>(
            sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize,
            uv_step * uv_scale, min_uv, max_uv, color, buf, filter);
        if (buf >= end) {
          break;
        }
        uv += steps * uv_step;
      }
    }
    // UVs might repeat within this step, so explicitly compute repeated UVs
    vec2 repeated_uv = clamp(
        tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv);
    commit_blend_span<BLEND>(
        buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)),
                        color));
  }
  return span;
}

// Commit an entire span with linear filtering and repeating UVs
#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \
                                       uv_rect, color)                       \
  do {                                                                       \
    auto packed_color = packColor(swgl_Out##format, color);                  \
    int drawn = 0;                                                           \
    if (blend_key) {                                                         \
      drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength,          \
                                             tile_repeat, uv_repeat, uv_rect, \
                                             packed_color, swgl_Out##format); \
    } else {                                                                 \
      drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength,         \
                                              tile_repeat, uv_repeat, uv_rect, \
                                              packed_color, swgl_Out##format); \
    }                                                                        \
    swgl_Out##format += drawn;                                               \
    swgl_SpanLength -= drawn;                                                \
  } while (0)
#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat,     \
                                            uv_rect)                          \
  swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
                                 NoColor())
#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \
                                                 uv_rect, color)               \
  swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
                                 color)

// Overload dispatch on destination pixel type for packed nearest sampling.
template <typename S>
static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf,
                                                      S sampler, ivec2 i) {
  return textureNearestPackedRGBA8(sampler, i);
}

// Blends an entire span of texture with nearest filtering and either
// repeated or clamped UVs. Returns the number of pixels drawn (the whole
// span), or 0 on a format mismatch.
template <bool BLEND, bool REPEAT, typename S, typename C, typename P>
static int blendTextureNearestRepeat(S sampler, vec2 uv, int span,
                                     const vec2_scalar& tile_repeat,
                                     const vec4_scalar& uv_rect, C color,
                                     P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  if (!REPEAT) {
    // If clamping, then we step pre-scaled to the sampler. For repeat modes,
    // this will be accomplished via uv_scale instead.
    uv = samplerScale(sampler, uv);
  }
  vec2_scalar uv_step =
      float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
  vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y});
  vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w});
  vec2_scalar uv_scale = max_uv - min_uv;
  // If the effective sampling area of this texture is only a single pixel, then
  // treat it as a solid span. For repeat modes, the bounds are specified on
  // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so
  // the test varies depending on which. If the sample range on an axis is
  // greater than one pixel, we can still check if we don't move far enough from
  // the pixel center on that axis to hit the next pixel.
  if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) ||
       (abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) &&
      (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) ||
       (abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) {
    vec2 repeated_uv = REPEAT
                           ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
                           : clamp(uv, min_uv, max_uv);
    commit_solid_span<BLEND>(buf,
                             applyColor(unpack(textureNearestPacked(
                                            buf, sampler, ivec2(repeated_uv))),
                                        color),
                             span);
  } else {
    for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
      if (REPEAT) {
        int steps = int(end - buf) / swgl_StepSize;
        // Find the sub-span before UVs repeat to avoid expensive repeat math
        steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
        if (steps > 0) {
          steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
          if (steps > 0) {
            vec2 inside_uv = fract(uv) * uv_scale + min_uv;
            vec2 inside_step = uv_step * uv_scale;
            for (P* outside = &buf[steps * swgl_StepSize]; buf < outside;
                 buf += swgl_StepSize, inside_uv += inside_step) {
              commit_blend_span<BLEND>(
                  buf, applyColor(
                           textureNearestPacked(buf, sampler, ivec2(inside_uv)),
                           color));
            }
            if (buf >= end) {
              break;
            }
            uv += steps * uv_step;
          }
        }
      }

      // UVs might repeat within this step, so explicitly compute repeated UVs
      vec2 repeated_uv = REPEAT
                             ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
                             : clamp(uv, min_uv, max_uv);
      commit_blend_span<BLEND>(
          buf,
          applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)),
                     color));
    }
  }
  return span;
}

// Determine if we can use the fast nearest filter for the given nearest mode.
867 // If the Y coordinate varies more than half a pixel over 868 // the span (which might cause the texel to alias to the next one), or the span 869 // needs X scaling, then we have to use the fallback. 870 template <typename S, typename T> 871 static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) { 872 P = samplerScale(sampler, P); 873 return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P); 874 } 875 876 // Commit an entire span with nearest filtering and either clamped or repeating 877 // UVs 878 #define swgl_commitTextureNearest(format, s, p, uv_rect, color) \ 879 do { \ 880 auto packed_color = packColor(swgl_Out##format, color); \ 881 int drawn = 0; \ 882 if (needsNearestFallback(s, p, swgl_SpanLength)) { \ 883 if (blend_key) { \ 884 drawn = blendTextureNearestRepeat<true, false>( \ 885 s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ 886 swgl_Out##format); \ 887 } else { \ 888 drawn = blendTextureNearestRepeat<false, false>( \ 889 s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ 890 swgl_Out##format); \ 891 } \ 892 } else if (blend_key) { \ 893 drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect, \ 894 packed_color, swgl_Out##format); \ 895 } else { \ 896 drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect, \ 897 packed_color, swgl_Out##format); \ 898 } \ 899 swgl_Out##format += drawn; \ 900 swgl_SpanLength -= drawn; \ 901 } while (0) 902 #define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \ 903 swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor()) 904 #define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \ 905 swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color) 906 907 #define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \ 908 color) \ 909 do { \ 910 auto packed_color = packColor(swgl_Out##format, color); \ 911 int drawn = 0; \ 912 if (blend_key) { \ 913 drawn = blendTextureNearestRepeat<true, true>( \ 914 s, p, swgl_SpanLength, tile_repeat, 
uv_rect, packed_color, \ 915 swgl_Out##format); \ 916 } else { \ 917 drawn = blendTextureNearestRepeat<false, true>( \ 918 s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ 919 swgl_Out##format); \ 920 } \ 921 swgl_Out##format += drawn; \ 922 swgl_SpanLength -= drawn; \ 923 } while (0) 924 #define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ 925 uv_rect) \ 926 swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \ 927 NoColor()) 928 #define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \ 929 uv_repeat, uv_rect, color) \ 930 swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color) 931 932 // Commit an entire span of texture with filtering determined by sampler state. 933 #define swgl_commitTexture(format, s, ...) \ 934 do { \ 935 if (s->filter == TextureFilter::LINEAR) { \ 936 swgl_commitTextureLinear##format(s, __VA_ARGS__); \ 937 } else { \ 938 swgl_commitTextureNearest##format(s, __VA_ARGS__); \ 939 } \ 940 } while (0) 941 #define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__) 942 #define swgl_commitTextureColorRGBA8(...) \ 943 swgl_commitTexture(ColorRGBA8, __VA_ARGS__) 944 #define swgl_commitTextureRepeatRGBA8(...) \ 945 swgl_commitTexture(RepeatRGBA8, __VA_ARGS__) 946 #define swgl_commitTextureRepeatColorRGBA8(...) \ 947 swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__) 948 949 // Commit an entire span of a separable pass of a Gaussian blur that falls 950 // within the given radius scaled by supplied coefficients, clamped to uv_rect 951 // bounds. 
// Blends one horizontal separable pass of a Gaussian blur into the output
// span. `hori` selects the horizontal or vertical kernel; `radius` and
// `coeffs` parameterize the incremental Gaussian evaluation. Only whole
// swgl_StepSize chunks inside the clamped bounds are processed; returns the
// number of pixels actually drawn so the caller can handle the remainder.
template <bool BLEND, typename S, typename P>
static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect,
                             P* buf, int span, bool hori, int radius,
                             vec2_scalar coeffs) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  // Convert normalized UVs and the clamp rect into integer texel coordinates.
  vec2_scalar size = {float(sampler->width), float(sampler->height)};
  ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size);
  ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size));
  int startX = curUV.x;
  // Stop at the clamp rect's right edge, the span's end, or the texture edge,
  // whichever comes first.
  int endX = min(min(bounds.z, curUV.x + span), int(size.x));
  if (hori) {
    for (; curUV.x + swgl_StepSize <= endX;
         buf += swgl_StepSize, curUV.x += swgl_StepSize) {
      commit_blend_span<BLEND>(
          buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z,
                                         radius, coeffs.x, coeffs.y));
    }
  } else {
    for (; curUV.x + swgl_StepSize <= endX;
         buf += swgl_StepSize, curUV.x += swgl_StepSize) {
      commit_blend_span<BLEND>(
          buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w,
                                       radius, coeffs.x, coeffs.y));
    }
  }
  // Number of pixels committed; may be less than span.
  return curUV.x - startX;
}

#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \
  do {                                                                       \
    int drawn = 0;                                                           \
    if (blend_key) {                                                         \
      drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format,       \
                                      swgl_SpanLength, hori, radius, coeffs);\
    } else {                                                                 \
      drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format,      \
                                       swgl_SpanLength, hori, radius,        \
                                       coeffs);                              \
    }                                                                        \
    swgl_Out##format += drawn;                                               \
    swgl_SpanLength -= drawn;                                                \
  } while (0)
#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \
  swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs)
#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \
  swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs)

// Convert and pack planar YUV samples to RGB output using a color space
static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr,
                                            U16 y, U16 u, U16 v) {
  // Interleave lanes so the matrix convert sees (y,y) and (u,v) pairs.
  auto yy = V8<int16_t>(zip(y, y));
  auto uv = V8<int16_t>(zip(u, v));
  return rgb_from_ycbcr.convert(yy, uv);
}

// Helper functions to sample from planar YUV textures before converting to RGB
// Single-texture variant: all planes are packed into one texture.
template <typename S0>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           UNUSED int rescaleFactor) {
  switch (sampler0->format) {
    case TextureFormat::RGBA8: {
      // NOTE(review): lane selection assumes the planar RGBA8 layout places Y
      // in the high half of rg — confirm against textureLinearPlanarRGBA8.
      auto planar = textureLinearPlanarRGBA8(sampler0, uv0);
      return convertYUV(rgb_from_ycbcr, highHalf(planar.rg), lowHalf(planar.rg),
                        lowHalf(planar.ba));
    }
    case TextureFormat::YUY2: {
      auto planar = textureLinearPlanarYUY2(sampler0, uv0);
      return convertYUV(rgb_from_ycbcr, planar.y, planar.u, planar.v);
    }
    default:
      assert(false);
      return PackedRGBA8(0);
  }
}

// Blends an entire span of single-texture YUV, clamped to uv_rect0, optionally
// modulated by color. Returns span, or 0 if the sampler is not linear.
template <bool BLEND, typename S0, typename P, typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  if (!swgl_isTextureLinear(sampler0)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  // Fold the bias into the conversion matrix once, outside the loop.
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  auto c = packColor(buf, color);
  auto* end = buf + span;
  for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  rgb_from_ycbcr, rescaleFactor),
                        c));
  }
  return span;
}

// Two-texture variant: Y plane in sampler0, interleaved chroma in sampler1.
template <typename S0, typename S1>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
                                           ivec2 uv1,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           int rescaleFactor) {
  switch (sampler1->format) {
    case TextureFormat::RG8: {
      assert(sampler0->format == TextureFormat::R8);
      auto y = textureLinearUnpackedR8(sampler0, uv0);
      auto planar = textureLinearPlanarRG8(sampler1, uv1);
      return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.rg),
                        highHalf(planar.rg));
    }
    case TextureFormat::RGBA8: {
      assert(sampler0->format == TextureFormat::R8);
      auto y = textureLinearUnpackedR8(sampler0, uv0);
      auto planar = textureLinearPlanarRGBA8(sampler1, uv1);
      return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.ba),
                        highHalf(planar.rg));
    }
    case TextureFormat::RG16: {
      assert(sampler0->format == TextureFormat::R16);
      // The rescaling factor represents how many bits to add to renormalize the
      // texture to 16 bits, and so the color depth is actually 16 minus the
      // rescaling factor.
      // Need to right shift the sample by the amount of bits over 8 it
      // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
      // of precision at the low end already, hence 1 is subtracted from the
      // color depth.
      int colorDepth = 16 - rescaleFactor;
      int rescaleBits = (colorDepth - 1) - 8;
      auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
      auto uv = textureLinearUnpackedRG16(sampler1, uv1) >> rescaleBits;
      return rgb_from_ycbcr.convert(zip(y, y), uv);
    }
    default:
      assert(false);
      return PackedRGBA8(0);
  }
}

// Blends an entire span of two-texture (Y + interleaved UV) YUV, each plane
// clamped to its own uv_rect. Returns span, or 0 if either sampler is not
// linear.
template <bool BLEND, typename S0, typename S1, typename P,
          typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
                    const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  auto c = packColor(buf, color);
  auto* end = buf + span;
  for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                                  rgb_from_ycbcr, rescaleFactor),
                        c));
  }
  return span;
}

// Three-texture variant: separate Y, U, and V planes with matching formats.
template <typename S0, typename S1, typename S2>
static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
                                           ivec2 uv1, S2 sampler2, ivec2 uv2,
                                           const YUVMatrix& rgb_from_ycbcr,
                                           int rescaleFactor) {
  assert(sampler0->format == sampler1->format &&
         sampler0->format == sampler2->format);
  switch (sampler0->format) {
    case TextureFormat::R8: {
      auto y = textureLinearUnpackedR8(sampler0, uv0);
      auto u = textureLinearUnpackedR8(sampler1, uv1);
      auto v = textureLinearUnpackedR8(sampler2, uv2);
      return convertYUV(rgb_from_ycbcr, y, u, v);
    }
    case TextureFormat::R16: {
      // The rescaling factor represents how many bits to add to renormalize the
      // texture to 16 bits, and so the color depth is actually 16 minus the
      // rescaling factor.
      // Need to right shift the sample by the amount of bits over 8 it
      // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
      // of precision at the low end already, hence 1 is subtracted from the
      // color depth.
      int colorDepth = 16 - rescaleFactor;
      int rescaleBits = (colorDepth - 1) - 8;
      auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
      auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits;
      auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits;
      return convertYUV(rgb_from_ycbcr, U16(y), U16(u), U16(v));
    }
    default:
      assert(false);
      return PackedRGBA8(0);
  }
}

// Fallback helper for when we can't specifically accelerate YUV with
// composition.
// Generic per-chunk YUV blend loop used when the fast CompositeYUV path does
// not apply. UV steps and clamp bounds for each plane are precomputed by the
// caller (via LINEAR_QUANTIZE_UV).
template <bool BLEND, typename S0, typename S1, typename S2, typename P,
          typename C>
static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0,
                             vec2_scalar uv_step0, vec2_scalar min_uv0,
                             vec2_scalar max_uv0, S1 sampler1, vec2 uv1,
                             vec2_scalar uv_step1, vec2_scalar min_uv1,
                             vec2_scalar max_uv1, S2 sampler2, vec2 uv2,
                             vec2_scalar uv_step2, vec2_scalar min_uv2,
                             vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias,
                             const mat3_scalar& rgb_from_debiased_ycbcr,
                             int rescaleFactor, C color) {
  // Fold the bias into the conversion matrix once, outside the loop.
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0,
             uv1 += uv_step1, uv2 += uv_step2) {
    commit_blend_span<BLEND>(
        buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                                  sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                                  sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)),
                                  rgb_from_ycbcr, rescaleFactor),
                        color));
  }
}

// Blends an entire span of three-plane YUV. Returns span, or 0 if any sampler
// is not linear.
template <bool BLEND, typename S0, typename S1, typename S2, typename P,
          typename C = NoColor>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
                    const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2,
                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
      !swgl_isTextureLinear(sampler2)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
  auto c = packColor(buf, color);
  blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0,
                          sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2,
                          uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
                          rgb_from_debiased_ycbcr, rescaleFactor, c);
  return span;
}

// A variant of the blendYUV that attempts to reuse the inner loops from the
// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on
// the source data, which in turn allows it to be much faster than blendYUV.
// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer
// and that no color scaling is applied, which we can accomplish via template
// specialization. We need to further validate inside that texture formats
// and dimensions are sane for video and that the video is axis-aligned before
// acceleration can proceed.
template <bool BLEND>
static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, sampler2DRect sampler1,
                    vec2 uv1, const vec4_scalar& uv_rect1,
                    sampler2DRect sampler2, vec2 uv2,
                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, NoColor noColor = NoColor()) {
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
      !swgl_isTextureLinear(sampler2)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
  auto* end = buf + span;
  // CompositeYUV imposes further restrictions on the source textures, such that
  // the the Y/U/V samplers must all have a matching format, the U/V samplers
  // must have matching sizes and sample coordinates, and there must be no
  // change in row across the entire span.
  if (sampler0->format == sampler1->format &&
      sampler1->format == sampler2->format &&
      sampler1->width == sampler2->width &&
      sampler1->height == sampler2->height && uv_step0.y == 0 &&
      uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 &&
      uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) {
    // CompositeYUV does not support a clamp rect, so we must take care to
    // advance till we're inside the bounds of the clamp rect.
    int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x,
                                   (min_uv1.x - uv1.x.x) / uv_step1.x))),
                      (end - buf) / swgl_StepSize);
    if (outside > 0) {
      // Process the leading out-of-bounds chunks with the generic loop.
      blendYUVFallback<BLEND>(buf, outside * swgl_StepSize, sampler0, uv0,
                              uv_step0, min_uv0, max_uv0, sampler1, uv1,
                              uv_step1, min_uv1, max_uv1, sampler2, uv2,
                              uv_step2, min_uv2, max_uv2, ycbcr_bias,
                              rgb_from_debiased_ycbcr, rescaleFactor, noColor);
      buf += outside * swgl_StepSize;
      uv0.x += outside * uv_step0.x;
      uv1.x += outside * uv_step1.x;
      uv2.x += outside * uv_step2.x;
    }
    // Find the amount of chunks inside the clamp rect before we hit the
    // maximum. If there are any chunks inside, we can finally dispatch to
    // CompositeYUV.
    int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x,
                             (max_uv1.x - uv1.x.x) / uv_step1.x)),
                     (end - buf) / swgl_StepSize);
    if (inside > 0) {
      // We need the color depth, which is relative to the texture format and
      // rescale factor.
      int colorDepth =
          (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor;
      // Finally, call the inner loop of CompositeYUV.
      const auto rgb_from_ycbcr =
          YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
      linear_row_yuv<BLEND>(
          buf, inside * swgl_StepSize, sampler0, force_scalar(uv0),
          uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1),
          uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr);
      // Now that we're done, advance past the processed inside portion.
      buf += inside * swgl_StepSize;
      uv0.x += inside * uv_step0.x;
      uv1.x += inside * uv_step1.x;
      uv2.x += inside * uv_step2.x;
    }
  }
  // We either got here because we have some samples outside the clamp rect, or
  // because some of the preconditions were not satisfied. Process whatever is
  // left of the span.
  blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0,
                          max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1,
                          sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
                          rgb_from_debiased_ycbcr, rescaleFactor, noColor);
  return span;
}

// Commit a single chunk of a YUV surface represented by multiple planar
// textures. This requires a color space specifier selecting how to convert
// from YUV to RGB output. In the case of HDR formats, a rescaling factor
// selects how many bits of precision must be utilized on conversion. See the
// sampleYUV dispatcher functions for the various supported plane
// configurations this intrinsic accepts.
#define swgl_commitTextureLinearYUV(...)                                   \
  do {                                                                     \
    int drawn = 0;                                                         \
    if (blend_key) {                                                       \
      drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
    } else {                                                               \
      drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength,              \
                              __VA_ARGS__);                                \
    }                                                                      \
    swgl_OutRGBA8 += drawn;                                                \
    swgl_SpanLength -= drawn;                                              \
  } while (0)

// Commit a single chunk of a YUV surface scaled by a color.
#define swgl_commitTextureLinearColorYUV(...) \
  swgl_commitTextureLinearYUV(__VA_ARGS__)

// Each gradient stops entry is a pair of RGBA32F start color and end step.
struct GradientStops {
  Float startColor;
  union {
    Float stepColor;
    vec4_scalar stepData;
  };

  // Whether this gradient entry can be merged with an adjacent entry. The
  // step will be equal with the adjacent step if and only if they can be
  // merged, or rather, that the stops are actually part of a single larger
  // gradient.
  bool can_merge(const GradientStops& next) const {
    return stepData == next.stepData;
  }

  // Get the interpolated color within the entry based on the offset from its
  // start.
  Float interpolate(float offset) const {
    return startColor + stepColor * offset;
  }

  // Get the end color of the entry where interpolation stops.
  Float end_color() const { return startColor + stepColor; }
};

// Checks if a gradient table of the specified size exists at the UV coords of
// the address within an RGBA32F texture. If so, a linear address within the
// texture is returned that may be used to sample the gradient table later. If
// the address doesn't describe a valid gradient, then a negative value is
// returned.
static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address,
                                        int entries) {
  return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
                 address.y < int(sampler->height) && address.x >= 0 &&
                 address.x < int(sampler->width) && entries > 0 &&
                 address.x +
                         int(sizeof(GradientStops) / sizeof(Float)) * entries <=
                     int(sampler->width)
             ? address.y * sampler->stride + address.x * 4
             : -1;
}

// Like swgl_validateGradient, but validates the stops-tree layout where colors
// and packed stop offsets are stored separately.
static inline int swgl_validateGradientFromStops(sampler2D sampler,
                                                 ivec2_scalar address,
                                                 int entries) {
  // 1px (4 floats per color stop).
  int colors_size = entries;
  // 4 stop offsets (4 floats) per px.
  int stops_size = ((entries + 3) & ~3) / 4;
  return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
                 address.y < int(sampler->height) && address.x >= 0 &&
                 address.x < int(sampler->width) && entries > 0 &&
                 address.x + colors_size + stops_size <= int(sampler->width)
             ? address.y * sampler->stride + address.x * 4
             : -1;
}

// Samples one chunk (4 lanes) of gradient colors at the given fractional entry
// indices, blending between adjacent table entries per lane.
static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
                                       Float entry) {
  assert(sampler->format == TextureFormat::RGBA32F);
  assert(address >= 0 && address < int(sampler->height * sampler->stride));
  // Get the integer portion of the entry index to find the entry colors.
  I32 index = cast(entry);
  // Use the fractional portion of the entry index to control blending between
  // entry colors.
  Float offset = entry - cast(index);
  // Every entry is a pair of colors blended by the fractional offset.
  assert(test_all(index >= 0 &&
                  index * int(sizeof(GradientStops) / sizeof(Float)) <
                      int(sampler->width)));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // Blend between the colors for each SIMD lane, then pack them to RGBA8
  // result. Since the layout of the RGBA8 framebuffer is actually BGRA while
  // the gradient table has RGBA colors, swizzling is required.
  return combine(
      packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw),
                round_pixel(stops[index.y].interpolate(offset.y).zyxw)),
      packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw),
                round_pixel(stops[index.w].interpolate(offset.w).zyxw)));
}

// Samples a gradient entry from the gradient at the provided linearized
// address. The integer portion of the entry index is used to find the entry
// within the table whereas the fractional portion is used to blend between
// adjacent table entries.
1395 #define swgl_commitGradientRGBA8(sampler, address, entry) \ 1396 swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry)) 1397 1398 // Variant that allows specifying a color multiplier of the gradient result. 1399 #define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \ 1400 swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \ 1401 packColor(swgl_OutRGBA, color))) 1402 1403 // Precomputed noise for adding directly to four horizontally contiguous pixels 1404 // TODO: These should be updated for parity with the shader dither 1405 // implementation once something more final exists there. Right now, these are 1406 // very close but slightly off. 1407 static const WideRGBA8 ditherNoise[64] = { 1408 {2, 2, 2, 128, 194, 194, 194, 128, 50, 50, 50, 128, 242, 242, 242, 128}, 1409 {194, 194, 194, 128, 50, 50, 50, 128, 242, 242, 242, 128, 14, 14, 14, 128}, 1410 {50, 50, 50, 128, 242, 242, 242, 128, 14, 14, 14, 128, 206, 206, 206, 128}, 1411 {242, 242, 242, 128, 14, 14, 14, 128, 206, 206, 206, 128, 62, 62, 62, 128}, 1412 {14, 14, 14, 128, 206, 206, 206, 128, 62, 62, 62, 128, 254, 254, 254, 128}, 1413 {206, 206, 206, 128, 62, 62, 62, 128, 254, 254, 254, 128, 130, 130, 130, 1414 128}, 1415 {62, 62, 62, 128, 254, 254, 254, 128, 130, 130, 130, 128, 66, 66, 66, 128}, 1416 {254, 254, 254, 128, 130, 130, 130, 128, 66, 66, 66, 128, 178, 178, 178, 1417 128}, 1418 {130, 130, 130, 128, 66, 66, 66, 128, 178, 178, 178, 128, 114, 114, 114, 1419 128}, 1420 {66, 66, 66, 128, 178, 178, 178, 128, 114, 114, 114, 128, 142, 142, 142, 1421 128}, 1422 {178, 178, 178, 128, 114, 114, 114, 128, 142, 142, 142, 128, 78, 78, 78, 1423 128}, 1424 {114, 114, 114, 128, 142, 142, 142, 128, 78, 78, 78, 128, 190, 190, 190, 1425 128}, 1426 {142, 142, 142, 128, 78, 78, 78, 128, 190, 190, 190, 128, 126, 126, 126, 1427 128}, 1428 {78, 78, 78, 128, 190, 190, 190, 128, 126, 126, 126, 128, 34, 34, 34, 128}, 1429 {190, 190, 190, 128, 126, 126, 126, 128, 34, 34, 34, 
128, 226, 226, 226, 1430 128}, 1431 {126, 126, 126, 128, 34, 34, 34, 128, 226, 226, 226, 128, 18, 18, 18, 128}, 1432 {34, 34, 34, 128, 226, 226, 226, 128, 18, 18, 18, 128, 210, 210, 210, 128}, 1433 {226, 226, 226, 128, 18, 18, 18, 128, 210, 210, 210, 128, 46, 46, 46, 128}, 1434 {18, 18, 18, 128, 210, 210, 210, 128, 46, 46, 46, 128, 238, 238, 238, 128}, 1435 {210, 210, 210, 128, 46, 46, 46, 128, 238, 238, 238, 128, 30, 30, 30, 128}, 1436 {46, 46, 46, 128, 238, 238, 238, 128, 30, 30, 30, 128, 222, 222, 222, 128}, 1437 {238, 238, 238, 128, 30, 30, 30, 128, 222, 222, 222, 128, 162, 162, 162, 1438 128}, 1439 {30, 30, 30, 128, 222, 222, 222, 128, 162, 162, 162, 128, 98, 98, 98, 128}, 1440 {222, 222, 222, 128, 162, 162, 162, 128, 98, 98, 98, 128, 146, 146, 146, 1441 128}, 1442 {162, 162, 162, 128, 98, 98, 98, 128, 146, 146, 146, 128, 82, 82, 82, 128}, 1443 {98, 98, 98, 128, 146, 146, 146, 128, 82, 82, 82, 128, 174, 174, 174, 128}, 1444 {146, 146, 146, 128, 82, 82, 82, 128, 174, 174, 174, 128, 110, 110, 110, 1445 128}, 1446 {82, 82, 82, 128, 174, 174, 174, 128, 110, 110, 110, 128, 158, 158, 158, 1447 128}, 1448 {174, 174, 174, 128, 110, 110, 110, 128, 158, 158, 158, 128, 94, 94, 94, 1449 128}, 1450 {110, 110, 110, 128, 158, 158, 158, 128, 94, 94, 94, 128, 10, 10, 10, 128}, 1451 {158, 158, 158, 128, 94, 94, 94, 128, 10, 10, 10, 128, 202, 202, 202, 128}, 1452 {94, 94, 94, 128, 10, 10, 10, 128, 202, 202, 202, 128, 58, 58, 58, 128}, 1453 {10, 10, 10, 128, 202, 202, 202, 128, 58, 58, 58, 128, 250, 250, 250, 128}, 1454 {202, 202, 202, 128, 58, 58, 58, 128, 250, 250, 250, 128, 6, 6, 6, 128}, 1455 {58, 58, 58, 128, 250, 250, 250, 128, 6, 6, 6, 128, 198, 198, 198, 128}, 1456 {250, 250, 250, 128, 6, 6, 6, 128, 198, 198, 198, 128, 54, 54, 54, 128}, 1457 {6, 6, 6, 128, 198, 198, 198, 128, 54, 54, 54, 128, 246, 246, 246, 128}, 1458 {198, 198, 198, 128, 54, 54, 54, 128, 246, 246, 246, 128, 138, 138, 138, 1459 128}, 1460 {54, 54, 54, 128, 246, 246, 246, 128, 138, 138, 138, 128, 74, 74, 
74, 128}, 1461 {246, 246, 246, 128, 138, 138, 138, 128, 74, 74, 74, 128, 186, 186, 186, 1462 128}, 1463 {138, 138, 138, 128, 74, 74, 74, 128, 186, 186, 186, 128, 122, 122, 122, 1464 128}, 1465 {74, 74, 74, 128, 186, 186, 186, 128, 122, 122, 122, 128, 134, 134, 134, 1466 128}, 1467 {186, 186, 186, 128, 122, 122, 122, 128, 134, 134, 134, 128, 70, 70, 70, 1468 128}, 1469 {122, 122, 122, 128, 134, 134, 134, 128, 70, 70, 70, 128, 182, 182, 182, 1470 128}, 1471 {134, 134, 134, 128, 70, 70, 70, 128, 182, 182, 182, 128, 118, 118, 118, 1472 128}, 1473 {70, 70, 70, 128, 182, 182, 182, 128, 118, 118, 118, 128, 42, 42, 42, 128}, 1474 {182, 182, 182, 128, 118, 118, 118, 128, 42, 42, 42, 128, 234, 234, 234, 1475 128}, 1476 {118, 118, 118, 128, 42, 42, 42, 128, 234, 234, 234, 128, 26, 26, 26, 128}, 1477 {42, 42, 42, 128, 234, 234, 234, 128, 26, 26, 26, 128, 218, 218, 218, 128}, 1478 {234, 234, 234, 128, 26, 26, 26, 128, 218, 218, 218, 128, 38, 38, 38, 128}, 1479 {26, 26, 26, 128, 218, 218, 218, 128, 38, 38, 38, 128, 230, 230, 230, 128}, 1480 {218, 218, 218, 128, 38, 38, 38, 128, 230, 230, 230, 128, 22, 22, 22, 128}, 1481 {38, 38, 38, 128, 230, 230, 230, 128, 22, 22, 22, 128, 214, 214, 214, 128}, 1482 {230, 230, 230, 128, 22, 22, 22, 128, 214, 214, 214, 128, 170, 170, 170, 1483 128}, 1484 {22, 22, 22, 128, 214, 214, 214, 128, 170, 170, 170, 128, 106, 106, 106, 1485 128}, 1486 {214, 214, 214, 128, 170, 170, 170, 128, 106, 106, 106, 128, 154, 154, 154, 1487 128}, 1488 {170, 170, 170, 128, 106, 106, 106, 128, 154, 154, 154, 128, 90, 90, 90, 1489 128}, 1490 {106, 106, 106, 128, 154, 154, 154, 128, 90, 90, 90, 128, 166, 166, 166, 1491 128}, 1492 {154, 154, 154, 128, 90, 90, 90, 128, 166, 166, 166, 128, 102, 102, 102, 1493 128}, 1494 {90, 90, 90, 128, 166, 166, 166, 128, 102, 102, 102, 128, 150, 150, 150, 1495 128}, 1496 {166, 166, 166, 128, 102, 102, 102, 128, 150, 150, 150, 128, 86, 86, 86, 1497 128}, 1498 {102, 102, 102, 128, 150, 150, 150, 128, 86, 86, 86, 128, 2, 2, 2, 128}, 1499 
    {150, 150, 150, 128, 86, 86, 86, 128, 2, 2, 2, 128, 194, 194, 194, 128},
    {86, 86, 86, 128, 2, 2, 2, 128, 194, 194, 194, 128, 50, 50, 50, 128}};

// Returns the row of the 8x8 dither noise matrix selected by the fragment's
// Y coordinate. Each row holds 8 WideRGBA8 entries, one per X phase.
static ALWAYS_INLINE const WideRGBA8* getDitherNoise(int32_t fragCoordY) {
  return &ditherNoise[(fragCoordY & 7) * 8];
}

// Adds the dither noise entry for the fragment's X coordinate to the color.
// Values in color should be in the 0..0xFF00 range so that dithering has
// enough overhead to avoid overflow and underflow.
static ALWAYS_INLINE WideRGBA8 dither(WideRGBA8 color, int32_t fragCoordX,
                                      const WideRGBA8* ditherNoiseYIndexed) {
  return color + ditherNoiseYIndexed[fragCoordX & 7];
}

/// Find the gradient stops pair affecting the current offset by searching
/// into gradient stop offsets organized in a tree structure.
///
/// This is ported from sample_gradient_stops_tree in ps_quad_gradient.glsl.
/// The tree structure is explained in the documentation of
/// write_gpu_gradient_stops_tree in prim_store/gradient/mod.rs
///
/// On return, prevOffset/nextOffset contain the offsets of the two stops
/// bracketing `offset`, and the returned value is the index of the first
/// stop of that pair.
static int32_t findGradientStopPair(float offset, float* stops,
                                    int32_t numStops,
                                    float& prevOffset,
                                    float& nextOffset) {
  int32_t levelBaseAddr = 0;
  // Number of blocks of 4 indices for the current level.
  // At the root, a single block is stored. Each level stores
  // 5 times more blocks than the previous one.
  int32_t levelStride = 1;
  // Relative address within the current level.
  int32_t offsetInLevel = 0;
  // By the end of this function, this will contain the index of the
  // second stop of the pair we are looking for.
  int32_t index = 0;

  // The index distance between consecutive stop offsets at
  // the current level. At the last level, the stride is 1.
  // each has a 5 times more stride than the next (so the
  // index stride starts high and is divided by 5 at each
  // iteration).
  int32_t indexStride = 1;
  while (indexStride * 5 <= numStops) {
    indexStride *= 5;
  }

  // We take advantage of the fact that stop offsets are normalized from
  // 0 to 1 which means that the first offset is always 0 and the last is
  // always 1.
  // This is important because in the loop, we won't be setting prevOffset
  // if offset is < 0.0 and won't be setting nextOffset if offset > 1.0,
  // so initializing them this way here handles those cases.
  prevOffset = 0.0;
  nextOffset = 1.0;

  while (true) {
    // Each tree node stores 4 stop offsets, splitting the remaining range
    // into 5 partitions.
    int32_t addr = (levelBaseAddr + offsetInLevel) * 4;
    float currentStops0 = stops[addr];
    float currentStops1 = stops[addr + 1];
    float currentStops2 = stops[addr + 2];
    float currentStops3 = stops[addr + 3];

    // Determine which of the five partitions (sub-trees)
    // to take next.
    int32_t nextPartition = 4;
    if (currentStops0 > offset) {
      nextPartition = 0;
      nextOffset = currentStops0;
    } else if (currentStops1 > offset) {
      nextPartition = 1;
      prevOffset = currentStops0;
      nextOffset = currentStops1;
    } else if (currentStops2 > offset) {
      nextPartition = 2;
      prevOffset = currentStops1;
      nextOffset = currentStops2;
    } else if (currentStops3 > offset) {
      nextPartition = 3;
      prevOffset = currentStops2;
      nextOffset = currentStops3;
    } else {
      prevOffset = currentStops3;
    }

    index += nextPartition * indexStride;

    if (indexStride == 1) {
      // If the index stride is 1, we visited a leaf,
      // we are done.
      break;
    }

    // Descend into the chosen sub-tree on the next level.
    indexStride /= 5;
    levelBaseAddr += levelStride;
    levelStride *= 5;
    offsetInLevel = offsetInLevel * 5 + nextPartition;
  }

  // clamp the index to [1..numStops-1]
  if (index < 1) {
    index = 1;
  } else if (index > numStops - 1) {
    index = numStops - 1;
  }

  // index holds the second stop of the pair; return the first.
  return index - 1;
}

// Samples an entire span of a linear gradient by crawling the gradient table
// and looking for consecutive stops that can be merged into a single larger
// gradient, then interpolating between those larger gradients within the span.
template <bool BLEND, bool DITHER>
static bool commitLinearGradient(sampler2D sampler, int address, float size,
                                 bool tileRepeat, bool gradientRepeat, vec2 pos,
                                 const vec2_scalar& scaleDir, float startOffset,
                                 uint32_t* buf, int span,
                                 vec4 fragCoord = vec4()) {
  assert(sampler->format == TextureFormat::RGBA32F);
  assert(address >= 0 && address < int(sampler->height * sampler->stride));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // Get the chunk delta from the difference in offset steps. This represents
  // how far within the gradient table we advance for every step in output,
  // normalized to gradient table size.
  vec2_scalar posStep = dFdx(pos) * 4.0f;
  float delta = dot(posStep, scaleDir);
  if (!isfinite(delta)) {
    return false;
  }

  // Only incremented in the case of dithering
  int32_t currentFragCoordX = int32_t(fragCoord.x.x);
  const auto* ditherNoiseYIndexed =
      DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;

  // If we have a repeating brush, then the position will be modulo the [0,1)
  // interval. Compute coefficients that can be used to quickly evaluate the
  // distance to the interval boundary where the offset will wrap.
  vec2_scalar distCoeffsX = {0.25f * span, 0.0f};
  vec2_scalar distCoeffsY = distCoeffsX;
  if (tileRepeat) {
    if (posStep.x != 0.0f) {
      distCoeffsX = vec2_scalar{step(0.0f, posStep.x), 1.0f} * recip(posStep.x);
    }
    if (posStep.y != 0.0f) {
      distCoeffsY = vec2_scalar{step(0.0f, posStep.y), 1.0f} * recip(posStep.y);
    }
  }

  for (; span > 0;) {
    // Try to process as many chunks as are within the span if possible.
    float chunks = 0.25f * span;
    vec2 repeatPos = pos;
    if (tileRepeat) {
      // If this is a repeating brush, then limit the chunks to not cross the
      // interval boundaries.
      repeatPos = fract(pos);
      chunks = min(chunks, distCoeffsX.x - repeatPos.x.x * distCoeffsX.y);
      chunks = min(chunks, distCoeffsY.x - repeatPos.y.x * distCoeffsY.y);
    }
    // Compute the gradient offset from the position.
    Float offset =
        repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
    // If repeat is desired, we need to limit the offset to a fractional value.
    if (gradientRepeat) {
      offset = fract(offset);
    }
    // To properly handle both clamping and repeating of the table offset, we
    // need to ensure we don't run past the 0 and 1 points. Here we compute the
    // intercept points depending on whether advancing forwards or backwards in
    // the gradient table to ensure the chunk count is limited by the amount
    // before intersection. If there is no delta, then we compute no intercept.
    float startEntry;
    int minIndex, maxIndex;
    if (offset.x < 0) {
      // If we're below the gradient table, use the first color stop. We can
      // only intercept the table if walking forward.
      startEntry = 0;
      minIndex = int(startEntry);
      maxIndex = minIndex;
      if (delta > 0) {
        chunks = min(chunks, -offset.x / delta);
      }
    } else if (offset.x < 1) {
      // Otherwise, we're inside the gradient table.
      // Depending on the direction
      // we're walking the table, we may intersect either the 0 or 1 offset.
      // Compute the start entry based on our initial offset, and compute the
      // end entry based on the available chunks limited by intercepts. Clamp
      // them into the valid range of the table.
      startEntry = 1.0f + offset.x * size;
      if (delta < 0) {
        chunks = min(chunks, -offset.x / delta);
      } else if (delta > 0) {
        chunks = min(chunks, (1 - offset.x) / delta);
      }
      float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size,
                             0.0f, 1.0f + size);
      // Now that we know the range of entries we need to sample, we want to
      // find the largest possible merged gradient within that range. Depending
      // on which direction we are advancing in the table, we either walk up or
      // down the table trying to merge the current entry with the adjacent
      // entry. We finally limit the chunks to only sample from this merged
      // gradient.
      minIndex = int(startEntry);
      maxIndex = minIndex;
      if (delta > 0) {
        while (maxIndex + 1 < endEntry &&
               stops[maxIndex].can_merge(stops[maxIndex + 1])) {
          maxIndex++;
        }
        chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size));
      } else if (delta < 0) {
        while (minIndex - 1 > endEntry &&
               stops[minIndex - 1].can_merge(stops[minIndex])) {
          minIndex--;
        }
        chunks = min(chunks, (minIndex - startEntry) / (delta * size));
      }
    } else {
      // If we're above the gradient table, use the last color stop. We can
      // only intercept the table if walking backward.
      startEntry = 1.0f + size;
      minIndex = int(startEntry);
      maxIndex = minIndex;
      if (delta < 0) {
        chunks = min(chunks, (1 - offset.x) / delta);
      }
    }
    // If there are any amount of whole chunks of a merged gradient found,
    // then we want to process that as a single gradient span with the start
    // and end colors from the min and max entries.
    if (chunks >= 1.0f) {
      int inside = int(chunks);
      // Sample the start color from the min entry and the end color from the
      // max entry of the merged gradient. These are scaled to a range of
      // 0..0xFF00, as that is the largest shifted value that can fit in a U16.
      // For dithering, this allows room to avoid overflow and underflow
      // when applying the dither pattern. Since we are only doing addition with
      // the step value, we can still represent negative step values without
      // having to use an explicit sign bit, as the result will still come out
      // the same, allowing us to gain an extra bit of precision. We will later
      // shift these into 8 bit output range while committing the span, but
      // stepping with higher precision to avoid banding. We convert from RGBA
      // to BGRA here to avoid doing this in the inner loop.
      auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00);
      auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00);
      // Get the color range of the merged gradient, normalized to its size.
      auto colorRangeF =
          (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex));
      // Compute the actual starting color of the current start offset within
      // the merged gradient. The value 0.5 is added to the low bits (0x80) so
      // that the color will effectively round to the nearest increment below.
      auto colorF =
          minColorF + colorRangeF * (startEntry - minIndex) + float(0x80);
      // Compute the portion of the color range that we advance on each chunk.
      Float deltaColorF = colorRangeF * (delta * size);
      // Quantize the color delta and current color. These have already been
      // scaled to the 0..0xFF00 range, so we just need to round them to U16.
      auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
      for (int remaining = inside;;) {
        auto color =
            combine(CONVERT(round_pixel(colorF, 1), U16),
                    CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
                    CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
                    CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
        // Finally, step the current color through the output chunks, shifting
        // it into 8 bit range and outputting as we go. Only process a segment
        // at a time to avoid overflowing 8-bit precision due to rounding of
        // deltas.
        int segment = min(remaining, 256 / 4);
        for (auto* end = buf + segment * 4; buf < end; buf += 4) {
          if (DITHER) {
            commit_blend_span<BLEND>(
                buf,
                dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
            currentFragCoordX += 4;
          } else {
            commit_blend_span<BLEND>(buf, color >> 8);
          }
          color += deltaColor;
        }
        remaining -= segment;
        if (remaining <= 0) {
          break;
        }
        // Re-derive the color from the unquantized accumulator so rounding
        // error does not build up across segments.
        colorF += deltaColorF * segment;
      }
      // Deduct the number of chunks inside the gradient from the remaining
      // overall span. If we exhausted the span, bail out.
      span -= inside * 4;
      if (span <= 0) {
        break;
      }
      // Otherwise, assume we're in a transitional section of the gradient that
      // will probably require per-sample table lookups, so fall through below.
      // We need to re-evaluate the position and offset first, though.
      pos += posStep * float(inside);
      repeatPos = tileRepeat ?
          fract(pos) : pos;
      offset =
          repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
      if (gradientRepeat) {
        offset = fract(offset);
      }
    }
    // If we get here, there were no whole chunks of a merged gradient found
    // that we could process, but we still have a non-zero amount of span left.
    // That means we have segments of gradient that begin or end at the current
    // entry we're on. For this case, we just fall back to sampleGradient which
    // will calculate a table entry for each sample, assuming the samples may
    // have different table entries.
    Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
    if (DITHER) {
      // Scale the 8-bit sample up to the 0..0xFF00 dithering range before
      // adding noise, then shift back down to 8 bits.
      auto gradientSample = sampleGradient(sampler, address, entry) << 8;
      commit_blend_span<BLEND>(
          buf,
          dither(gradientSample, currentFragCoordX, ditherNoiseYIndexed) >> 8);
      currentFragCoordX += 4;
    } else {
      commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
    }
    span -= 4;
    buf += 4;
    pos += posStep;
  }
  return true;
}

// Samples an entire span of a linear gradient.
template <bool BLEND, bool DITHER>
static bool commitLinearGradientFromStops(sampler2D sampler, int offsetsAddress,
                                          int colorsAddress, float stopCount,
                                          bool gradientRepeat, vec2 pos,
                                          const vec2_scalar& scaleDir,
                                          float startOffset, uint32_t* buf,
                                          int span, vec4 fragCoord = vec4()) {
  assert(sampler->format == TextureFormat::RGBA32F);
  // Stop offsets are expected to be stored just after the colors.
  assert(colorsAddress >= 0 && colorsAddress < offsetsAddress);
  assert(offsetsAddress >= 0 && offsetsAddress + (stopCount + 3) / 4 <
                                    int(sampler->height * sampler->stride));
  float* stopOffsets = (float*)&sampler->buf[offsetsAddress];
  Float* stopColors = (Float*)&sampler->buf[colorsAddress];

  // Number of pixels per chunks.
  const float CHUNK_SIZE = 4.0f;

  // Only incremented in the case of dithering
  int32_t currentFragCoordX = int32_t(fragCoord.x.x);
  const auto* ditherNoiseYIndexed =
      DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;

  // Get the pixel delta from the difference in offset steps. This represents
  // how far within the gradient offset range we advance for every step in
  // output.
  vec2_scalar posStep = dFdx(pos);
  float delta = dot(posStep, scaleDir);
  if (!isfinite(delta)) {
    return false;
  }

  for (; span > 0;) {
    // The number of pixels that are affected by the current gradient stop pair.
    float subSpan = span;

    // Compute the gradient offset from the position.
    Float offset = pos.x * scaleDir.x + pos.y * scaleDir.y - startOffset;
    // If repeat is desired, we need to limit the offset to a fractional value.
    if (gradientRepeat) {
      offset = fract(offset);
    }

    int32_t stopIndex = 0;
    float prevOffset = 0.0;
    float nextOffset = 0.0;
    if (offset.x < 0) {
      // If before the start of the gradient stop range, then use the first
      // stop.
      if (delta > 0) {
        subSpan = min(subSpan, -offset.x / delta);
      }
    } else if (offset.x >= 1) {
      // If beyond the end of the gradient stop range, then use the last
      // stop.
      stopIndex = stopCount - 1;
      if (delta < 0) {
        subSpan = min(subSpan, (1.0f - offset.x) / delta);
      }
    } else {
      // Otherwise, we're inside the gradient stop range. Find the pair
      // that affects the start of the current block and how many blocks
      // are affected by the same pair.
      stopIndex =
          findGradientStopPair(offset.x, stopOffsets, stopCount,
                               prevOffset, nextOffset);
      // Distance (in offset units) to the stop boundary we are walking
      // toward, signed the same way as delta.
      float offsetRange =
          delta > 0.0f ? nextOffset - offset.x : prevOffset - offset.x;
      subSpan = min(subSpan, offsetRange / delta);
    }

    // Ensure that we advance by at least a pixel.
    subSpan = max(ceil(subSpan), 1.0f);

    // Sample the start colors of the gradient stop pair. These are scaled to
    // a range of 0..0xFF00, as that is the largest shifted value that can fit
    // in a U16. Since we are only doing addition with the step value, we can
    // still represent negative step values without having to use an explicit
    // sign bit, as the result will still come out the same, allowing us to gain
    // an extra bit of precision. We will later shift these into 8 bit output
    // range while committing the span, but stepping with higher precision to
    // avoid banding. We convert from RGBA to BGRA here to avoid doing this in
    // the inner loop.
    // The 256 factor is a leftover from a previous version of this code that
    // uses a 256 pixels gradient table. The math could be simplified to avoid
    // it but this change requires careful consideration of its interactions
    // with the dithering code.
    auto colorScale = (DITHER ? float(0xFF00) : 255.0f) * 256.0f;
    auto minColorF = stopColors[stopIndex].zyxw * colorScale;
    auto maxColorF = stopColors[stopIndex + 1].zyxw * colorScale;
    auto deltaOffset = nextOffset - prevOffset;
    // Get the color range of the merged gradient, normalized to its size.
    // Guard against a zero-width stop pair to avoid dividing by zero.
    Float colorRangeF = deltaOffset == 0.0f
                            ? Float(0.0f)
                            : (maxColorF - minColorF) * (1.0 / deltaOffset);

    // Compute the actual starting color of the current start offset within
    // the merged gradient. The value 0.5 is added to the low bits (0x80) so
    // that the color will effectively round to the nearest increment below.
    auto colorF =
        minColorF + colorRangeF * (offset.x - prevOffset) + float(0x80);

    // Compute the portion of the color range that we advance on each chunk.
    Float deltaColorF = colorRangeF * delta * CHUNK_SIZE;
    // Quantize the color delta and current color. These have already been
    // scaled to the 0..0xFF00 range, so we just need to round them to U16.
    auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
    // If there are any amount of whole chunks of a merged gradient found,
    // then we want to process that as a single gradient span.
    int chunks = int(subSpan) / 4;
    if (chunks > 0) {
      for (int remaining = chunks;;) {
        auto color =
            combine(CONVERT(round_pixel(colorF, 1), U16),
                    CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
                    CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
                    CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
        // Finally, step the current color through the output chunks, shifting
        // it into 8 bit range and outputting as we go. Only process a segment
        // at a time to avoid overflowing 8-bit precision due to rounding of
        // deltas.
        int segment = min(remaining, 256 / 4);
        for (auto* end = buf + segment * 4; buf < end; buf += 4) {
          if (DITHER) {
            commit_blend_span<BLEND>(
                buf,
                dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
            currentFragCoordX += 4;
          } else {
            commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8));
          }
          color += deltaColor;
        }
        remaining -= segment;
        // Re-derive from the unquantized accumulator so per-segment rounding
        // error does not build up. Note colorF is also left pointing at the
        // partial-chunk position used below.
        colorF += deltaColorF * segment;
        if (remaining <= 0) {
          break;
        }
      }
      span -= chunks * 4;
      pos += posStep * float(chunks) * CHUNK_SIZE;
    }

    // We may have a partial chunk to write.
    int remainder = int(subSpan - chunks * 4);
    if (remainder > 0) {
      assert(remainder < 4);
      // The logic here is similar to the full chunks loop above, but we do a
      // partial write instead of pushing a full chunk.
      auto color =
          combine(CONVERT(round_pixel(colorF, 1), U16),
                  CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
                  CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
                  CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
      if (DITHER) {
        // Note: comma operator — both the dither and the X advance happen.
        color = dither(color, currentFragCoordX, ditherNoiseYIndexed),
        currentFragCoordX += remainder;
      }
      commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8), remainder);

      buf += remainder;
      span -= remainder;
      pos += posStep * float(remainder);
    }
  }
  return true;
}

// Commits an entire span of a linear gradient, given the address of a table
// previously resolved with swgl_validateGradient. The size of the inner portion
// of the table is given, assuming the table start and ends with a single entry
// each to deal with clamping. Repeating will be handled if necessary. The
// initial offset within the table is used to designate where to start the span
// and how to step through the gradient table.
#define swgl_commitLinearGradientRGBA8(sampler, address, size, tileRepeat,    \
                                       gradientRepeat, pos, scaleDir,         \
                                       startOffset)                           \
  do {                                                                        \
    bool drawn = false;                                                       \
    if (blend_key) {                                                          \
      drawn = commitLinearGradient<true, false>(                              \
          sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir,  \
          startOffset, swgl_OutRGBA8, swgl_SpanLength);                       \
    } else {                                                                  \
      drawn = commitLinearGradient<false, false>(                             \
          sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir,  \
          startOffset, swgl_OutRGBA8, swgl_SpanLength);                       \
    }                                                                         \
    if (drawn) {                                                              \
      swgl_OutRGBA8 += swgl_SpanLength;                                       \
      swgl_SpanLength = 0;                                                    \
    }                                                                         \
  } while (0)

// Same as swgl_commitLinearGradientRGBA8, but with ordered dithering applied
// using the fragment coordinate to index the dither noise matrix.
#define swgl_commitDitheredLinearGradientRGBA8(sampler, address, size,        \
                                               tileRepeat, gradientRepeat,    \
                                               pos, scaleDir, startOffset)    \
  do {                                                                        \
    bool drawn = false;                                                       \
    if (blend_key) {                                                          \
      drawn = commitLinearGradient<true, true>(                               \
          sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir,  \
          startOffset, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);         \
    } else {                                                                  \
      drawn = commitLinearGradient<false, true>(                              \
          sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir,  \
          startOffset, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);         \
    }                                                                         \
    if (drawn) {                                                              \
      swgl_OutRGBA8 += swgl_SpanLength;                                       \
      swgl_SpanLength = 0;                                                    \
    }                                                                         \
  } while (0)

// Commits an entire span of a linear gradient sampled directly from the
// stop offsets/colors arrays rather than a pre-resolved gradient table.
#define swgl_commitLinearGradientFromStopsRGBA8(                              \
    sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,        \
    scaleDir, startOffset)                                                    \
  do {                                                                        \
    bool drawn = false;                                                       \
    if (blend_key) {                                                          \
      drawn = commitLinearGradientFromStops<true, false>(                     \
          sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,  \
          scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength);             \
    } else {                                                                  \
      drawn = commitLinearGradientFromStops<false, false>(                    \
          sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,  \
          scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength);             \
    }                                                                         \
    if (drawn) {                                                              \
      swgl_OutRGBA8 += swgl_SpanLength;                                       \
      swgl_SpanLength = 0;                                                    \
    }                                                                         \
  } while (0)

// Dithered variant of swgl_commitLinearGradientFromStopsRGBA8.
// NOTE(review): unlike the other dithered macro above, this one passes a
// `fragCoord` identifier (not `gl_FragCoord`), so it relies on a local named
// `fragCoord` at the expansion site — confirm against callers. Also, the
// `tileRepeat` parameter is accepted but never forwarded; presumably kept for
// signature parity — verify this is intentional.
#define swgl_commitDitheredLinearGradientFromStopsRGBA8(                      \
    sampler, offsetsAddress, colorsAddress, size, tileRepeat, gradientRepeat, \
    pos, scaleDir, startOffset)                                               \
  do {                                                                        \
    bool drawn = false;                                                       \
    if (blend_key) {                                                          \
      drawn = commitLinearGradientFromStops<true, true>(                      \
          sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,  \
          scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength, fragCoord);  \
    } else {                                                                  \
      drawn = commitLinearGradientFromStops<false, true>(                     \
          sampler, offsetsAddress, colorsAddress, size, gradientRepeat, pos,  \
          scaleDir, startOffset, swgl_OutRGBA8, swgl_SpanLength, fragCoord);  \
    }                                                                         \
    if (drawn) {                                                              \
      swgl_OutRGBA8 += swgl_SpanLength;                                       \
      swgl_SpanLength = 0;                                                    \
    }                                                                         \
  } while (0)

// Fast approximate square root. When CLAMP is set, the input is clamped away
// from zero/negative first so the reciprocal square root stays finite.
template <bool CLAMP, typename V>
static ALWAYS_INLINE V fastSqrt(V v) {
  if (CLAMP) {
    // Clamp to avoid zero or negative.
    v = max(v, V(1.0e-12f));
  }
#if USE_SSE2 || USE_NEON
  // sqrt(v) = v / sqrt(v) = v * inversesqrt(v); the reciprocal sqrt estimate
  // is much cheaper than a full sqrt on these targets.
  return v * inversesqrt(v);
#else
  return sqrt(v);
#endif
}

// Vector length computed as fastSqrt of the vector's dot product with itself.
template <bool CLAMP, typename V>
static ALWAYS_INLINE auto fastLength(V v) {
  return fastSqrt<CLAMP>(dot(v, v));
}

// Samples an entire span of a radial gradient by crawling the gradient table
// and looking for consecutive stops that can be merged into a single larger
// gradient, then interpolating between those larger gradients within the span
// based on the computed position relative to a radius.
2104 template <bool BLEND, bool DITHER> 2105 static bool commitRadialGradient(sampler2D sampler, int address, float size, 2106 bool repeat, vec2 pos, float radius, 2107 uint32_t* buf, int span, 2108 vec4 fragCoord = vec4()) { 2109 assert(sampler->format == TextureFormat::RGBA32F); 2110 assert(address >= 0 && address < int(sampler->height * sampler->stride)); 2111 GradientStops* stops = (GradientStops*)&sampler->buf[address]; 2112 // clang-format off 2113 // Given position p, delta d, and radius r, we need to repeatedly solve the 2114 // following quadratic for the pixel offset t: 2115 // length(p + t*d) = r 2116 // (px + t*dx)^2 + (py + t*dy)^2 = r^2 2117 // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is: 2118 // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0 2119 // t^2*d.d + t*2*d.p + (p.p-r^2) = 0 2120 // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to: 2121 // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d) 2122 // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so 2123 // we cache them below for faster computation. 2124 // 2125 // The quadratic has two solutions, representing the span intersecting the 2126 // given radius of gradient, which can occur at two offsets. If there is only 2127 // one solution (where b^2-4ac = 0), this represents the point at which the 2128 // span runs tangent to the radius. This middle point is significant in that 2129 // before it, we walk down the gradient ramp, and after it, we walk up the 2130 // ramp. 2131 // clang-format on 2132 vec2_scalar pos0 = {pos.x.x, pos.y.x}; 2133 vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x}; 2134 float deltaDelta = dot(delta, delta); 2135 if (!isfinite(deltaDelta) || !isfinite(radius)) { 2136 return false; 2137 } 2138 2139 // Only incremented in the case of dithering 2140 int32_t currentFragCoordX = int32_t(fragCoord.x.x); 2141 const auto* ditherNoiseYIndexed = 2142 DITHER ? 
getDitherNoise(int32_t(fragCoord.y.x)) : nullptr; 2143 2144 float invDelta, middleT, middleB; 2145 if (deltaDelta > 0) { 2146 invDelta = 1.0f / deltaDelta; 2147 middleT = -dot(delta, pos0) * invDelta; 2148 middleB = middleT * middleT - dot(pos0, pos0) * invDelta; 2149 } else { 2150 // If position is invariant, just set the coefficients so the quadratic 2151 // always reduces to the end of the span. 2152 invDelta = 0.0f; 2153 middleT = float(span); 2154 middleB = 0.0f; 2155 } 2156 // We only want search for merged gradients up to the minimum of either the 2157 // mid-point or the span length. Cache those offsets here as they don't vary 2158 // in the inner loop. 2159 Float middleEndRadius = fastLength<true>( 2160 pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f}); 2161 float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x; 2162 float endRadius = middleEndRadius.y; 2163 // Convert delta to change in position per chunk. 2164 delta *= 4; 2165 deltaDelta *= 4 * 4; 2166 // clang-format off 2167 // Given current position p and delta d, we reduce: 2168 // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p)) 2169 // where dot(p+d,p+d) can be accumulated as: 2170 // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2) 2171 // = p.p + 2p.d + d.d 2172 // Since p increases by d every loop iteration, p.d increases by d.d, and thus 2173 // we can accumulate d.d to calculate 2p.d, then allowing us to get the next 2174 // dot-product by adding it to dot-product p.p of the prior iteration. This 2175 // saves us some multiplications and an expensive sqrt inside the inner loop. 2176 // clang-format on 2177 Float dotPos = dot(pos, pos); 2178 Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta; 2179 float deltaDelta2 = 2.0f * deltaDelta; 2180 for (int t = 0; t < span;) { 2181 // Compute the gradient table offset from the current position. 
2182 Float offset = fastSqrt<true>(dotPos) - radius; 2183 float startRadius = radius; 2184 // If repeat is desired, we need to limit the offset to a fractional value. 2185 if (repeat) { 2186 // The non-repeating radius at which the gradient table actually starts, 2187 // radius + floor(offset) = radius + (offset - fract(offset)). 2188 startRadius += offset.x; 2189 offset = fract(offset); 2190 startRadius -= offset.x; 2191 } 2192 // We need to find the min/max index in the table of the gradient we want to 2193 // use as well as the intercept point where we leave this gradient. 2194 float intercept = -1; 2195 int minIndex = 0; 2196 int maxIndex = int(1.0f + size); 2197 if (offset.x < 0) { 2198 // If inside the inner radius of the gradient table, then use the first 2199 // stop. Set the intercept to advance forward to the start of the gradient 2200 // table. 2201 maxIndex = minIndex; 2202 if (t >= middleT) { 2203 intercept = radius; 2204 } 2205 } else if (offset.x < 1) { 2206 // Otherwise, we're inside the valid part of the gradient table. 2207 minIndex = int(1.0f + offset.x * size); 2208 maxIndex = minIndex; 2209 // Find the offset in the gradient that corresponds to the search limit. 2210 // We only search up to the minimum of either the mid-point or the span 2211 // length. Get the table index that corresponds to this offset, clamped so 2212 // that we avoid hitting the beginning (0) or end (1 + size) of the table. 2213 float searchOffset = 2214 (t >= middleT ? endRadius : middleRadius) - startRadius; 2215 int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size)); 2216 // If we are past the mid-point, walk up the gradient table trying to 2217 // merge stops. If we're below the mid-point, we need to walk down the 2218 // table. We note the table index at which we need to look for an 2219 // intercept to determine a valid span. 
2220 if (t >= middleT) { 2221 while (maxIndex + 1 <= searchIndex && 2222 stops[maxIndex].can_merge(stops[maxIndex + 1])) { 2223 maxIndex++; 2224 } 2225 intercept = maxIndex + 1; 2226 } else { 2227 while (minIndex - 1 >= searchIndex && 2228 stops[minIndex - 1].can_merge(stops[minIndex])) { 2229 minIndex--; 2230 } 2231 intercept = minIndex; 2232 } 2233 // Convert from a table index into units of radius from the center of the 2234 // gradient. 2235 intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius; 2236 } else { 2237 // If outside the outer radius of the gradient table, then use the last 2238 // stop. Set the intercept to advance toward the valid part of the 2239 // gradient table if going in, or just run to the end of the span if going 2240 // away from the gradient. 2241 minIndex = maxIndex; 2242 if (t < middleT) { 2243 intercept = radius + 1; 2244 } 2245 } 2246 // Solve the quadratic for t to find where the merged gradient ends. If no 2247 // intercept is found, just go to the middle or end of the span. 2248 float endT = t >= middleT ? span : min(span, int(middleT)); 2249 if (intercept >= 0) { 2250 float b = middleB + intercept * intercept * invDelta; 2251 if (b > 0) { 2252 b = fastSqrt<false>(b); 2253 endT = min(endT, t >= middleT ? middleT + b : middleT - b); 2254 } else { 2255 // Due to the imprecision of fastSqrt in offset calculations, solving 2256 // the quadratic may fail. However, if the discriminant is still close 2257 // to 0, then just assume it is 0. 2258 endT = min(endT, middleT); 2259 } 2260 } 2261 // Figure out how many chunks are actually inside the merged gradient. 2262 if (t + 4.0f <= endT) { 2263 int inside = int(endT - t) & ~3; 2264 // Convert start and end colors to BGRA and scale to 0..0xFF00 range 2265 // (for dithered) or 0..255 (for non-dithered) later. 2266 auto minColorF = 2267 stops[minIndex].startColor.zyxw * (DITHER ? float(0xFF00) : 255.0f); 2268 auto maxColorF = 2269 stops[maxIndex].end_color().zyxw * (DITHER ? 
float(0xFF00) : 255.0f); 2270 2271 // Compute the change in color per change in gradient offset. 2272 auto deltaColorF = 2273 (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex)); 2274 // Subtract off the color difference of the beginning of the current span 2275 // from the beginning of the gradient. 2276 Float colorF = 2277 minColorF - deltaColorF * (startRadius + (minIndex - 1) / size); 2278 // Finally, walk over the span accumulating the position dot product and 2279 // getting its sqrt as an offset into the color ramp. At this point we 2280 // just need to round to an integer and pack down to pixel format. 2281 for (auto* end = buf + inside; buf < end; buf += 4) { 2282 Float offsetG = fastSqrt<false>(dotPos); 2283 if (DITHER) { 2284 auto color = combine( 2285 CONVERT(round_pixel(colorF + deltaColorF * offsetG.x, 1), U16), 2286 CONVERT(round_pixel(colorF + deltaColorF * offsetG.y, 1), U16), 2287 CONVERT(round_pixel(colorF + deltaColorF * offsetG.z, 1), U16), 2288 CONVERT(round_pixel(colorF + deltaColorF * offsetG.w, 1), U16)); 2289 commit_blend_span<BLEND>( 2290 buf, dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8); 2291 currentFragCoordX += 4; 2292 } else { 2293 auto color = combine( 2294 packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1), 2295 round_pixel(colorF + deltaColorF * offsetG.y, 1)), 2296 packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1), 2297 round_pixel(colorF + deltaColorF * offsetG.w, 1))); 2298 commit_blend_span<BLEND>(buf, color); 2299 } 2300 2301 dotPos += dotPosDelta; 2302 dotPosDelta += deltaDelta2; 2303 } 2304 // Advance past the portion of gradient we just processed. 2305 t += inside; 2306 // If we hit the end of the span, exit out now. 2307 if (t >= span) { 2308 break; 2309 } 2310 // Otherwise, we are most likely in a transitional section of the gradient 2311 // between stops that will likely require doing per-sample table lookups. 
      // Rather than having to redo all the searching above to figure that out,
      // just assume that to be the case and fall through below to doing the
      // table lookups to hopefully avoid an iteration.
      offset = fastSqrt<true>(dotPos) - radius;
      if (repeat) {
        offset = fract(offset);
      }
    }
    // If we got here, that means we still have span left to process but did not
    // have any whole chunks that fell within a merged gradient. Just fall back
    // to doing a table lookup for each sample.
    Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
    commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
    buf += 4;
    t += 4;
    dotPos += dotPosDelta;
    dotPosDelta += deltaDelta2;
  }
  return true;
}

// Samples an entire span of a radial gradient. Rather than sampling a baked
// gradient table texture, this variant linearly interpolates directly between
// gradient stop colors read from the sampler's buffer. Returns false if the
// span could not be drawn (e.g. non-finite inputs), in which case the caller
// must fall back to per-fragment shading.
template <bool BLEND, bool DITHER>
static bool commitRadialGradientFromStops(sampler2D sampler, int offsetsAddress,
                                          int colorsAddress, float stopCount,
                                          bool repeat, vec2 pos,
                                          float startRadius, uint32_t* buf,
                                          int span, vec4 fragCoord = vec4()) {
  assert(sampler->format == TextureFormat::RGBA32F);
  // Stop offsets are expected to be stored just after the colors.
  assert(colorsAddress >= 0 && colorsAddress < offsetsAddress);
  assert(offsetsAddress >= 0 && offsetsAddress + (stopCount + 3) / 4 <
                                    int(sampler->height * sampler->stride));
  float* stopOffsets = (float*)&sampler->buf[offsetsAddress];
  Float* stopColors = (Float*)&sampler->buf[colorsAddress];
  // clang-format off
  // Given position p, delta d, and radius r, we need to repeatedly solve the
  // following quadratic for the pixel offset t:
  //   length(p + t*d) = r
  //   (px + t*dx)^2 + (py + t*dy)^2 = r^2
  // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is:
  //   t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0
  //   t^2*d.d + t*2*d.p + (p.p-r^2) = 0
  // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to:
  //   t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d)
  // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so
  // we cache them below for faster computation.
  //
  // The quadratic has two solutions, representing the span intersecting the
  // given radius of gradient, which can occur at two offsets. If there is only
  // one solution (where b^2-4ac = 0), this represents the point at which the
  // span runs tangent to the radius. This middle point is significant in that
  // before it, we walk down the gradient ramp, and after it, we walk up the
  // ramp.
  // clang-format on
  vec2_scalar pos0 = {pos.x.x, pos.y.x};
  vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x};
  float deltaDelta = dot(delta, delta);
  if (!isfinite(deltaDelta) || !isfinite(startRadius)) {
    return false;
  }

  // Only incremented in the case of dithering
  int32_t currentFragCoordX = int32_t(fragCoord.x.x);
  const auto* ditherNoiseYIndexed =
      DITHER ? getDitherNoise(int32_t(fragCoord.y.x)) : nullptr;

  float invDelta, middleT, middleB;
  if (deltaDelta > 0) {
    invDelta = 1.0f / deltaDelta;
    middleT = -dot(delta, pos0) * invDelta;
    middleB = middleT * middleT - dot(pos0, pos0) * invDelta;
  } else {
    // If position is invariant, just set the coefficients so the quadratic
    // always reduces to the end of the span.
    invDelta = 0.0f;
    middleT = float(span);
    middleB = 0.0f;
  }

  // Convert delta to change in position per chunk.
  delta *= 4;
  deltaDelta *= 4 * 4;
  // clang-format off
  // Given current position p and delta d, we reduce:
  //   length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p))
  // where dot(p+d,p+d) can be accumulated as:
  //   (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2)
  //                     = p.p + 2p.d + d.d
  // Since p increases by d every loop iteration, p.d increases by d.d, and thus
  // we can accumulate d.d to calculate 2p.d, then allowing us to get the next
  // dot-product by adding it to dot-product p.p of the prior iteration. This
  // saves us some multiplications and an expensive sqrt inside the inner loop.
  // clang-format on
  Float dotPos = dot(pos, pos);
  Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta;
  float deltaDelta2 = 2.0f * deltaDelta;

  for (int t = 0; t < span;) {
    // Compute the gradient table offset from the current position.
    Float offset = fastSqrt<true>(dotPos) - startRadius;
    float adjustedStartRadius = startRadius;
    // If repeat is desired, we need to limit the offset to a fractional value.
    if (repeat) {
      // The non-repeating radius at which the gradient table actually starts,
      // startRadius + floor(offset) = startRadius + (offset - fract(offset)).
      adjustedStartRadius += offset.x;
      offset = fract(offset);
      adjustedStartRadius -= offset.x;
    }

    // We need to find the pair of gradient stops that affect the current
    // portion of the span as well as the intercept point where we leave this
    // gradient.
    float intercept = -1;
    int32_t stopIndex = 0;
    float prevOffset = 0.0f;
    float nextOffset = 0.0f;
    if (offset.x < 0) {
      // If inside the inner radius of the gradient table, then use the first
      // stop. Set the intercept to advance forward to the start of the gradient
      // table.
      if (t >= middleT) {
        intercept = startRadius;
      }
    } else if (offset.x >= 1) {
      // If outside the outer radius of the gradient table, then use the last
      // stop. Set the intercept to advance toward the valid part of the
      // gradient table if going in, or just run to the end of the span if going
      // away from the gradient.
      stopIndex = stopCount - 1;
      if (t < middleT) {
        intercept = startRadius + 1;
      }
    } else {
      // Otherwise, we're inside the valid part of the gradient table.
      stopIndex =
          findGradientStopPair(offset.x, stopOffsets, stopCount,
                               prevOffset, nextOffset);
      // Before the tangent point we approach the center (walk down the ramp),
      // after it we move away (walk up), so pick the stop boundary we will
      // cross next accordingly.
      if (t >= middleT) {
        intercept = adjustedStartRadius + nextOffset;
      } else {
        intercept = adjustedStartRadius + prevOffset;
      }
    }
    // Solve the quadratic for t to find where the current stop pair ends. If no
    // intercept is found, just go to the middle or end of the span.
    float endT = t >= middleT ? span : min(span, int(middleT));
    if (intercept >= 0) {
      float b = middleB + intercept * intercept * invDelta;
      if (b > 0) {
        b = fastSqrt<false>(b);
        endT = min(endT, t >= middleT ? middleT + b : middleT - b);
      } else {
        // Due to the imprecision of fastSqrt in offset calculations, solving
        // the quadratic may fail. However, if the discriminant is still close
        // to 0, then just assume it is 0.
        endT = min(endT, middleT);
      }
    }
    // Ensure that we are advancing by at least one pixel at each iteration.
    endT = max(ceil(endT), t + 1.0f);

    // Figure out how many pixels belonging to whole chunks are inside the
    // gradient stop pair.
    int inside = int(endT - t) & ~3;
    // Convert start and end colors to BGRA and scale to 0..0xFF00 range
    // (for dithered) and 0..255 range (for non-dithered).
    auto minColorF =
        stopColors[stopIndex].zyxw * (DITHER ? float(0xFF00) : 255.0f);
    auto maxColorF =
        stopColors[stopIndex + 1].zyxw * (DITHER ? float(0xFF00) : 255.0f);

    // Compute the change in color per change in gradient offset.
    auto deltaOffset = nextOffset - prevOffset;
    Float deltaColorF =
        deltaOffset == 0.0f
            ?
            // Note: If we take this branch, we know that we are going to fill
            // some pixels with a solid color (we are in or out of the range of
            // gradient stops). We could leverage that to skip the offset
            // calculation.
            Float(0.0f)
            : (maxColorF - minColorF) / deltaOffset;
    // Subtract off the color difference of the beginning of the current span
    // from the beginning of the gradient.
    Float colorF = minColorF - deltaColorF * (adjustedStartRadius + prevOffset);
    // Finally, walk over the span accumulating the position dot product and
    // getting its sqrt as an offset into the color ramp. At this point we just
    // need to round to an integer and pack down to pixel format.
    for (auto* end = buf + inside; buf < end; buf += 4) {
      Float offsetG = fastSqrt<false>(dotPos);
      if (DITHER) {
        auto color = combine(
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.x, 1), U16),
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.y, 1), U16),
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.z, 1), U16),
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.w, 1), U16));
        commit_blend_span<BLEND>(
            buf, dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8);
        currentFragCoordX += 4;
      } else {
        auto color = combine(
            packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
                      round_pixel(colorF + deltaColorF * offsetG.y, 1)),
            packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
                      round_pixel(colorF + deltaColorF * offsetG.w, 1)));
        commit_blend_span<BLEND>(buf, color);
      }
      // Incrementally advance the position dot product for the next chunk.
      dotPos += dotPosDelta;
      dotPosDelta += deltaDelta2;
    }
    // Advance past the portion of gradient we just processed.
    t += inside;

    // If we hit the end of the span, exit out now.
    if (t >= span) {
      break;
    }

    // Otherwise we may have a partial chunk to write.
    int remainder = endT - t;
    if (remainder > 0) {
      assert(remainder < 4);
      // The logic here is similar to the full chunks loop above, but we do a
      // partial write instead of pushing a full chunk.
      Float offsetG = fastSqrt<false>(dotPos);
      if (DITHER) {
        auto color = combine(
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.x, 1), U16),
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.y, 1), U16),
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.z, 1), U16),
            CONVERT(round_pixel(colorF + deltaColorF * offsetG.w, 1), U16));
        commit_blend_span<BLEND>(
            buf, dither(color, currentFragCoordX, ditherNoiseYIndexed) >> 8,
            remainder);
        currentFragCoordX += 4;
      } else {
        auto color = combine(
            packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
                      round_pixel(colorF + deltaColorF * offsetG.y, 1)),
            packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
                      round_pixel(colorF + deltaColorF * offsetG.w, 1)));
        commit_blend_span<BLEND>(buf, color, remainder);
      }
      buf += remainder;
      t += remainder;

      // dotPosDelta's members are monotonically increasing, so adjusting the
      // step only requires undoing the factor of 4 and multiplying with the
      // actual number of remainder pixels.
      float partialDeltaDelta2 = deltaDelta2 * 0.25f * float(remainder);
      dotPosDelta += partialDeltaDelta2;

      // For dotPos, however, there is a compounding effect that makes the math
      // trickier. For simplicity's sake we are just computing the
      // parameters for a single-pixel step and applying it remainder times.

      // The deltaDelta2 for a single-pixel step (undoing the 4*4 factor we did
      // earlier when making deltaDelta2 work for 4-pixels chunks).
      float singlePxDeltaDelta2 = deltaDelta2 * 0.0625f;
      // The first single-pixel delta for dotPos (The difference between
      // dotPos's first two lanes).
      float dotPosDeltaFirst = dotPos.y - dotPos.x;
      // For each 1-pixel step the delta is applied and monotonically increased
      // by singlePxDeltaDelta2.
      Float pxOffsets = {0.0f, 1.0f, 2.0f, 3.0f};
      Float partialDotPosDelta =
          pxOffsets * singlePxDeltaDelta2 + dotPosDeltaFirst;

      // Apply each single-pixel step.
      for (int i = 0; i < remainder; ++i) {
        dotPos += partialDotPosDelta;
        partialDotPosDelta += singlePxDeltaDelta2;
      }
    }
  }
  return true;
}

// Commits an entire span of a radial gradient similar to
// swgl_commitLinearGradient, but given a varying 2D position scaled to
// gradient-space and a radius at which the distance from the origin maps to the
// start of the gradient table.
#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \
                                       radius)                              \
  do {                                                                      \
    bool drawn = false;                                                     \
    if (blend_key) {                                                        \
      drawn = commitRadialGradient<true, false>(                            \
          sampler, address, size, repeat, pos, radius, swgl_OutRGBA8,       \
          swgl_SpanLength);                                                 \
    } else {                                                                \
      drawn = commitRadialGradient<false, false>(                           \
          sampler, address, size, repeat, pos, radius, swgl_OutRGBA8,       \
          swgl_SpanLength);                                                 \
    }                                                                       \
    if (drawn) {                                                            \
      swgl_OutRGBA8 += swgl_SpanLength;                                     \
      swgl_SpanLength = 0;                                                  \
    }                                                                       \
  } while (0)

#define swgl_commitDitheredRadialGradientRGBA8(sampler, address, size, repeat, \
                                               pos, radius)                    \
  do {                                                                         \
    bool drawn = false;                                                        \
    if (blend_key) {                                                           \
      drawn = commitRadialGradient<true, true>(sampler, address, size, repeat, \
                                               pos, radius, swgl_OutRGBA8,     \
                                               swgl_SpanLength, gl_FragCoord); \
    } else {                                                                   \
      drawn = commitRadialGradient<false, true>(                               \
          sampler, address, size, repeat, pos, radius, swgl_OutRGBA8,          \
          swgl_SpanLength, gl_FragCoord);                                      \
    }                                                                          \
    if (drawn) {                                                               \
      swgl_OutRGBA8 += swgl_SpanLength;                                        \
      swgl_SpanLength = 0;                                                     \
    }                                                                          \
  } while (0)

// Commits an entire span of a radial gradient similar to
// swgl_commitLinearGradient, but given a varying 2D position scaled to
// gradient-space
// and a radius at which the distance from the origin maps to the
// start of the gradient table.
#define swgl_commitRadialGradientFromStopsRGBA8(                            \
    sampler, offsetsAddress, colorsAddress, size, repeat, pos, startRadius) \
  do {                                                                      \
    bool drawn = false;                                                     \
    if (blend_key) {                                                        \
      drawn = commitRadialGradientFromStops<true, false>(                   \
          sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
          startRadius, swgl_OutRGBA8, swgl_SpanLength);                     \
    } else {                                                                \
      drawn = commitRadialGradientFromStops<false, false>(                  \
          sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
          startRadius, swgl_OutRGBA8, swgl_SpanLength);                     \
    }                                                                       \
    if (drawn) {                                                            \
      swgl_OutRGBA8 += swgl_SpanLength;                                     \
      swgl_SpanLength = 0;                                                  \
    }                                                                       \
  } while (0)

#define swgl_commitDitheredRadialGradientFromStopsRGBA8(                    \
    sampler, offsetsAddress, colorsAddress, size, repeat, pos, startRadius) \
  do {                                                                      \
    bool drawn = false;                                                     \
    if (blend_key) {                                                        \
      drawn = commitRadialGradientFromStops<true, true>(                    \
          sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
          startRadius, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);       \
    } else {                                                                \
      drawn = commitRadialGradientFromStops<false, true>(                   \
          sampler, offsetsAddress, colorsAddress, size, repeat, pos,        \
          startRadius, swgl_OutRGBA8, swgl_SpanLength, gl_FragCoord);       \
    }                                                                       \
    if (drawn) {                                                            \
      swgl_OutRGBA8 += swgl_SpanLength;                                     \
      swgl_SpanLength = 0;                                                  \
    }                                                                       \
  } while (0)

// Extension to set a clip mask image to be sampled during blending. The offset
// specifies the positioning of the clip mask image relative to the viewport
// origin. The bounding box specifies the rectangle relative to the clip mask's
// origin that constrains sampling within the clip mask. Blending must be
// enabled for this to work.
static sampler2D swgl_ClipMask = nullptr;
static IntPoint swgl_ClipMaskOffset = {0, 0};
static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0};
#define swgl_clipMask(mask, offset, bb_origin, bb_size)        \
  do {                                                         \
    if (bb_size != vec2_scalar(0.0f, 0.0f)) {                  \
      swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK;                   \
      swgl_ClipMask = mask;                                    \
      swgl_ClipMaskOffset = make_ivec2(offset);                \
      swgl_ClipMaskBounds =                                    \
          IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \
    }                                                          \
  } while (0)

// Extension to enable anti-aliasing for the given edges of a quad.
// Blending must be enabled for this to work.
static int swgl_AAEdgeMask = 0;

// Convert the supported edge specifications into a 4-bit edge mask:
// a single bool enables all edges or none, an int is used as-is, and a
// bvec4_scalar maps each component to one edge bit.
static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; }
static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; }
static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) {
  return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) |
         (mask.w ? 8 : 0);
}

#define swgl_antiAlias(edges)                \
  do {                                       \
    swgl_AAEdgeMask = calcAAEdgeMask(edges); \
    if (swgl_AAEdgeMask) {                   \
      swgl_ClipFlags |= SWGL_CLIP_FLAG_AA;   \
    }                                        \
  } while (0)

#define swgl_blendDropShadow(color)                         \
  do {                                                      \
    swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE;        \
    swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \
    swgl_BlendColorRGBA8 = packColor<uint32_t>(color);      \
  } while (0)

#define swgl_blendSubpixelText(color)                         \
  do {                                                        \
    swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE;          \
    swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \
    swgl_BlendColorRGBA8 = packColor<uint32_t>(color);        \
    swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8);      \
  } while (0)

// Dispatch helper used by the GLSL translator to swgl_drawSpan functions.
// The number of pixels committed is tracked by checking for the difference in
// swgl_SpanLength.
Any varying interpolants used will be advanced past the 2733 // committed part of the span in case the fragment shader must be executed for 2734 // any remaining pixels that were not committed by the span shader. 2735 #define DISPATCH_DRAW_SPAN(self, format) \ 2736 do { \ 2737 int total = self->swgl_SpanLength; \ 2738 self->swgl_drawSpan##format(); \ 2739 int drawn = total - self->swgl_SpanLength; \ 2740 if (drawn) self->step_interp_inputs(drawn); \ 2741 return drawn; \ 2742 } while (0)