/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// The SWGL depth buffer is roughly organized as a span buffer where each row
// of the depth buffer is a list of spans, and each span has a constant depth
// and a run length (represented by DepthRun). The span from start..start+count
// is placed directly at that start index in the row's array of runs, so that
// there is no need to explicitly record the start index at all. This also
// avoids the need to move items around in the run array to manage insertions
// since space is implicitly always available for a run between any two
// pre-existing runs. Linkage from one run to the next is implicitly defined by
// the count, so if a run exists from start..start+count, the next run will
// implicitly pick up right at index start+count where that preceding run left
// off. All of the DepthRun items that are after the head of the run can remain
// uninitialized until the run needs to be split and a new run needs to start
// somewhere in between.
// For uses like perspective-correct rasterization or with a discard mask, a
// run is not an efficient representation, and it is more beneficial to have
// a flattened array of individual depth samples that can be masked off easily.
// To support this case, the first run in a given row's run array may have a
// zero count, signaling that this entire row is flattened. Critically, the
// depth and count fields in DepthRun are ordered (endian-dependently) so that
// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It
// is then possible to just treat the entire row as an array of int32_t depth
// samples that can be processed with SIMD comparisons, since the count field
// behaves as just the sign-extension of the depth field.
The count field is 28 // limited to 8 bits so that we can support depth values up to 24 bits. 29 // When a depth buffer is cleared, each row is initialized to a maximal runs 30 // spanning the entire row. In the normal case, the depth buffer will continue 31 // to manage itself as a list of runs. If perspective or discard is used for 32 // a given row, the row will be converted to the flattened representation to 33 // support it, after which it will only ever revert back to runs if the depth 34 // buffer is cleared. 35 36 // The largest 24-bit depth value supported. 37 constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF; 38 // The longest 8-bit depth run that is supported, aligned to SIMD chunk size. 39 constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3; 40 41 struct DepthRun { 42 // Ensure that depth always occupies the LSB and count the MSB so that we 43 // can sign-extend depth just by setting count to zero, marking it flat. 44 // When count is non-zero, then this is interpreted as an actual run and 45 // depth is read in isolation. 46 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 47 uint32_t depth : 24; 48 uint32_t count : 8; 49 #else 50 uint32_t count : 8; 51 uint32_t depth : 24; 52 #endif 53 54 DepthRun() = default; 55 DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {} 56 57 // If count is zero, this is actually a flat depth sample rather than a run. 58 bool is_flat() const { return !count; } 59 60 // Compare a source depth from rasterization with a stored depth value. 61 template <int FUNC> 62 ALWAYS_INLINE bool compare(uint32_t src) const { 63 switch (FUNC) { 64 case GL_LEQUAL: 65 return src <= depth; 66 case GL_LESS: 67 return src < depth; 68 case GL_ALWAYS: 69 return true; 70 default: 71 assert(false); 72 return false; 73 } 74 } 75 }; 76 77 // Fills runs at the given position with the given depth up to the span width. 
78 static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth, 79 uint32_t width) { 80 // If the width exceeds the maximum run size, then we need to output clamped 81 // runs first. 82 for (; width >= MAX_DEPTH_RUN; 83 runs += MAX_DEPTH_RUN, width -= MAX_DEPTH_RUN) { 84 *runs = DepthRun(depth, MAX_DEPTH_RUN); 85 } 86 // If there are still any left over samples to fill under the maximum run 87 // size, then output one last run for them. 88 if (width > 0) { 89 *runs = DepthRun(depth, width); 90 } 91 } 92 93 // A cursor for reading and modifying a row's depth run array. It locates 94 // and iterates through a desired span within all the runs, testing if 95 // the depth of this span passes or fails the depth test against existing 96 // runs. If desired, new runs may be inserted to represent depth occlusion 97 // from this span in the run array. 98 struct DepthCursor { 99 // Current position of run the cursor has advanced to. 100 DepthRun* cur = nullptr; 101 // The start of the remaining potential samples in the desired span. 102 DepthRun* start = nullptr; 103 // The end of the potential samples in the desired span. 104 DepthRun* end = nullptr; 105 106 DepthCursor() = default; 107 108 // Construct a cursor with runs for a given row's run array and the bounds 109 // of the span we wish to iterate within it. 110 DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count) 111 : cur(runs), start(&runs[span_offset]), end(start + span_count) { 112 // This cursor should never iterate over flat runs 113 assert(!runs->is_flat()); 114 DepthRun* end_runs = &runs[num_runs]; 115 // Clamp end of span to end of row 116 if (end > end_runs) { 117 end = end_runs; 118 } 119 // If the span starts past the end of the row, just advance immediately 120 // to it to signal that we're done. 121 if (start >= end_runs) { 122 cur = end_runs; 123 start = end_runs; 124 return; 125 } 126 // Otherwise, find the first depth run that contains the start of the span. 
127 // If the span starts after the given run, then we need to keep searching 128 // through the row to find an appropriate run. The check above already 129 // guaranteed that the span starts within the row's runs, and the search 130 // won't fall off the end. 131 for (;;) { 132 assert(cur < end); 133 DepthRun* next = cur + cur->count; 134 if (start < next) { 135 break; 136 } 137 cur = next; 138 } 139 } 140 141 // The cursor is valid if the current position is at the end or if the run 142 // contains the start position. 143 bool valid() const { 144 return cur >= end || (cur <= start && start < cur + cur->count); 145 } 146 147 // Skip past any initial runs that fail the depth test. If we find a run that 148 // would pass, then return the accumulated length between where we started 149 // and that position. Otherwise, if we fall off the end, return -1 to signal 150 // that there are no more passed runs at the end of this failed region and 151 // so it is safe for the caller to stop processing any more regions in this 152 // row. 153 template <int FUNC> 154 int skip_failed(uint32_t val) { 155 assert(valid()); 156 DepthRun* prev = start; 157 while (cur < end) { 158 if (cur->compare<FUNC>(val)) { 159 return start - prev; 160 } 161 cur += cur->count; 162 start = cur; 163 } 164 return -1; 165 } 166 167 // Helper to convert function parameters into template parameters to hoist 168 // some checks out of inner loops. 169 ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) { 170 switch (func) { 171 case GL_LEQUAL: 172 return skip_failed<GL_LEQUAL>(val); 173 case GL_LESS: 174 return skip_failed<GL_LESS>(val); 175 default: 176 assert(false); 177 return -1; 178 } 179 } 180 181 // Find a region of runs that passes the depth test. It is assumed the caller 182 // has called skip_failed first to skip past any runs that failed the depth 183 // test. This stops when it finds a run that fails the depth test or we fall 184 // off the end of the row. 
If the write mask is enabled, this will insert runs 185 // to represent this new region that passed the depth test. The length of the 186 // region is returned. 187 template <int FUNC, bool MASK> 188 int check_passed(uint32_t val) { 189 assert(valid()); 190 DepthRun* prev = cur; 191 while (cur < end) { 192 if (!cur->compare<FUNC>(val)) { 193 break; 194 } 195 DepthRun* next = cur + cur->count; 196 if (next > end) { 197 if (MASK) { 198 // Chop the current run where the end of the span falls, making a new 199 // run from the end of the span till the next run. The beginning of 200 // the current run will be folded into the run from the start of the 201 // passed region before returning below. 202 *end = DepthRun(cur->depth, next - end); 203 } 204 // If the next run starts past the end, then just advance the current 205 // run to the end to signal that we're now at the end of the row. 206 next = end; 207 } 208 cur = next; 209 } 210 // If we haven't advanced past the start of the span region, then we found 211 // nothing that passed. 212 if (cur <= start) { 213 return 0; 214 } 215 // If 'end' fell within the middle of a passing run, then 'cur' will end up 216 // pointing at the new partial run created at 'end' where the passing run 217 // was split to accommodate starting in the middle. The preceding runs will 218 // be fixed below to properly join with this new split. 219 int passed = cur - start; 220 if (MASK) { 221 // If the search started from a run before the start of the span, then 222 // edit that run to meet up with the start. 223 if (prev < start) { 224 prev->count = start - prev; 225 } 226 // Create a new run for the entirety of the passed samples. 227 set_depth_runs(start, val, passed); 228 } 229 start = cur; 230 return passed; 231 } 232 233 // Helper to convert function parameters into template parameters to hoist 234 // some checks out of inner loops. 
235 template <bool MASK> 236 ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) { 237 switch (func) { 238 case GL_LEQUAL: 239 return check_passed<GL_LEQUAL, MASK>(val); 240 case GL_LESS: 241 return check_passed<GL_LESS, MASK>(val); 242 default: 243 assert(false); 244 return 0; 245 } 246 } 247 248 ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) { 249 return mask ? check_passed<true>(val, func) 250 : check_passed<false>(val, func); 251 } 252 253 // Fill a region of runs with a given depth value, bypassing any depth test. 254 ALWAYS_INLINE void fill(uint32_t depth) { 255 check_passed<GL_ALWAYS, true>(depth); 256 } 257 }; 258 259 // Initialize a depth texture by setting the first run in each row to encompass 260 // the entire row. 261 void Texture::init_depth_runs(uint32_t depth) { 262 if (!buf) return; 263 DepthRun* runs = (DepthRun*)buf; 264 for (int y = 0; y < height; y++) { 265 set_depth_runs(runs, depth, width); 266 runs += stride() / sizeof(DepthRun); 267 } 268 set_cleared(true); 269 } 270 271 // Fill a portion of the run array with flattened depth samples. 272 static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n, 273 uint32_t depth) { 274 fill_n((uint32_t*)dst, n, depth); 275 } 276 277 // Fills a scissored region of a depth texture with a given depth. 278 void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) { 279 if (!buf) return; 280 assert(cleared()); 281 IntRect bb = bounds().intersection(scissor - offset); 282 DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0); 283 for (int rows = bb.height(); rows > 0; rows--) { 284 if (bb.width() >= width) { 285 // If the scissor region encompasses the entire row, reset the row to a 286 // single run encompassing the entire row. 287 set_depth_runs(runs, depth, width); 288 } else if (runs->is_flat()) { 289 // If the row is flattened, just directly fill the portion of the row. 
290 fill_flat_depth(&runs[bb.x0], bb.width(), depth); 291 } else { 292 // Otherwise, if we are still using runs, then set up a cursor to fill 293 // it with depth runs. 294 DepthCursor(runs, width, bb.x0, bb.width()).fill(depth); 295 } 296 runs += stride() / sizeof(DepthRun); 297 } 298 } 299 300 using ZMask = I32; 301 302 #if USE_SSE2 303 # define ZMASK_NONE_PASSED 0xFFFF 304 # define ZMASK_ALL_PASSED 0 305 static inline uint32_t zmask_code(ZMask mask) { 306 return _mm_movemask_epi8(mask); 307 } 308 #else 309 # define ZMASK_NONE_PASSED 0xFFFFFFFFU 310 # define ZMASK_ALL_PASSED 0 311 static inline uint32_t zmask_code(ZMask mask) { 312 return bit_cast<uint32_t>(CONVERT(mask, U8)); 313 } 314 #endif 315 316 // Interprets items in the depth buffer as sign-extended 32-bit depth values 317 // instead of as runs. Returns a mask that signals which samples in the given 318 // chunk passed or failed the depth test with given Z value. 319 template <bool DISCARD> 320 static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask, 321 int span = 4) { 322 // SSE2 does not support unsigned comparison. So ensure Z value is 323 // sign-extended to int32_t. 324 I32 dest = unaligned_load<I32>(zbuf); 325 // Invert the depth test to check which pixels failed and should be discarded. 326 ZMask mask = ctx->depthfunc == GL_LEQUAL 327 ? 328 // GL_LEQUAL: Not(LessEqual) = Greater 329 ZMask(src > dest) 330 : 331 // GL_LESS: Not(Less) = GreaterEqual 332 ZMask(src >= dest); 333 // Mask off any unused lanes in the span. 
334 mask |= ZMask(span) < ZMask{1, 2, 3, 4}; 335 if (zmask_code(mask) == ZMASK_NONE_PASSED) { 336 return false; 337 } 338 if (!DISCARD && ctx->depthmask) { 339 unaligned_store(zbuf, (mask & dest) | (~mask & src)); 340 } 341 outmask = mask; 342 return true; 343 } 344 345 static ALWAYS_INLINE I32 packDepth() { 346 return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE); 347 } 348 349 static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) { 350 if (ctx->depthmask) { 351 I32 dest = unaligned_load<I32>(zbuf); 352 mask |= fragment_shader->swgl_IsPixelDiscarded; 353 unaligned_store(zbuf, (mask & dest) | (~mask & src)); 354 } 355 } 356 357 static ALWAYS_INLINE void mask_output(uint32_t* buf, ZMask zmask, 358 int span = 4) { 359 WideRGBA8 r = pack_pixels_RGBA8(); 360 PackedRGBA8 dst = load_span<PackedRGBA8>(buf, span); 361 if (blend_key) r = blend_pixels(buf, dst, r, span); 362 PackedRGBA8 mask = bit_cast<PackedRGBA8>(zmask); 363 store_span(buf, (mask & dst) | (~mask & pack(r)), span); 364 } 365 366 template <bool DISCARD> 367 static ALWAYS_INLINE void discard_output(uint32_t* buf, int span = 4) { 368 mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span); 369 } 370 371 template <> 372 ALWAYS_INLINE void discard_output<false>(uint32_t* buf, int span) { 373 WideRGBA8 r = pack_pixels_RGBA8(); 374 if (blend_key) 375 r = blend_pixels(buf, load_span<PackedRGBA8>(buf, span), r, span); 376 store_span(buf, pack(r), span); 377 } 378 379 static ALWAYS_INLINE void mask_output(uint8_t* buf, ZMask zmask, int span = 4) { 380 WideR8 r = pack_pixels_R8(); 381 WideR8 dst = unpack(load_span<PackedR8>(buf, span)); 382 if (blend_key) r = blend_pixels(buf, dst, r, span); 383 WideR8 mask = packR8(zmask); 384 store_span(buf, pack((mask & dst) | (~mask & r)), span); 385 } 386 387 template <bool DISCARD> 388 static ALWAYS_INLINE void discard_output(uint8_t* buf, int span = 4) { 389 mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span); 390 } 391 392 
template <> 393 ALWAYS_INLINE void discard_output<false>(uint8_t* buf, int span) { 394 WideR8 r = pack_pixels_R8(); 395 if (blend_key) 396 r = blend_pixels(buf, unpack(load_span<PackedR8>(buf, span)), r, span); 397 store_span(buf, pack(r), span); 398 } 399 400 struct ClipRect { 401 float x0; 402 float y0; 403 float x1; 404 float y1; 405 406 explicit ClipRect(const IntRect& i) 407 : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {} 408 explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) { 409 // If blending is enabled, set blend_key to reflect the resolved blend 410 // state for the currently drawn primitive. 411 if (ctx->blend) { 412 blend_key = ctx->blend_key; 413 if (swgl_ClipFlags) { 414 // If there is a blend override set, replace the blend key with it. 415 if (swgl_ClipFlags & SWGL_CLIP_FLAG_BLEND_OVERRIDE) { 416 blend_key = swgl_BlendOverride; 417 } 418 // If a clip mask is available, set up blending state to use the clip 419 // mask. 420 if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { 421 assert(swgl_ClipMask->format == TextureFormat::R8); 422 // Constrain the clip mask bounds to always fall within the clip mask. 423 swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width), 424 int(swgl_ClipMask->height)}); 425 // The clip mask offset is relative to the viewport. 426 swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset; 427 // The clip mask bounds are relative to the clip mask offset. 428 swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset); 429 // Finally, constrain the clip rectangle by the clip mask bounds. 430 intersect(swgl_ClipMaskBounds); 431 // Modify the blend key so that it will use the clip mask while 432 // blending. 433 restore_clip_mask(); 434 } 435 if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { 436 // Modify the blend key so that it will use AA while blending. 
437 restore_aa(); 438 } 439 } 440 } else { 441 blend_key = BLEND_KEY_NONE; 442 swgl_ClipFlags = 0; 443 } 444 } 445 446 FloatRange x_range() const { return {x0, x1}; } 447 448 void intersect(const IntRect& c) { 449 x0 = max(x0, float(c.x0)); 450 y0 = max(y0, float(c.y0)); 451 x1 = min(x1, float(c.x1)); 452 y1 = min(y1, float(c.y1)); 453 } 454 455 template <typename P> 456 void set_clip_mask(int x, int y, P* buf) const { 457 if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { 458 swgl_SpanBuf = buf; 459 swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf + 460 (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride + 461 (x - swgl_ClipMaskOffset.x); 462 } 463 } 464 465 template <typename P> 466 bool overlaps(int nump, const P* p) const { 467 // Generate a mask of which side of the clip rect all of a polygon's points 468 // fall inside of. This is a cheap conservative estimate of whether the 469 // bounding box of the polygon might overlap the clip rect, rather than an 470 // exact test that would require multiple slower line intersections. 471 int sides = 0; 472 for (int i = 0; i < nump; i++) { 473 sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2; 474 sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8; 475 } 476 return sides == 0xF; 477 } 478 }; 479 480 // Given a current X position at the center Y position of a row, return the X 481 // position of the left and right intercepts of the row top and bottom. 482 template <typename E> 483 static ALWAYS_INLINE FloatRange x_intercepts(const E& e) { 484 float rad = 0.5f * abs(e.x_slope()); 485 return {e.cur_x() - rad, e.cur_x() + rad}; 486 } 487 488 // Return the AA sub-span corresponding to a given edge. If AA is requested, 489 // then this finds the X intercepts with the row clipped into range of the 490 // edge and finally conservatively rounds them out. If there is no AA, then 491 // it just returns the current rounded X position clipped within bounds. 
492 template <typename E> 493 static ALWAYS_INLINE IntRange aa_edge(const E& e, const FloatRange& bounds) { 494 return e.edgeMask ? bounds.clip(x_intercepts(e)).round_out() 495 : bounds.clip({e.cur_x(), e.cur_x()}).round(); 496 } 497 498 // Calculate the initial AA coverage as an approximation of the distance from 499 // the center of the pixel in the direction of the edge slope. Given an edge 500 // (x,y)..(x+dx,y+dy), then the normalized tangent vector along the edge is 501 // (dx,dy)/sqrt(dx^2+dy^2). We know that for dy=1 then dx=e.x_slope. We rotate 502 // the tangent vector either -90 or +90 degrees to get the edge normal vector, 503 // where 'dx=-dy and 'dy=dx. Once normalized by 1/sqrt(dx^2+dy^2), scale into 504 // the range of 0..256 so that we can cheaply convert to a fixed-point scale 505 // factor. It is assumed that at exactly the pixel center the opacity is half 506 // (128) and linearly decreases along the normal vector at 1:1 scale with the 507 // slope. While not entirely accurate, this gives a reasonably agreeable looking 508 // approximation of AA. For edges on which there is no AA, just force the 509 // opacity to maximum (256) with no slope, relying on the span clipping to trim 510 // pixels outside the span. 511 template <typename E> 512 static ALWAYS_INLINE FloatRange aa_dist(const E& e, float dir) { 513 if (e.edgeMask) { 514 float dx = (dir * 256.0f) * inversesqrt(1.0f + e.x_slope() * e.x_slope()); 515 return {128.0f + dx * (e.cur_x() - 0.5f), -dx}; 516 } else { 517 return {256.0f, 0.0f}; 518 } 519 } 520 521 template <typename P, typename E> 522 static ALWAYS_INLINE IntRange aa_span(P* buf, const E& left, const E& right, 523 const FloatRange& bounds) { 524 // If there is no AA, just return the span from the rounded left edge X 525 // position to the rounded right edge X position. Clip the span to be within 526 // the valid bounds. 
527 if (!(swgl_ClipFlags & SWGL_CLIP_FLAG_AA)) { 528 return bounds.clip({left.cur_x(), right.cur_x()}).round(); 529 } 530 531 // Calculate the left and right AA spans along with the coverage distances 532 // and slopes necessary to do blending. 533 IntRange leftAA = aa_edge(left, bounds); 534 FloatRange leftDist = aa_dist(left, -1.0f); 535 IntRange rightAA = aa_edge(right, bounds); 536 FloatRange rightDist = aa_dist(right, 1.0f); 537 538 // Use the pointer into the destination buffer as a status indicator of the 539 // coverage offset. The pointer is calculated so that subtracting it with 540 // the current destination pointer will yield a negative value if the span 541 // is outside the opaque area and otherwise will yield a positive value 542 // above the opaque size. This pointer is stored as a uint8 pointer so that 543 // there are no hidden multiplication instructions and will just return a 544 // 1:1 linear memory address. Thus the size of the opaque region must also 545 // be scaled by the pixel size in bytes. 546 swgl_OpaqueStart = (const uint8_t*)(buf + leftAA.end); 547 swgl_OpaqueSize = max(rightAA.start - leftAA.end - 3, 0) * sizeof(P); 548 549 // Offset the coverage distances by the end of the left AA span, which 550 // corresponds to the opaque start pointer, so that pixels become opaque 551 // immediately after. The distances are also offset for each lane in the 552 // chunk. 553 Float offset = cast(leftAA.end + (I32){0, 1, 2, 3}); 554 swgl_LeftAADist = leftDist.start + offset * leftDist.end; 555 swgl_RightAADist = rightDist.start + offset * rightDist.end; 556 swgl_AASlope = 557 (Float){leftDist.end, rightDist.end, 0.0f, 0.0f} / float(sizeof(P)); 558 559 // Return the full span width from the start of the left span to the end of 560 // the right span. 561 return {leftAA.start, rightAA.end}; 562 } 563 564 // Calculate the span the user clip distances occupy from the left and right 565 // edges at the current row. 
566 template <typename E> 567 static ALWAYS_INLINE IntRange clip_distance_range(const E& left, 568 const E& right) { 569 Float leftClip = get_clip_distances(left.interp); 570 Float rightClip = get_clip_distances(right.interp); 571 // Get the change in clip dist per X step. 572 Float clipStep = (rightClip - leftClip) / (right.cur_x() - left.cur_x()); 573 // Find the zero intercepts starting from the left edge. 574 Float clipDist = 575 clamp(left.cur_x() - leftClip * recip(clipStep), 0.0f, 1.0e6f); 576 // Find the distance to the start of the span for any clip distances that 577 // are increasing in value. If the clip distance is constant or decreasing 578 // in value, then check if it starts outside the clip volume. 579 Float start = if_then_else(clipStep > 0.0f, clipDist, 580 if_then_else(leftClip < 0.0f, 1.0e6f, 0.0f)); 581 // Find the distance to the end of the span for any clip distances that are 582 // decreasing in value. If the clip distance is constant or increasing in 583 // value, then check if it ends inside the clip volume. 584 Float end = if_then_else(clipStep < 0.0f, clipDist, 585 if_then_else(rightClip >= 0.0f, 1.0e6f, 0.0f)); 586 // Find the furthest start offset. 587 start = max(start, start.zwxy); 588 // Find the closest end offset. 589 end = min(end, end.zwxy); 590 // Finally, round the offsets to an integer span that can be used to bound 591 // the current span. 592 return FloatRange{max(start.x, start.y), min(end.x, end.y)}.round(); 593 } 594 595 // Converts a run array into a flattened array of depth samples. This just 596 // walks through every run and fills the samples with the depth value from 597 // the run. 598 static void flatten_depth_runs(DepthRun* runs, size_t width) { 599 if (runs->is_flat()) { 600 return; 601 } 602 while (width > 0) { 603 size_t n = runs->count; 604 fill_flat_depth(runs, n, runs->depth); 605 runs += n; 606 width -= n; 607 } 608 } 609 610 // Helper function for drawing passed depth runs within the depth buffer. 
611 // Flattened depth (perspective or discard) is not supported. 612 template <typename P> 613 static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf, 614 DepthCursor& cursor) { 615 for (;;) { 616 // Get the span that passes the depth test. Assume on entry that 617 // any failed runs have already been skipped. 618 int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask); 619 // If nothing passed, since we already skipped passed failed runs 620 // previously, we must have hit the end of the row. Bail out. 621 if (span <= 0) { 622 break; 623 } 624 if (span >= 4) { 625 // If we have a draw specialization, try to process as many 4-pixel 626 // chunks as possible using it. 627 if (fragment_shader->has_draw_span(buf)) { 628 int drawn = fragment_shader->draw_span(buf, span & ~3); 629 buf += drawn; 630 span -= drawn; 631 } 632 // Otherwise, just process each chunk individually. 633 while (span >= 4) { 634 fragment_shader->run(); 635 discard_output<false>(buf); 636 buf += 4; 637 span -= 4; 638 } 639 } 640 // If we have a partial chunk left over, we still have to process it as if 641 // it were a full chunk. Mask off only the part of the chunk we want to 642 // use. 643 if (span > 0) { 644 fragment_shader->run(); 645 discard_output<false>(buf, span); 646 buf += span; 647 } 648 // Skip past any runs that fail the depth test. 649 int skip = cursor.skip_failed(z, ctx->depthfunc); 650 // If there aren't any, that means we won't encounter any more passing runs 651 // and so it's safe to bail out. 652 if (skip <= 0) { 653 break; 654 } 655 // Advance interpolants for the fragment shader past the skipped region. 656 // If we processed a partial chunk above, we actually advanced the 657 // interpolants a full chunk in the fragment shader's run function. Thus, 658 // we need to first subtract off that 4-pixel chunk and only partially 659 // advance them to that partial chunk before we can add on the rest of the 660 // skips. 
This is combined with the skip here for efficiency's sake. 661 fragment_shader->skip(skip - (span > 0 ? 4 - span : 0)); 662 buf += skip; 663 } 664 } 665 666 // Draw a simple span in 4-pixel wide chunks, optionally using depth. 667 template <bool DISCARD, bool W, typename P, typename Z> 668 static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) { 669 if (depth) { 670 // Depth testing is enabled. If perspective is used, Z values will vary 671 // across the span, we use packDepth to generate packed Z values suitable 672 // for depth testing based on current values from gl_FragCoord.z. 673 // Otherwise, for the no-perspective case, we just use the provided Z. 674 // Process 4-pixel chunks first. 675 for (; span >= 4; span -= 4, buf += 4, depth += 4) { 676 I32 zsrc = z(); 677 ZMask zmask; 678 if (check_depth<DISCARD>(zsrc, depth, zmask)) { 679 fragment_shader->run<W>(); 680 mask_output(buf, zmask); 681 if (DISCARD) discard_depth(zsrc, depth, zmask); 682 } else { 683 fragment_shader->skip<W>(); 684 } 685 } 686 // If there are any remaining pixels, do a partial chunk. 687 if (span > 0) { 688 I32 zsrc = z(); 689 ZMask zmask; 690 if (check_depth<DISCARD>(zsrc, depth, zmask, span)) { 691 fragment_shader->run<W>(); 692 mask_output(buf, zmask, span); 693 if (DISCARD) discard_depth(zsrc, depth, zmask); 694 } 695 } 696 } else { 697 // Process 4-pixel chunks first. 698 for (; span >= 4; span -= 4, buf += 4) { 699 fragment_shader->run<W>(); 700 discard_output<DISCARD>(buf); 701 } 702 // If there are any remaining pixels, do a partial chunk. 703 if (span > 0) { 704 fragment_shader->run<W>(); 705 discard_output<DISCARD>(buf, span); 706 } 707 } 708 } 709 710 // Called during rasterization to forcefully clear a row on which delayed clear 711 // has been enabled. If we know that we are going to completely overwrite a part 712 // of the row, then we only need to clear the row outside of that part. 
However, 713 // if blending or discard is enabled, the values of that underlying part of the 714 // row may be used regardless to produce the final rasterization result, so we 715 // have to then clear the entire underlying row to prepare it. 716 template <typename P> 717 static inline void prepare_row(Texture& colortex, int y, int startx, int endx, 718 bool use_discard, DepthRun* depth, 719 uint32_t z = 0, DepthCursor* cursor = nullptr) { 720 assert(colortex.delay_clear > 0); 721 // Delayed clear is enabled for the color buffer. Check if needs clear. 722 uint32_t& mask = colortex.cleared_rows[y / 32]; 723 if ((mask & (1 << (y & 31))) == 0) { 724 mask |= 1 << (y & 31); 725 colortex.delay_clear--; 726 if (blend_key || use_discard) { 727 // If depth test, blending, or discard is used, old color values 728 // might be sampled, so we need to clear the entire row to fill it. 729 force_clear_row<P>(colortex, y); 730 } else if (depth) { 731 if (depth->is_flat() || !cursor) { 732 // If flat depth is used, we can't cheaply predict if which samples will 733 // pass. 734 force_clear_row<P>(colortex, y); 735 } else { 736 // Otherwise if depth runs are used, see how many samples initially pass 737 // the depth test and only fill the row outside those. The fragment 738 // shader will fill the row within the passed samples. 739 int passed = 740 DepthCursor(*cursor).check_passed<false>(z, ctx->depthfunc); 741 if (startx > 0 || startx + passed < colortex.width) { 742 force_clear_row<P>(colortex, y, startx, startx + passed); 743 } 744 } 745 } else if (startx > 0 || endx < colortex.width) { 746 // Otherwise, we only need to clear the row outside of the span. 747 // The fragment shader will fill the row within the span itself. 748 force_clear_row<P>(colortex, y, startx, endx); 749 } 750 } 751 } 752 753 // Perpendicular dot-product is the dot-product of a vector with the 754 // perpendicular vector of the other, i.e. 
//   dot(a, {-b.y, b.x})
template <typename T>
static ALWAYS_INLINE auto perpDot(T a, T b) {
  return a.x * b.y - a.y * b.x;
}

// Check if the winding of the initial edges is flipped, requiring us to swap
// the edges to avoid spans having negative lengths. Assume that l0.y == r0.y
// due to the initial edge scan in draw_quad/perspective_spans.
template <typename T>
static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) {
  // If the starting point of the left edge is to the right of the starting
  // point of the right edge, then just assume the edges are flipped. If the
  // left and right starting points are the same, then check the sign of the
  // cross-product of the edges to see if the edges are flipped. Otherwise,
  // if the left starting point is actually just to the left of the right
  // starting point, then assume no edge flip.
  return l0.x > r0.x || (l0.x == r0.x && perpDot(l1 - l0, r1 - r0) > 0.0f);
}

// Draw spans for each row of a given quad (or triangle) with a constant Z
// value. The quad is assumed convex. It is clipped to fall within the given
// clip rect. In short, this function rasterizes a quad by first finding a
// top most starting point and then from there tracing down the left and right
// sides of this quad until it hits the bottom, outputting a span between the
// current left and right positions at each row along the way. Points are
// assumed to be ordered in either CW or CCW to support this, but currently
// both orders (CW and CCW) are supported and equivalent.
template <typename P>
static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z,
                                   Interpolants interp_outs[4],
                                   Texture& colortex, Texture& depthtex,
                                   const ClipRect& clipRect) {
  // Only triangles and convex quads supported.
  assert(nump == 3 || nump == 4);

  Point2D l0, r0, l1, r1;
  int l0i, r0i, l1i, r1i;
  {
    // Find the index of the top-most (smallest Y) point from which
    // rasterization can start.
    int top = nump > 3 && p[3].y < p[2].y
                  ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3)
                                     : (p[1].y < p[3].y ? 1 : 3))
                  : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2)
                                     : (p[1].y < p[2].y ? 1 : 2));
    // Helper to find next index in the points array, walking forward.
#define NEXT_POINT(idx)   \
  ({                      \
    int cur = (idx) + 1;  \
    cur < nump ? cur : 0; \
  })
    // Helper to find the previous index in the points array, walking backward.
#define PREV_POINT(idx)        \
  ({                           \
    int cur = (idx)-1;         \
    cur >= 0 ? cur : nump - 1; \
  })
    // Start looking for "left"-side and "right"-side descending edges starting
    // from the determined top point.
    int next = NEXT_POINT(top);
    int prev = PREV_POINT(top);
    if (p[top].y == p[next].y) {
      // If the next point is on the same row as the top, then advance one more
      // time to the next point and use that as the "left" descending edge.
      l0i = next;
      l1i = NEXT_POINT(next);
      // Assume top and prev form a descending "right" edge, as otherwise this
      // will be a collapsed polygon and harmlessly bail out down below.
      r0i = top;
      r1i = prev;
    } else if (p[top].y == p[prev].y) {
      // If the prev point is on the same row as the top, then advance to the
      // prev again and use that as the "right" descending edge.
      // Assume top and next form a non-empty descending "left" edge.
      l0i = top;
      l1i = next;
      r0i = prev;
      r1i = PREV_POINT(prev);
    } else {
      // Both next and prev are on distinct rows from top, so both "left" and
      // "right" edges are non-empty/descending.
      l0i = r0i = top;
      l1i = next;
      r1i = prev;
    }
    // Load the points from the indices.
    l0 = p[l0i];  // Start of left edge
    r0 = p[r0i];  // Start of right edge
    l1 = p[l1i];  // End of left edge
    r1 = p[r1i];  // End of right edge
    // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1:
    // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i,
    // r1.x, r1.y);
  }

  // Tracks one descending edge of the polygon: the current X intercept and
  // attribute interpolants for the current row, stepped per-row by slope.
  struct Edge {
    float yScale;
    float xSlope;
    float x;
    Interpolants interpSlope;
    Interpolants interp;
    bool edgeMask;

    Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0,
         const Interpolants& i1, int edgeIndex)
        :  // Inverse Y scale for slope calculations. Avoid divide on 0-length
           // edge. Later checks below ensure that Y <= p1.y, or otherwise we
           // don't use this edge. We just need to guard against Y == p1.y ==
           // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes
           // below, except if yScale is Inf for some reason (or worse, NaN),
           // which 1/(p1.y-p0.y) might produce if we don't bound it.
          yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
          // Calculate dX/dY slope
          xSlope((p1.x - p0.x) * yScale),
          // Initialize current X based on Y and slope
          x(p0.x + (y - p0.y) * xSlope),
          // Calculate change in interpolants per change in Y
          interpSlope((i1 - i0) * yScale),
          // Initialize current interpolants based on Y and slope
          interp(i0 + (y - p0.y) * interpSlope),
          // Extract the edge mask status for this edge
          edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {}

    void nextRow() {
      // step current X and interpolants to next row from slope
      x += xSlope;
      interp += interpSlope;
    }

    float cur_x() const { return x; }
    float x_slope() const { return xSlope; }
  };

  // Vertex selection above should result in equal left and right start rows
  assert(l0.y == r0.y);
  // Find the start y, clip to within the clip rect, and round to row center.
  // If AA is enabled, round out conservatively rather than round to nearest.
  float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f;
  float y = floor(max(min(l0.y, clipRect.y1), clipRect.y0) + aaRound) + 0.5f;
  // Initialize left and right edges from end points and start Y
  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
  // WR does not use backface culling, so check if edges are flipped.
  bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1);
  if (flipped) swap(left, right);
  // Get pointer to color buffer and depth buffer at current Y
  P* fbuf = (P*)colortex.sample_ptr(0, int(y));
  DepthRun* fdepth = depthtex.buf != nullptr
                         ? (DepthRun*)depthtex.sample_ptr(0, int(y))
                         : nullptr;
  // Loop along advancing Ys, rasterizing spans at each row
  float checkY = min(min(l1.y, r1.y), clipRect.y1);
  // Ensure we don't rasterize out edge bounds
  FloatRange clipSpan =
      clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
  for (;;) {
    // Check if we maybe passed edge ends or outside clip rect...
    if (y > checkY) {
      // If we're outside the clip rect, we're done.
      if (y > clipRect.y1) break;
        // Helper to find the next non-duplicate vertex that doesn't loop back.
#define STEP_EDGE(y, e0i, e0, e1i, e1, STEP_POINT, end) \
  do {                                                  \
    /* Set new start of edge to be end of old edge */   \
    e0i = e1i;                                          \
    e0 = e1;                                            \
    /* Set new end of edge to next point */             \
    e1i = STEP_POINT(e1i);                              \
    e1 = p[e1i];                                        \
    /* If the edge crossed the end, we're done. */      \
    if (e0i == end) return;                             \
    /* Otherwise, it doesn't advance, so keep searching. */ \
  } while (y > e1.y)
      // Check if Y advanced past the end of the left edge
      if (y > l1.y) {
        // Step to next left edge past Y and reset edge interpolants.
        // Note: left/right were swapped above when flipped, so reassign the
        // swapped slot accordingly.
        STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i);
        (flipped ? right : left) =
            Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
      }
      // Check if Y advanced past the end of the right edge
      if (y > r1.y) {
        // Step to next right edge past Y and reset edge interpolants.
        STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i);
        (flipped ? left : right) =
            Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
      }
      // Reset the clip bounds for the new edges
      clipSpan =
          clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
      // Reset check condition for next time around.
      checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1);
    }

    // Calculate a potentially AA'd span and check if it is non-empty.
    IntRange span = aa_span(fbuf, left, right, clipSpan);
    if (span.len() > 0) {
      // If user clip planes are enabled, use them to bound the current span.
      if (vertex_shader->use_clip_distance()) {
        span = span.intersect(clip_distance_range(left, right));
        if (span.len() <= 0) goto next_span;
      }
      ctx->shaded_rows++;
      ctx->shaded_pixels += span.len();
      // Advance color/depth buffer pointers to the start of the span.
      P* buf = fbuf + span.start;
      // Check if we will need to use depth-buffer or discard on this span.
      DepthRun* depth =
          depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
      DepthCursor cursor;
      bool use_discard = fragment_shader->use_discard();
      if (use_discard) {
        if (depth) {
          // If we're using discard, we may have to unpredictably drop out some
          // samples. Flatten the depth run array here to allow this.
          if (!depth->is_flat()) {
            flatten_depth_runs(depth, depthtex.width);
          }
          // Advance to the depth sample at the start of the span.
          depth += span.start;
        }
      } else if (depth) {
        if (!depth->is_flat()) {
          // We're not using discard and the depth row is still organized into
          // runs. Skip past any runs that would fail the depth test so we
          // don't have to do any extra work to process them with the rest of
          // the span.
          cursor = DepthCursor(depth, depthtex.width, span.start, span.len());
          int skipped = cursor.skip_failed(z, ctx->depthfunc);
          // If we fell off the row, that means we couldn't find any passing
          // runs. We can just skip the entire span.
          if (skipped < 0) {
            goto next_span;
          }
          buf += skipped;
          span.start += skipped;
        } else {
          // The row is already flattened, so just advance to the span start.
          depth += span.start;
        }
      }

      if (colortex.delay_clear) {
        // Delayed clear is enabled for the color buffer. Check if needs clear.
        prepare_row<P>(colortex, int(y), span.start, span.end, use_discard,
                       depth, z, &cursor);
      }

      // Initialize fragment shader interpolants to current span position.
      fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1);
      fragment_shader->gl_FragCoord.y = y;
      {
        // Change in interpolants is difference between current right and left
        // edges per the change in right and left X. If the left and right X
        // positions are extremely close together, then avoid stepping the
        // interpolants.
        float stepScale = 1.0f / (right.x - left.x);
        if (!isfinite(stepScale)) stepScale = 0.0f;
        Interpolants step = (right.interp - left.interp) * stepScale;
        // Advance current interpolants to X at start of span.
        Interpolants o = left.interp + step * (span.start + 0.5f - left.x);
        fragment_shader->init_span(&o, &step);
      }
      clipRect.set_clip_mask(span.start, y, buf);
      if (!use_discard) {
        // Fast paths for the case where fragment discard is not used.
        if (depth) {
          // If depth is used, we want to process entire depth runs if depth is
          // not flattened.
          if (!depth->is_flat()) {
            draw_depth_span(z, buf, cursor);
            goto next_span;
          }
          // Otherwise, flattened depth must fall back to the slightly slower
          // per-chunk depth test path in draw_span below.
        } else {
          // Check if the fragment shader has an optimized draw specialization.
          if (span.len() >= 4 && fragment_shader->has_draw_span(buf)) {
            // Draw specialization expects 4-pixel chunks.
            int drawn = fragment_shader->draw_span(buf, span.len() & ~3);
            buf += drawn;
            span.start += drawn;
          }
        }
        draw_span<false, false>(buf, depth, span.len(), [=] { return z; });
      } else {
        // If discard is used, then use slower fallbacks. This should be rare.
        // Just needs to work, doesn't need to be too fast yet...
        draw_span<true, false>(buf, depth, span.len(), [=] { return z; });
      }
    }
  next_span:
    // Advance Y and edge interpolants to next row.
    y++;
    left.nextRow();
    right.nextRow();
    // Advance buffers to next row.
    fbuf += colortex.stride() / sizeof(P);
    fdepth += depthtex.stride() / sizeof(DepthRun);
  }
}

// Draw perspective-correct spans for a convex quad that has been clipped to
// the near and far Z planes, possibly producing a clipped convex polygon with
// more than 4 sides. This assumes the Z value will vary across the spans and
// requires interpolants to factor in W values. This tends to be slower than
// the simpler 2D draw_quad_spans above, especially since we can't optimize the
// depth test easily when Z values, and should be used only rarely if possible.
template <typename P>
static inline void draw_perspective_spans(int nump, Point3D* p,
                                          Interpolants* interp_outs,
                                          Texture& colortex, Texture& depthtex,
                                          const ClipRect& clipRect) {
  Point3D l0, r0, l1, r1;
  int l0i, r0i, l1i, r1i;
  {
    // Find the index of the top-most point (smallest Y) from which
    // rasterization can start.
    int top = 0;
    for (int i = 1; i < nump; i++) {
      if (p[i].y < p[top].y) {
        top = i;
      }
    }
    // Find left-most top point, the start of the left descending edge.
    // Advance forward in the points array, searching at most nump points
    // in case the polygon is flat.
    l0i = top;
    for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
      l0i = i;
    }
    if (l0i == nump - 1) {
      for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
        l0i = i;
      }
    }
    // Find right-most top point, the start of the right descending edge.
    // Advance backward in the points array, searching at most nump points.
    r0i = top;
    for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
      r0i = i;
    }
    if (r0i == 0) {
      for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
        r0i = i;
      }
    }
    // End of left edge is next point after left edge start.
    l1i = NEXT_POINT(l0i);
    // End of right edge is prev point after right edge start.
    r1i = PREV_POINT(r0i);
    l0 = p[l0i];  // Start of left edge
    r0 = p[r0i];  // Start of right edge
    l1 = p[l1i];  // End of left edge
    r1 = p[r1i];  // End of right edge
  }

  // Tracks one descending edge, as in draw_quad_spans, but carries the full 3D
  // point so that Z and W advance along with X per row.
  struct Edge {
    float yScale;
    // Current coordinates for edge. Where in the 2D case of draw_quad_spans,
    // it is enough to just track the X coordinate as we advance along the rows,
    // for the perspective case we also need to keep track of Z and W. For
    // simplicity, we just use the full 3D point to track all these coordinates.
    Point3D pSlope;
    Point3D p;
    Interpolants interpSlope;
    Interpolants interp;
    bool edgeMask;

    Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0,
         const Interpolants& i1, int edgeIndex)
        :  // Inverse Y scale for slope calculations. Avoid divide on 0-length
           // edge.
          yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
          // Calculate dX/dY slope
          pSlope((p1 - p0) * yScale),
          // Initialize current coords based on Y and slope
          p(p0 + (y - p0.y) * pSlope),
          // Crucially, these interpolants must be scaled by the point's 1/w
          // value, which allows linear interpolation in a perspective-correct
          // manner. This will be canceled out inside the fragment shader later.
          // Calculate change in interpolants per change in Y
          interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
          // Initialize current interpolants based on Y and slope
          interp(i0 * p0.w + (y - p0.y) * interpSlope),
          // Extract the edge mask status for this edge
          edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {}

    float x() const { return p.x; }
    vec2_scalar zw() const { return {p.z, p.w}; }

    void nextRow() {
      // step current coords and interpolants to next row from slope
      p += pSlope;
      interp += interpSlope;
    }

    float cur_x() const { return p.x; }
    float x_slope() const { return pSlope.x; }
  };

  // Vertex selection above should result in equal left and right start rows
  assert(l0.y == r0.y);
  // Find the start y, clip to within the clip rect, and round to row center.
  // If AA is enabled, round out conservatively rather than round to nearest.
  float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f;
  float y = floor(max(min(l0.y, clipRect.y1), clipRect.y0) + aaRound) + 0.5f;
  // Initialize left and right edges from end points and start Y
  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
  // WR does not use backface culling, so check if edges are flipped.
  bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1);
  if (flipped) swap(left, right);
  // Get pointer to color buffer and depth buffer at current Y
  P* fbuf = (P*)colortex.sample_ptr(0, int(y));
  DepthRun* fdepth = depthtex.buf != nullptr
                         ? (DepthRun*)depthtex.sample_ptr(0, int(y))
                         : nullptr;
  // Loop along advancing Ys, rasterizing spans at each row
  float checkY = min(min(l1.y, r1.y), clipRect.y1);
  // Ensure we don't rasterize out edge bounds
  FloatRange clipSpan =
      clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
  for (;;) {
    // Check if we maybe passed edge ends or outside clip rect...
    if (y > checkY) {
      // If we're outside the clip rect, we're done.
      if (y > clipRect.y1) break;
      // Check if Y advanced past the end of the left edge
      if (y > l1.y) {
        // Step to next left edge past Y and reset edge interpolants.
        STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i);
        (flipped ? right : left) =
            Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
      }
      // Check if Y advanced past the end of the right edge
      if (y > r1.y) {
        // Step to next right edge past Y and reset edge interpolants.
        STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i);
        (flipped ? left : right) =
            Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
      }
      // Reset the clip bounds for the new edges
      clipSpan =
          clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
      // Reset check condition for next time around.
      checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1);
    }

    // Calculate a potentially AA'd span and check if it is non-empty.
    IntRange span = aa_span(fbuf, left, right, clipSpan);
    if (span.len() > 0) {
      // If user clip planes are enabled, use them to bound the current span.
      if (vertex_shader->use_clip_distance()) {
        span = span.intersect(clip_distance_range(left, right));
        if (span.len() <= 0) goto next_span;
      }
      ctx->shaded_rows++;
      ctx->shaded_pixels += span.len();
      // Advance color/depth buffer pointers to the start of the span.
      P* buf = fbuf + span.start;
      // Check if we will need to use depth-buffer or discard on this span.
      DepthRun* depth =
          depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
      bool use_discard = fragment_shader->use_discard();
      if (depth) {
        // Perspective may cause the depth value to vary on a per sample basis.
        // Ensure the depth row is flattened to allow testing of individual
        // samples
        if (!depth->is_flat()) {
          flatten_depth_runs(depth, depthtex.width);
        }
        // Advance to the depth sample at the start of the span.
        depth += span.start;
      }
      if (colortex.delay_clear) {
        // Delayed clear is enabled for the color buffer. Check if needs clear.
        prepare_row<P>(colortex, int(y), span.start, span.end, use_discard,
                       depth);
      }
      // Initialize fragment shader interpolants to current span position.
      fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1);
      fragment_shader->gl_FragCoord.y = y;
      {
        // Calculate the fragment Z and W change per change in fragment X step.
        // If the left and right X positions are extremely close together, then
        // avoid stepping.
        float stepScale = 1.0f / (right.x() - left.x());
        if (!isfinite(stepScale)) stepScale = 0.0f;
        vec2_scalar stepZW = (right.zw() - left.zw()) * stepScale;
        // Calculate initial Z and W values for span start.
        vec2_scalar zw = left.zw() + stepZW * (span.start + 0.5f - left.x());
        // Set fragment shader's Z and W values so that it can use them to
        // cancel out the 1/w baked into the interpolants.
        fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
        fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
        fragment_shader->swgl_StepZW = stepZW;
        // Change in interpolants is difference between current right and left
        // edges per the change in right and left X. The left and right
        // interpolant values were previously multiplied by 1/w, so the step and
        // initial span values take this into account.
        Interpolants step = (right.interp - left.interp) * stepScale;
        // Advance current interpolants to X at start of span.
        Interpolants o = left.interp + step * (span.start + 0.5f - left.x());
        fragment_shader->init_span<true>(&o, &step);
      }
      clipRect.set_clip_mask(span.start, y, buf);
      if (!use_discard) {
        // No discard is used. Common case.
        draw_span<false, true>(buf, depth, span.len(), packDepth);
      } else {
        // Discard is used. Rare.
        draw_span<true, true>(buf, depth, span.len(), packDepth);
      }
    }
  next_span:
    // Advance Y and edge interpolants to next row.
    y++;
    left.nextRow();
    right.nextRow();
    // Advance buffers to next row.
    fbuf += colortex.stride() / sizeof(P);
    fdepth += depthtex.stride() / sizeof(DepthRun);
  }
}

// Clip a primitive against both sides of a view-frustum axis, producing
// intermediate vertexes with interpolated attributes that will no longer
// intersect the selected axis planes.
// This assumes the primitive is convex
// and should produce at most N+2 vertexes for each invocation (only in the
// worst case where one point falls outside on each of the opposite sides
// with the rest of the points inside). The supplied AA edge mask will be
// modified such that it corresponds to the clipped polygon edges.
template <XYZW AXIS>
static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
                     Interpolants* outInterp, int& outEdgeMask) {
  // Potential mask bits of which side of a plane a coordinate falls on.
  enum SIDE { POSITIVE = 1, NEGATIVE = 2 };
  int numClip = 0;
  int edgeMask = outEdgeMask;
  Point3D prev = p[nump - 1];
  Interpolants prevInterp = interp[nump - 1];
  float prevCoord = prev.select(AXIS);
  // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
  // if so, remember which side it is outside of. In the special case that W is
  // negative and |C| < |W|, both -W <= C and C <= W will be false, such that
  // we must consider the coordinate as falling outside of both plane sides
  // simultaneously. We test each condition separately and combine them to form
  // a mask of which plane sides we exceeded. If we neglect to consider both
  // sides simultaneously, points can erroneously oscillate from one plane side
  // to the other and exceed the supported maximum number of clip outputs.
  int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) |
                 (prevCoord > prev.w ? POSITIVE : 0);
  // Loop through points, finding edges that cross the planes by evaluating
  // the side at each point.
  outEdgeMask = 0;
  for (int i = 0; i < nump; i++, edgeMask >>= 1) {
    Point3D cur = p[i];
    Interpolants curInterp = interp[i];
    float curCoord = cur.select(AXIS);
    int curMask =
        (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0);
    // Check if the previous and current end points are on different sides. If
    // the masks of sides intersect, then we consider them to be on the same
    // side. So in the case the masks do not intersect, we then consider them
    // to fall on different sides.
    if (!(curMask & prevMask)) {
      // One of the edge's end points is outside the plane with the other
      // inside the plane. Find the offset where it crosses the plane and
      // adjust the point and interpolants to there.
      if (prevMask) {
        // Edge that was previously outside crosses inside.
        // Evaluate plane equation for previous and current end-point
        // based on previous side and calculate relative offset.
        if (numClip >= nump + 2) {
          // If for some reason we produced more vertexes than we support, just
          // bail out.
          assert(false);
          return 0;
        }
        // The positive plane is assigned the sign 1, and the negative plane is
        // assigned -1. If the point falls outside both planes, that means W is
        // negative. To compensate for this, we must interpolate the coordinate
        // till W=0, at which point we can choose a single plane side for the
        // coordinate to fall on since W will no longer be negative. To compute
        // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and
        // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be
        // the side of the plane we need to consider. Substituting K into the
        // comparison C < 0, we can then avoid the division in K with a
        // cross-multiplication.
        float prevSide =
            (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) ||
                                      prevCoord * (cur.w - prev.w) <
                                          prev.w * (curCoord - prevCoord))
                ? -1
                : 1;
        float prevDist = prevCoord - prevSide * prev.w;
        float curDist = curCoord - prevSide * cur.w;
        // It may happen that after we interpolate by the weight k that due to
        // floating point rounding we've underestimated the value necessary to
        // push it over the clipping boundary. Just in case, nudge the mantissa
        // by a single increment so that we essentially round it up and move it
        // further inside the clipping boundary. We use nextafter to do this in
        // a portable fashion.
        float k = prevDist / (prevDist - curDist);
        Point3D clipped = prev + (cur - prev) * k;
        if (prevSide * clipped.select(AXIS) > clipped.w) {
          k = nextafterf(k, 1.0f);
          clipped = prev + (cur - prev) * k;
        }
        outP[numClip] = clipped;
        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
        // Don't output the current edge mask since start point was outside.
        numClip++;
      }
      if (curMask) {
        // Edge that was previously inside crosses outside.
        // Evaluate plane equation for previous and current end-point
        // based on current side and calculate relative offset.
        if (numClip >= nump + 2) {
          assert(false);
          return 0;
        }
        // In the case the coordinate falls on both plane sides, the computation
        // here is much the same as for prevSide, but since we are going from a
        // previous W that is positive to current W that is negative, then the
        // sign of cur.w - prev.w will flip in the equation. The resulting sign
        // is negated to compensate for this.
        float curSide =
            (curMask & POSITIVE) && (!(curMask & NEGATIVE) ||
                                     prevCoord * (cur.w - prev.w) <
                                         prev.w * (curCoord - prevCoord))
                ? 1
                : -1;
        float prevDist = prevCoord - curSide * prev.w;
        float curDist = curCoord - curSide * cur.w;
        // Calculate interpolation weight k and then nudge it inside clipping
        // boundary with nextafter. Note that since we were previously inside
        // and now crossing outside, we have to flip the nudge direction for
        // the weight towards 0 instead of 1.
        float k = prevDist / (prevDist - curDist);
        Point3D clipped = prev + (cur - prev) * k;
        if (curSide * clipped.select(AXIS) > clipped.w) {
          k = nextafterf(k, 0.0f);
          clipped = prev + (cur - prev) * k;
        }
        outP[numClip] = clipped;
        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
        // Output the current edge mask since the end point is inside.
        outEdgeMask |= (edgeMask & 1) << numClip;
        numClip++;
      }
    }
    if (!curMask) {
      // The current end point is inside the plane, so output point unmodified.
      if (numClip >= nump + 2) {
        assert(false);
        return 0;
      }
      outP[numClip] = cur;
      outInterp[numClip] = curInterp;
      // Output the current edge mask since the end point is inside.
      outEdgeMask |= (edgeMask & 1) << numClip;
      numClip++;
    }
    prev = cur;
    prevInterp = curInterp;
    prevCoord = curCoord;
    prevMask = curMask;
  }
  return numClip;
}

// Helper function to dispatch to perspective span drawing with points that
// have already been transformed and clipped.
static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
                                            Interpolants* interp_clip,
                                            Texture& colortex,
                                            Texture& depthtex) {
  // If polygon is outside clip rect, nothing to draw.
  ClipRect clipRect(colortex);
  if (!clipRect.overlaps(nump, p_clip)) {
    return;
  }

  // Finally draw perspective-correct spans for the polygon.
1441 if (colortex.internal_format == GL_RGBA8) { 1442 draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex, 1443 depthtex, clipRect); 1444 } else if (colortex.internal_format == GL_R8) { 1445 draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex, 1446 depthtex, clipRect); 1447 } else { 1448 assert(false); 1449 } 1450 } 1451 1452 // Draws a perspective-correct 3D primitive with varying Z value, as opposed 1453 // to a simple 2D planar primitive with a constant Z value that could be 1454 // trivially Z rejected. This requires clipping the primitive against the near 1455 // and far planes to ensure it stays within the valid Z-buffer range. The Z 1456 // and W of each fragment of the primitives are interpolated across the 1457 // generated spans and then depth-tested as appropriate. 1458 // Additionally, vertex attributes must be interpolated with perspective- 1459 // correction by dividing by W before interpolation, and then later multiplied 1460 // by W again to produce the final correct attribute value for each fragment. 1461 // This process is expensive and should be avoided if possible for primitive 1462 // batches that are known ahead of time to not need perspective-correction. 1463 static void draw_perspective(int nump, Interpolants interp_outs[4], 1464 Texture& colortex, Texture& depthtex) { 1465 // Lines are not supported with perspective. 1466 assert(nump >= 3); 1467 // Convert output of vertex shader to screen space. 1468 vec4 pos = vertex_shader->gl_Position; 1469 vec3_scalar scale = 1470 vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; 1471 vec3_scalar offset = 1472 make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) + 1473 scale; 1474 // Verify if point is between near and far planes, rejecting NaN. 1475 if (test_all(pos.z > -pos.w && pos.z < pos.w)) { 1476 // No points cross the near or far planes, so no clipping required. 1477 // Just divide coords by W and convert to viewport. 
We assume the W 1478 // coordinate is non-zero and the reciprocal is finite since it would 1479 // otherwise fail the test_none condition. 1480 Float w = 1.0f / pos.w; 1481 vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; 1482 Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x}, 1483 {screen.x.y, screen.y.y, screen.z.y, w.y}, 1484 {screen.x.z, screen.y.z, screen.z.z, w.z}, 1485 {screen.x.w, screen.y.w, screen.z.w, w.w}}; 1486 draw_perspective_clipped(nump, p, interp_outs, colortex, depthtex); 1487 } else { 1488 // Points cross the near or far planes, so we need to clip. 1489 // Start with the original 3 or 4 points... 1490 Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x}, 1491 {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, 1492 {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, 1493 {pos.x.w, pos.y.w, pos.z.w, pos.w.w}}; 1494 // Clipping can expand the points by 1 for each of 6 view frustum planes. 1495 Point3D p_clip[4 + 6]; 1496 Interpolants interp_clip[4 + 6]; 1497 // Clip against near and far Z planes. 1498 nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip, 1499 swgl_AAEdgeMask); 1500 // If no points are left inside the view frustum, there's nothing to draw. 1501 if (nump < 3) { 1502 return; 1503 } 1504 // After clipping against only the near and far planes, we might still 1505 // produce points where W = 0, exactly at the camera plane. OpenGL specifies 1506 // that for clip coordinates, points must satisfy: 1507 // -W <= X <= W 1508 // -W <= Y <= W 1509 // -W <= Z <= W 1510 // When Z = W = 0, this is trivially satisfied, but when we transform and 1511 // divide by W below it will produce a divide by 0. Usually we want to only 1512 // clip Z to avoid the extra work of clipping X and Y. We can still project 1513 // points that fall outside the view frustum X and Y so long as Z is valid. 1514 // The span drawing code will then ensure X and Y are clamped to viewport 1515 // boundaries. 
However, in the Z = W = 0 case, sometimes clipping X and Y, 1516 // will push W further inside the view frustum so that it is no longer 0, 1517 // allowing us to finally proceed to projecting the points to the screen. 1518 for (int i = 0; i < nump; i++) { 1519 // Found an invalid W, so need to clip against X and Y... 1520 if (p_clip[i].w <= 0.0f) { 1521 // Ping-pong p_clip -> p_tmp -> p_clip. 1522 Point3D p_tmp[4 + 6]; 1523 Interpolants interp_tmp[4 + 6]; 1524 nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp, 1525 swgl_AAEdgeMask); 1526 if (nump < 3) return; 1527 nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip, 1528 swgl_AAEdgeMask); 1529 if (nump < 3) return; 1530 // After clipping against X and Y planes, there's still points left 1531 // to draw, so proceed to trying projection now... 1532 break; 1533 } 1534 } 1535 // Divide coords by W and convert to viewport. 1536 for (int i = 0; i < nump; i++) { 1537 float w = 1.0f / p_clip[i].w; 1538 // If the W coord is essentially zero, small enough that division would 1539 // result in Inf/NaN, then just set the point to all zeroes, as the only 1540 // point that satisfies -W <= X/Y/Z <= W is all zeroes. 1541 p_clip[i] = isfinite(w) 1542 ? Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w) 1543 : Point3D(0.0f); 1544 } 1545 draw_perspective_clipped(nump, p_clip, interp_clip, colortex, depthtex); 1546 } 1547 } 1548 1549 static void draw_quad(int nump, Texture& colortex, Texture& depthtex) { 1550 // Run vertex shader once for the primitive's vertices. 1551 // Reserve space for 6 sets of interpolants, in case we need to clip against 1552 // near and far planes in the perspective case. 1553 Interpolants interp_outs[4]; 1554 swgl_ClipFlags = 0; 1555 vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants)); 1556 vec4 pos = vertex_shader->gl_Position; 1557 // Check if any vertex W is different from another. If so, use perspective. 
if (test_any(pos.w != pos.w.x)) {
    draw_perspective(nump, interp_outs, colortex, depthtex);
    return;
  }

  // All lanes share one W, so the primitive is flat (2D): convert the output
  // of the vertex shader to screen space.
  // Divide coords by W and convert to viewport.
  float w = 1.0f / pos.w.x;
  // If the W coord is essentially zero, small enough that division would
  // result in Inf/NaN, then just set the reciprocal itself to zero so that
  // the coordinates becomes zeroed out, as the only valid point that
  // satisfies -W <= X/Y/Z <= W is all zeroes.
  if (!isfinite(w)) w = 0.0f;
  // Map NDC [-1,1] to viewport pixels, then shift into the destination
  // texture's coordinate space via its offset.
  vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f *
                    vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
                make_vec2(ctx->viewport.origin() - colortex.offset);
  Point2D p[4] = {{screen.x.x, screen.y.x},
                  {screen.x.y, screen.y.y},
                  {screen.x.z, screen.y.z},
                  {screen.x.w, screen.y.w}};

  // If quad is outside clip rect, nothing to draw.
  ClipRect clipRect(colortex);
  if (!clipRect.overlaps(nump, p)) {
    return;
  }

  // Since the quad is assumed 2D, Z is constant across the quad.
  float screenZ = (pos.z.x * w + 1) * 0.5f;
  if (screenZ < 0 || screenZ > 1) {
    // Z values would cross the near or far plane, so just bail.
    return;
  }
  // Since Z doesn't need to be interpolated, just set the fragment shader's
  // Z and W values here, once and for all fragment shader invocations.
  uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ);
  fragment_shader->gl_FragCoord.z = screenZ;
  fragment_shader->gl_FragCoord.w = w;

  // If supplied a line, adjust it so that it is a quad at least 1 pixel thick.
  // Assume that for a line that all 4 SIMD lanes were actually filled with
  // vertexes 0, 1, 1, 0.
  if (nump == 2) {
    // Nudge Y height to span at least 1 pixel by advancing to next pixel
    // boundary so that we step at least 1 row when drawing spans.
if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) {
      p[2].y = 1 + int(p[1].y + 0.5f);
      p[3].y = p[2].y;
      // Nudge X width to span at least 1 pixel so that rounded coords fall on
      // separate pixels.
      if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) {
        p[1].x += 1.0f;
        p[2].x += 1.0f;
      }
    } else {
      // If the line already spans at least 1 row, then assume line is vertical
      // or diagonal and just needs to be dilated horizontally.
      p[2].x += 1.0f;
      p[3].x += 1.0f;
    }
    // Pretend that it's a quad now...
    nump = 4;
  }

  // Finally draw 2D spans for the quad. Currently only supports drawing to
  // RGBA8 and R8 color buffers.
  if (colortex.internal_format == GL_RGBA8) {
    draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, depthtex,
                              clipRect);
  } else if (colortex.internal_format == GL_R8) {
    draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, depthtex,
                             clipRect);
  } else {
    // Unsupported color buffer format; should have been rejected earlier.
    assert(false);
  }
}

// Draw count indexed vertices (instancecount instances) from the bound
// element array buffer starting at byte offset into v's index data. INDEX is
// the element type (e.g. uint16_t or uint32_t). Only consecutive-index
// triangles and quads are drawn; other index patterns are skipped.
template <typename INDEX>
static inline void draw_elements(GLsizei count, GLsizei instancecount,
                                 size_t offset, VertexArray& v,
                                 Texture& colortex, Texture& depthtex) {
  Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding];
  // Bail if there is no index buffer or the offset is past its end.
  if (!indices_buf.buf || offset >= indices_buf.size) {
    return;
  }
  // Indices must be aligned to the element size.
  assert((offset & (sizeof(INDEX) - 1)) == 0);
  INDEX* indices = (INDEX*)(indices_buf.buf + offset);
  // Clamp count so we never read past the end of the index buffer.
  count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
  // Triangles must be indexed at offsets 0, 1, 2.
  // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
if (count == 6 && indices[1] == indices[0] + 1 &&
      indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
    assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
    // Fast path - since there is only a single quad, we only load per-vertex
    // attribs once for all instances, as they won't change across instances
    // or within an instance.
    vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
    draw_quad(4, colortex, depthtex);
    for (GLsizei instance = 1; instance < instancecount; instance++) {
      // NOTE(review): the 0 count here presumably reloads only per-instance
      // attribs, skipping the per-vertex ones loaded above — confirm against
      // load_attribs.
      vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
      draw_quad(4, colortex, depthtex);
    }
  } else {
    // General path: scan the index list per instance, drawing each
    // consecutive-index triangle or quad and skipping anything else.
    for (GLsizei instance = 0; instance < instancecount; instance++) {
      for (GLsizei i = 0; i + 3 <= count; i += 3) {
        // Only consecutive-index triangles are supported; skip otherwise.
        if (indices[i + 1] != indices[i] + 1 ||
            indices[i + 2] != indices[i] + 2) {
          continue;
        }
        if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
          // Two successive triangles forming a quad (0,1,2 then 2,1,3):
          // draw them as one quad and consume both.
          assert(indices[i + 3] == indices[i] + 2 &&
                 indices[i + 4] == indices[i] + 1);
          vertex_shader->load_attribs(v.attribs, indices[i], instance, 4);
          draw_quad(4, colortex, depthtex);
          // Skip the second triangle of the quad (loop's i += 3 covers the
          // first).
          i += 3;
        } else {
          vertex_shader->load_attribs(v.attribs, indices[i], instance, 3);
          draw_quad(3, colortex, depthtex);
        }
      }
    }
  }
}