tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rasterize.h (73093B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 // The SWGL depth buffer is roughly organized as a span buffer where each row
      6 // of the depth buffer is a list of spans, and each span has a constant depth
      7 // and a run length (represented by DepthRun). The span from start..start+count
      8 // is placed directly at that start index in the row's array of runs, so that
      9 // there is no need to explicitly record the start index at all. This also
     10 // avoids the need to move items around in the run array to manage insertions
     11 // since space is implicitly always available for a run between any two
     12 // pre-existing runs. Linkage from one run to the next is implicitly defined by
     13 // the count, so if a run exists from start..start+count, the next run will
     14 // implicitly pick up right at index start+count where that preceding run left
     15 // off. All of the DepthRun items that are after the head of the run can remain
     16 // uninitialized until the run needs to be split and a new run needs to start
     17 // somewhere in between.
     18 // For uses like perspective-correct rasterization or with a discard mask, a
     19 // run is not an efficient representation, and it is more beneficial to have
     20 // a flattened array of individual depth samples that can be masked off easily.
     21 // To support this case, the first run in a given row's run array may have a
     22 // zero count, signaling that this entire row is flattened. Critically, the
     23 // depth and count fields in DepthRun are ordered (endian-dependently) so that
     24 // the DepthRun struct can be interpreted as a sign-extended int32_t depth. It
     25 // is then possible to just treat the entire row as an array of int32_t depth
     26 // samples that can be processed with SIMD comparisons, since the count field
     27 // behaves as just the sign-extension of the depth field. The count field is
     28 // limited to 8 bits so that we can support depth values up to 24 bits.
     29 // When a depth buffer is cleared, each row is initialized to a maximal runs
     30 // spanning the entire row. In the normal case, the depth buffer will continue
     31 // to manage itself as a list of runs. If perspective or discard is used for
     32 // a given row, the row will be converted to the flattened representation to
     33 // support it, after which it will only ever revert back to runs if the depth
     34 // buffer is cleared.
     35 
// The largest 24-bit depth value supported.
constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF;
// The longest 8-bit depth run that is supported, aligned to SIMD chunk size.
// (255 rounded down to a multiple of 4, so runs split on chunk boundaries.)
constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3;
     40 
struct DepthRun {
  // Ensure that depth always occupies the LSB and count the MSB so that we
  // can sign-extend depth just by setting count to zero, marking it flat.
  // When count is non-zero, then this is interpreted as an actual run and
  // depth is read in isolation.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  uint32_t depth : 24;
  uint32_t count : 8;
#else
  uint32_t count : 8;
  uint32_t depth : 24;
#endif

  DepthRun() = default;
  DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {}

  // If count is zero, this is actually a flat depth sample rather than a run.
  bool is_flat() const { return !count; }

  // Compare a source depth from rasterization with a stored depth value.
  // FUNC is a compile-time depth function (GL_LEQUAL, GL_LESS, or GL_ALWAYS);
  // returns true when the incoming sample passes the test against this run.
  template <int FUNC>
  ALWAYS_INLINE bool compare(uint32_t src) const {
    switch (FUNC) {
      case GL_LEQUAL:
        return src <= depth;
      case GL_LESS:
        return src < depth;
      case GL_ALWAYS:
        return true;
      default:
        assert(false);
        return false;
    }
  }
};
     76 
     77 // Fills runs at the given position with the given depth up to the span width.
     78 static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth,
     79                                         uint32_t width) {
     80  // If the width exceeds the maximum run size, then we need to output clamped
     81  // runs first.
     82  for (; width >= MAX_DEPTH_RUN;
     83       runs += MAX_DEPTH_RUN, width -= MAX_DEPTH_RUN) {
     84    *runs = DepthRun(depth, MAX_DEPTH_RUN);
     85  }
     86  // If there are still any left over samples to fill under the maximum run
     87  // size, then output one last run for them.
     88  if (width > 0) {
     89    *runs = DepthRun(depth, width);
     90  }
     91 }
     92 
     93 // A cursor for reading and modifying a row's depth run array. It locates
     94 // and iterates through a desired span within all the runs, testing if
     95 // the depth of this span passes or fails the depth test against existing
     96 // runs. If desired, new runs may be inserted to represent depth occlusion
     97 // from this span in the run array.
// A cursor for reading and modifying a row's depth run array. It locates
// and iterates through a desired span within all the runs, testing if
// the depth of this span passes or fails the depth test against existing
// runs. If desired, new runs may be inserted to represent depth occlusion
// from this span in the run array.
struct DepthCursor {
  // Current position of run the cursor has advanced to.
  DepthRun* cur = nullptr;
  // The start of the remaining potential samples in the desired span.
  DepthRun* start = nullptr;
  // The end of the potential samples in the desired span.
  DepthRun* end = nullptr;

  DepthCursor() = default;

  // Construct a cursor with runs for a given row's run array and the bounds
  // of the span we wish to iterate within it.
  DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count)
      : cur(runs), start(&runs[span_offset]), end(start + span_count) {
    // This cursor should never iterate over flat runs
    assert(!runs->is_flat());
    DepthRun* end_runs = &runs[num_runs];
    // Clamp end of span to end of row
    if (end > end_runs) {
      end = end_runs;
    }
    // If the span starts past the end of the row, just advance immediately
    // to it to signal that we're done.
    if (start >= end_runs) {
      cur = end_runs;
      start = end_runs;
      return;
    }
    // Otherwise, find the first depth run that contains the start of the span.
    // If the span starts after the given run, then we need to keep searching
    // through the row to find an appropriate run. The check above already
    // guaranteed that the span starts within the row's runs, and the search
    // won't fall off the end.
    for (;;) {
      assert(cur < end);
      DepthRun* next = cur + cur->count;
      if (start < next) {
        break;
      }
      cur = next;
    }
  }

  // The cursor is valid if the current position is at the end or if the run
  // contains the start position.
  bool valid() const {
    return cur >= end || (cur <= start && start < cur + cur->count);
  }

  // Skip past any initial runs that fail the depth test. If we find a run that
  // would pass, then return the accumulated length between where we started
  // and that position. Otherwise, if we fall off the end, return -1 to signal
  // that there are no more passed runs at the end of this failed region and
  // so it is safe for the caller to stop processing any more regions in this
  // row.
  template <int FUNC>
  int skip_failed(uint32_t val) {
    assert(valid());
    DepthRun* prev = start;
    while (cur < end) {
      // As soon as a run passes, report how far we skipped to reach it.
      if (cur->compare<FUNC>(val)) {
        return start - prev;
      }
      cur += cur->count;
      start = cur;
    }
    return -1;
  }

  // Helper to convert function parameters into template parameters to hoist
  // some checks out of inner loops.
  ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) {
    switch (func) {
      case GL_LEQUAL:
        return skip_failed<GL_LEQUAL>(val);
      case GL_LESS:
        return skip_failed<GL_LESS>(val);
      default:
        assert(false);
        return -1;
    }
  }

  // Find a region of runs that passes the depth test. It is assumed the caller
  // has called skip_failed first to skip past any runs that failed the depth
  // test. This stops when it finds a run that fails the depth test or we fall
  // off the end of the row. If the write mask is enabled, this will insert runs
  // to represent this new region that passed the depth test. The length of the
  // region is returned.
  template <int FUNC, bool MASK>
  int check_passed(uint32_t val) {
    assert(valid());
    DepthRun* prev = cur;
    while (cur < end) {
      if (!cur->compare<FUNC>(val)) {
        break;
      }
      DepthRun* next = cur + cur->count;
      if (next > end) {
        if (MASK) {
          // Chop the current run where the end of the span falls, making a new
          // run from the end of the span till the next run. The beginning of
          // the current run will be folded into the run from the start of the
          // passed region before returning below.
          *end = DepthRun(cur->depth, next - end);
        }
        // If the next run starts past the end, then just advance the current
        // run to the end to signal that we're now at the end of the row.
        next = end;
      }
      cur = next;
    }
    // If we haven't advanced past the start of the span region, then we found
    // nothing that passed.
    if (cur <= start) {
      return 0;
    }
    // If 'end' fell within the middle of a passing run, then 'cur' will end up
    // pointing at the new partial run created at 'end' where the passing run
    // was split to accommodate starting in the middle. The preceding runs will
    // be fixed below to properly join with this new split.
    int passed = cur - start;
    if (MASK) {
      // If the search started from a run before the start of the span, then
      // edit that run to meet up with the start.
      if (prev < start) {
        prev->count = start - prev;
      }
      // Create a new run for the entirety of the passed samples.
      set_depth_runs(start, val, passed);
    }
    start = cur;
    return passed;
  }

  // Helper to convert function parameters into template parameters to hoist
  // some checks out of inner loops.
  template <bool MASK>
  ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) {
    switch (func) {
      case GL_LEQUAL:
        return check_passed<GL_LEQUAL, MASK>(val);
      case GL_LESS:
        return check_passed<GL_LESS, MASK>(val);
      default:
        assert(false);
        return 0;
    }
  }

  ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) {
    return mask ? check_passed<true>(val, func)
                : check_passed<false>(val, func);
  }

  // Fill a region of runs with a given depth value, bypassing any depth test.
  // GL_ALWAYS makes every run pass while MASK=true forces the write.
  ALWAYS_INLINE void fill(uint32_t depth) {
    check_passed<GL_ALWAYS, true>(depth);
  }
};
    258 
    259 // Initialize a depth texture by setting the first run in each row to encompass
    260 // the entire row.
    261 void Texture::init_depth_runs(uint32_t depth) {
    262  if (!buf) return;
    263  DepthRun* runs = (DepthRun*)buf;
    264  for (int y = 0; y < height; y++) {
    265    set_depth_runs(runs, depth, width);
    266    runs += stride() / sizeof(DepthRun);
    267  }
    268  set_cleared(true);
    269 }
    270 
    271 // Fill a portion of the run array with flattened depth samples.
    272 static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n,
    273                                          uint32_t depth) {
    274  fill_n((uint32_t*)dst, n, depth);
    275 }
    276 
    277 // Fills a scissored region of a depth texture with a given depth.
// Fills a scissored region of a depth texture with a given depth.
void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) {
  if (!buf) return;
  assert(cleared());
  // Clip the scissor (translated into this texture's space) to the texture
  // bounds.
  IntRect bb = bounds().intersection(scissor - offset);
  DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0);
  for (int rows = bb.height(); rows > 0; rows--) {
    if (bb.width() >= width) {
      // If the scissor region encompasses the entire row, reset the row to a
      // single run encompassing the entire row.
      set_depth_runs(runs, depth, width);
    } else if (runs->is_flat()) {
      // If the row is flattened, just directly fill the portion of the row.
      fill_flat_depth(&runs[bb.x0], bb.width(), depth);
    } else {
      // Otherwise, if we are still using runs, then set up a cursor to fill
      // it with depth runs.
      DepthCursor(runs, width, bb.x0, bb.width()).fill(depth);
    }
    runs += stride() / sizeof(DepthRun);
  }
}
    299 
// A SIMD mask of which depth samples in a chunk failed (all-ones lanes) or
// passed (zero lanes) the inverted depth test.
using ZMask = I32;

#if USE_SSE2
#  define ZMASK_NONE_PASSED 0xFFFF
#  define ZMASK_ALL_PASSED 0
// Collapse a lane mask into a scalar code comparable against the
// ZMASK_*_PASSED sentinels above.
static inline uint32_t zmask_code(ZMask mask) {
  return _mm_movemask_epi8(mask);
}
#else
#  define ZMASK_NONE_PASSED 0xFFFFFFFFU
#  define ZMASK_ALL_PASSED 0
// Generic fallback: narrow each lane to a byte and pack into a 32-bit code.
static inline uint32_t zmask_code(ZMask mask) {
  return bit_cast<uint32_t>(CONVERT(mask, U8));
}
#endif
    315 
// Interprets items in the depth buffer as sign-extended 32-bit depth values
// instead of as runs. Returns a mask that signals which samples in the given
// chunk passed or failed the depth test with given Z value. Returns false if
// no samples passed (and outmask is left untouched).
template <bool DISCARD>
static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask,
                                      int span = 4) {
  // SSE2 does not support unsigned comparison. So ensure Z value is
  // sign-extended to int32_t.
  I32 dest = unaligned_load<I32>(zbuf);
  // Invert the depth test to check which pixels failed and should be discarded.
  ZMask mask = ctx->depthfunc == GL_LEQUAL
                   ?
                   // GL_LEQUAL: Not(LessEqual) = Greater
                   ZMask(src > dest)
                   :
                   // GL_LESS: Not(Less) = GreaterEqual
                   ZMask(src >= dest);
  // Mask off any unused lanes in the span.
  mask |= ZMask(span) < ZMask{1, 2, 3, 4};
  if (zmask_code(mask) == ZMASK_NONE_PASSED) {
    return false;
  }
  // With DISCARD the depth write is deferred to discard_depth, since the
  // shader may still discard fragments after this test.
  if (!DISCARD && ctx->depthmask) {
    unaligned_store(zbuf, (mask & dest) | (~mask & src));
  }
  outmask = mask;
  return true;
}
    344 
// Convert the shader's per-lane fragment Z (0..1) into fixed-point 24-bit
// depth samples suitable for the depth buffer.
static ALWAYS_INLINE I32 packDepth() {
  return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE);
}

// Deferred depth write for the DISCARD path: store src depth only into lanes
// that both passed the depth test and were not discarded by the shader.
static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) {
  if (ctx->depthmask) {
    I32 dest = unaligned_load<I32>(zbuf);
    mask |= fragment_shader->swgl_IsPixelDiscarded;
    unaligned_store(zbuf, (mask & dest) | (~mask & src));
  }
}
    356 
// Write a chunk of RGBA8 pixels, keeping destination pixels in lanes where
// zmask is set and writing (possibly blended) shader output elsewhere.
static ALWAYS_INLINE void mask_output(uint32_t* buf, ZMask zmask,
                                      int span = 4) {
  WideRGBA8 r = pack_pixels_RGBA8();
  PackedRGBA8 dst = load_span<PackedRGBA8>(buf, span);
  if (blend_key) r = blend_pixels(buf, dst, r, span);
  PackedRGBA8 mask = bit_cast<PackedRGBA8>(zmask);
  store_span(buf, (mask & dst) | (~mask & pack(r)), span);
}

// RGBA8 output path honoring the shader's discard mask.
template <bool DISCARD>
static ALWAYS_INLINE void discard_output(uint32_t* buf, int span = 4) {
  mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span);
}

// Fast path when DISCARD is disabled: no masking, just blend (if enabled) and
// store the shader output.
template <>
ALWAYS_INLINE void discard_output<false>(uint32_t* buf, int span) {
  WideRGBA8 r = pack_pixels_RGBA8();
  if (blend_key)
    r = blend_pixels(buf, load_span<PackedRGBA8>(buf, span), r, span);
  store_span(buf, pack(r), span);
}
    378 
// Write a chunk of R8 pixels, keeping destination pixels in lanes where zmask
// is set and writing (possibly blended) shader output elsewhere.
static ALWAYS_INLINE void mask_output(uint8_t* buf, ZMask zmask, int span = 4) {
  WideR8 r = pack_pixels_R8();
  WideR8 dst = unpack(load_span<PackedR8>(buf, span));
  if (blend_key) r = blend_pixels(buf, dst, r, span);
  WideR8 mask = packR8(zmask);
  store_span(buf, pack((mask & dst) | (~mask & r)), span);
}

// R8 output path honoring the shader's discard mask.
template <bool DISCARD>
static ALWAYS_INLINE void discard_output(uint8_t* buf, int span = 4) {
  mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span);
}

// Fast path when DISCARD is disabled: no masking, just blend (if enabled) and
// store the shader output.
template <>
ALWAYS_INLINE void discard_output<false>(uint8_t* buf, int span) {
  WideR8 r = pack_pixels_R8();
  if (blend_key)
    r = blend_pixels(buf, unpack(load_span<PackedR8>(buf, span)), r, span);
  store_span(buf, pack(r), span);
}
    399 
// Floating-point clip rectangle for the current draw, derived from the
// scissor. Constructing one from a Texture also resolves the blend key and
// clip-mask state for the primitive being drawn.
struct ClipRect {
  float x0;
  float y0;
  float x1;
  float y1;

  explicit ClipRect(const IntRect& i)
      : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
  explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) {
    // If blending is enabled, set blend_key to reflect the resolved blend
    // state for the currently drawn primitive.
    if (ctx->blend) {
      blend_key = ctx->blend_key;
      if (swgl_ClipFlags) {
        // If there is a blend override set, replace the blend key with it.
        if (swgl_ClipFlags & SWGL_CLIP_FLAG_BLEND_OVERRIDE) {
          blend_key = swgl_BlendOverride;
        }
        // If a clip mask is available, set up blending state to use the clip
        // mask.
        if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {
          assert(swgl_ClipMask->format == TextureFormat::R8);
          // Constrain the clip mask bounds to always fall within the clip mask.
          swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width),
                                                int(swgl_ClipMask->height)});
          // The clip mask offset is relative to the viewport.
          swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset;
          // The clip mask bounds are relative to the clip mask offset.
          swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset);
          // Finally, constrain the clip rectangle by the clip mask bounds.
          intersect(swgl_ClipMaskBounds);
          // Modify the blend key so that it will use the clip mask while
          // blending.
          restore_clip_mask();
        }
        if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) {
          // Modify the blend key so that it will use AA while blending.
          restore_aa();
        }
      }
    } else {
      blend_key = BLEND_KEY_NONE;
      swgl_ClipFlags = 0;
    }
  }

  FloatRange x_range() const { return {x0, x1}; }

  // Shrink this rectangle to its intersection with the given integer rect.
  void intersect(const IntRect& c) {
    x0 = max(x0, float(c.x0));
    y0 = max(y0, float(c.y0));
    x1 = min(x1, float(c.x1));
    y1 = min(y1, float(c.y1));
  }

  // Point the global clip-mask sample pointer at the mask texel corresponding
  // to destination pixel (x, y) in buf, if a clip mask is in use.
  template <typename P>
  void set_clip_mask(int x, int y, P* buf) const {
    if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {
      swgl_SpanBuf = buf;
      swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf +
                         (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride +
                         (x - swgl_ClipMaskOffset.x);
    }
  }

  template <typename P>
  bool overlaps(int nump, const P* p) const {
    // Generate a mask of which side of the clip rect all of a polygon's points
    // fall inside of. This is a cheap conservative estimate of whether the
    // bounding box of the polygon might overlap the clip rect, rather than an
    // exact test that would require multiple slower line intersections.
    int sides = 0;
    for (int i = 0; i < nump; i++) {
      sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
      sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
    }
    return sides == 0xF;
  }
};
    479 
    480 // Given a current X position at the center Y position of a row, return the X
    481 // position of the left and right intercepts of the row top and bottom.
    482 template <typename E>
    483 static ALWAYS_INLINE FloatRange x_intercepts(const E& e) {
    484  float rad = 0.5f * abs(e.x_slope());
    485  return {e.cur_x() - rad, e.cur_x() + rad};
    486 }
    487 
    488 // Return the AA sub-span corresponding to a given edge. If AA is requested,
    489 // then this finds the X intercepts with the row clipped into range of the
    490 // edge and finally conservatively rounds them out. If there is no AA, then
    491 // it just returns the current rounded X position clipped within bounds.
    492 template <typename E>
    493 static ALWAYS_INLINE IntRange aa_edge(const E& e, const FloatRange& bounds) {
    494  return e.edgeMask ? bounds.clip(x_intercepts(e)).round_out()
    495                    : bounds.clip({e.cur_x(), e.cur_x()}).round();
    496 }
    497 
// Calculate the initial AA coverage as an approximation of the distance from
// the center of the pixel in the direction of the edge slope. Given an edge
// (x,y)..(x+dx,y+dy), then the normalized tangent vector along the edge is
// (dx,dy)/sqrt(dx^2+dy^2). We know that for dy=1 then dx=e.x_slope. We rotate
// the tangent vector either -90 or +90 degrees to get the edge normal vector,
// where 'dx=-dy and 'dy=dx. Once normalized by 1/sqrt(dx^2+dy^2), scale into
// the range of 0..256 so that we can cheaply convert to a fixed-point scale
// factor. It is assumed that at exactly the pixel center the opacity is half
// (128) and linearly decreases along the normal vector at 1:1 scale with the
// slope. While not entirely accurate, this gives a reasonably agreeable looking
// approximation of AA. For edges on which there is no AA, just force the
// opacity to maximum (256) with no slope, relying on the span clipping to trim
// pixels outside the span.
// Returns {coverage at X=0.5, per-pixel coverage delta}; dir is -1 for the
// left edge and +1 for the right edge.
template <typename E>
static ALWAYS_INLINE FloatRange aa_dist(const E& e, float dir) {
  if (e.edgeMask) {
    float dx = (dir * 256.0f) * inversesqrt(1.0f + e.x_slope() * e.x_slope());
    return {128.0f + dx * (e.cur_x() - 0.5f), -dx};
  } else {
    return {256.0f, 0.0f};
  }
}
    520 
// Compute the overall span between the left and right edges at the current
// row, setting up the global AA coverage state (swgl_OpaqueStart/Size,
// swgl_*AADist, swgl_AASlope) when AA is enabled.
template <typename P, typename E>
static ALWAYS_INLINE IntRange aa_span(P* buf, const E& left, const E& right,
                                      const FloatRange& bounds) {
  // If there is no AA, just return the span from the rounded left edge X
  // position to the rounded right edge X position. Clip the span to be within
  // the valid bounds.
  if (!(swgl_ClipFlags & SWGL_CLIP_FLAG_AA)) {
    return bounds.clip({left.cur_x(), right.cur_x()}).round();
  }

  // Calculate the left and right AA spans along with the coverage distances
  // and slopes necessary to do blending.
  IntRange leftAA = aa_edge(left, bounds);
  FloatRange leftDist = aa_dist(left, -1.0f);
  IntRange rightAA = aa_edge(right, bounds);
  FloatRange rightDist = aa_dist(right, 1.0f);

  // Use the pointer into the destination buffer as a status indicator of the
  // coverage offset. The pointer is calculated so that subtracting it with
  // the current destination pointer will yield a negative value if the span
  // is outside the opaque area and otherwise will yield a positive value
  // above the opaque size. This pointer is stored as a uint8 pointer so that
  // there are no hidden multiplication instructions and will just return a
  // 1:1 linear memory address. Thus the size of the opaque region must also
  // be scaled by the pixel size in bytes.
  swgl_OpaqueStart = (const uint8_t*)(buf + leftAA.end);
  swgl_OpaqueSize = max(rightAA.start - leftAA.end - 3, 0) * sizeof(P);

  // Offset the coverage distances by the end of the left AA span, which
  // corresponds to the opaque start pointer, so that pixels become opaque
  // immediately after. The distances are also offset for each lane in the
  // chunk.
  Float offset = cast(leftAA.end + (I32){0, 1, 2, 3});
  swgl_LeftAADist = leftDist.start + offset * leftDist.end;
  swgl_RightAADist = rightDist.start + offset * rightDist.end;
  swgl_AASlope =
      (Float){leftDist.end, rightDist.end, 0.0f, 0.0f} / float(sizeof(P));

  // Return the full span width from the start of the left span to the end of
  // the right span.
  return {leftAA.start, rightAA.end};
}
    563 
    564 // Calculate the span the user clip distances occupy from the left and right
    565 // edges at the current row.
    566 template <typename E>
    567 static ALWAYS_INLINE IntRange clip_distance_range(const E& left,
    568                                                  const E& right) {
    569  Float leftClip = get_clip_distances(left.interp);
    570  Float rightClip = get_clip_distances(right.interp);
    571  // Get the change in clip dist per X step.
    572  Float clipStep = (rightClip - leftClip) / (right.cur_x() - left.cur_x());
    573  // Find the zero intercepts starting from the left edge.
    574  Float clipDist =
    575      clamp(left.cur_x() - leftClip * recip(clipStep), 0.0f, 1.0e6f);
    576  // Find the distance to the start of the span for any clip distances that
    577  // are increasing in value. If the clip distance is constant or decreasing
    578  // in value, then check if it starts outside the clip volume.
    579  Float start = if_then_else(clipStep > 0.0f, clipDist,
    580                             if_then_else(leftClip < 0.0f, 1.0e6f, 0.0f));
    581  // Find the distance to the end of the span for any clip distances that are
    582  // decreasing in value. If the clip distance is constant or increasing in
    583  // value, then check if it ends inside the clip volume.
    584  Float end = if_then_else(clipStep < 0.0f, clipDist,
    585                           if_then_else(rightClip >= 0.0f, 1.0e6f, 0.0f));
    586  // Find the furthest start offset.
    587  start = max(start, start.zwxy);
    588  // Find the closest end offset.
    589  end = min(end, end.zwxy);
    590  // Finally, round the offsets to an integer span that can be used to bound
    591  // the current span.
    592  return FloatRange{max(start.x, start.y), min(end.x, end.y)}.round();
    593 }
    594 
    595 // Converts a run array into a flattened array of depth samples. This just
    596 // walks through every run and fills the samples with the depth value from
    597 // the run.
    598 static void flatten_depth_runs(DepthRun* runs, size_t width) {
    599  if (runs->is_flat()) {
    600    return;
    601  }
    602  while (width > 0) {
    603    size_t n = runs->count;
    604    fill_flat_depth(runs, n, runs->depth);
    605    runs += n;
    606    width -= n;
    607  }
    608 }
    609 
    610 // Helper function for drawing passed depth runs within the depth buffer.
    611 // Flattened depth (perspective or discard) is not supported.
    612 template <typename P>
    613 static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf,
    614                                          DepthCursor& cursor) {
    615  for (;;) {
    616    // Get the span that passes the depth test. Assume on entry that
    617    // any failed runs have already been skipped.
    618    int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask);
    619    // If nothing passed, since we already skipped passed failed runs
    620    // previously, we must have hit the end of the row. Bail out.
    621    if (span <= 0) {
    622      break;
    623    }
    624    if (span >= 4) {
    625      // If we have a draw specialization, try to process as many 4-pixel
    626      // chunks as possible using it.
    627      if (fragment_shader->has_draw_span(buf)) {
    628        int drawn = fragment_shader->draw_span(buf, span & ~3);
    629        buf += drawn;
    630        span -= drawn;
    631      }
    632      // Otherwise, just process each chunk individually.
    633      while (span >= 4) {
    634        fragment_shader->run();
    635        discard_output<false>(buf);
    636        buf += 4;
    637        span -= 4;
    638      }
    639    }
    640    // If we have a partial chunk left over, we still have to process it as if
    641    // it were a full chunk. Mask off only the part of the chunk we want to
    642    // use.
    643    if (span > 0) {
    644      fragment_shader->run();
    645      discard_output<false>(buf, span);
    646      buf += span;
    647    }
    648    // Skip past any runs that fail the depth test.
    649    int skip = cursor.skip_failed(z, ctx->depthfunc);
    650    // If there aren't any, that means we won't encounter any more passing runs
    651    // and so it's safe to bail out.
    652    if (skip <= 0) {
    653      break;
    654    }
    655    // Advance interpolants for the fragment shader past the skipped region.
    656    // If we processed a partial chunk above, we actually advanced the
    657    // interpolants a full chunk in the fragment shader's run function. Thus,
    658    // we need to first subtract off that 4-pixel chunk and only partially
    659    // advance them to that partial chunk before we can add on the rest of the
    660    // skips. This is combined with the skip here for efficiency's sake.
    661    fragment_shader->skip(skip - (span > 0 ? 4 - span : 0));
    662    buf += skip;
    663  }
    664 }
    665 
    666 // Draw a simple span in 4-pixel wide chunks, optionally using depth.
    667 template <bool DISCARD, bool W, typename P, typename Z>
    668 static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) {
    669  if (depth) {
    670    // Depth testing is enabled. If perspective is used, Z values will vary
    671    // across the span, we use packDepth to generate packed Z values suitable
    672    // for depth testing based on current values from gl_FragCoord.z.
    673    // Otherwise, for the no-perspective case, we just use the provided Z.
    674    // Process 4-pixel chunks first.
    675    for (; span >= 4; span -= 4, buf += 4, depth += 4) {
    676      I32 zsrc = z();
    677      ZMask zmask;
    678      if (check_depth<DISCARD>(zsrc, depth, zmask)) {
    679        fragment_shader->run<W>();
    680        mask_output(buf, zmask);
    681        if (DISCARD) discard_depth(zsrc, depth, zmask);
    682      } else {
    683        fragment_shader->skip<W>();
    684      }
    685    }
    686    // If there are any remaining pixels, do a partial chunk.
    687    if (span > 0) {
    688      I32 zsrc = z();
    689      ZMask zmask;
    690      if (check_depth<DISCARD>(zsrc, depth, zmask, span)) {
    691        fragment_shader->run<W>();
    692        mask_output(buf, zmask, span);
    693        if (DISCARD) discard_depth(zsrc, depth, zmask);
    694      }
    695    }
    696  } else {
    697    // Process 4-pixel chunks first.
    698    for (; span >= 4; span -= 4, buf += 4) {
    699      fragment_shader->run<W>();
    700      discard_output<DISCARD>(buf);
    701    }
    702    // If there are any remaining pixels, do a partial chunk.
    703    if (span > 0) {
    704      fragment_shader->run<W>();
    705      discard_output<DISCARD>(buf, span);
    706    }
    707  }
    708 }
    709 
// Called during rasterization to forcefully clear a row on which delayed clear
// has been enabled. If we know that we are going to completely overwrite a part
// of the row, then we only need to clear the row outside of that part. However,
// if blending or discard is enabled, the values of that underlying part of the
// row may be used regardless to produce the final rasterization result, so we
// have to then clear the entire underlying row to prepare it.
// Parameters:
//   colortex       - color buffer texture with delayed clear pending.
//   y              - the row to prepare.
//   startx, endx   - extent of the span that the rasterizer will overwrite.
//   use_discard    - whether the fragment shader may discard samples.
//   depth          - depth row for this span, or null if depth is unused.
//   z, cursor      - constant primitive depth and a cursor into the depth
//                    runs, used to predict which samples will pass the depth
//                    test when runs (rather than flat depth) are in use.
template <typename P>
static inline void prepare_row(Texture& colortex, int y, int startx, int endx,
                               bool use_discard, DepthRun* depth,
                               uint32_t z = 0, DepthCursor* cursor = nullptr) {
  assert(colortex.delay_clear > 0);
  // Delayed clear is enabled for the color buffer. Check if needs clear.
  uint32_t& mask = colortex.cleared_rows[y / 32];
  if ((mask & (1 << (y & 31))) == 0) {
    mask |= 1 << (y & 31);
    colortex.delay_clear--;
    if (blend_key || use_discard) {
      // If depth test, blending, or discard is used, old color values
      // might be sampled, so we need to clear the entire row to fill it.
      force_clear_row<P>(colortex, y);
    } else if (depth) {
      if (depth->is_flat() || !cursor) {
        // If flat depth is used, we can't cheaply predict which samples will
        // pass, so clear the whole row.
        force_clear_row<P>(colortex, y);
      } else {
        // Otherwise if depth runs are used, see how many samples initially pass
        // the depth test and only fill the row outside those. The fragment
        // shader will fill the row within the passed samples.
        // Note: a copy of the cursor is checked so the caller's cursor is not
        // advanced.
        int passed =
            DepthCursor(*cursor).check_passed<false>(z, ctx->depthfunc);
        if (startx > 0 || startx + passed < colortex.width) {
          force_clear_row<P>(colortex, y, startx, startx + passed);
        }
      }
    } else if (startx > 0 || endx < colortex.width) {
      // Otherwise, we only need to clear the row outside of the span.
      // The fragment shader will fill the row within the span itself.
      force_clear_row<P>(colortex, y, startx, endx);
    }
  }
}
    752 
    753 // Perpendicular dot-product is the dot-product of a vector with the
    754 // perpendicular vector of the other, i.e. dot(a, {-b.y, b.x})
    755 template <typename T>
    756 static ALWAYS_INLINE auto perpDot(T a, T b) {
    757  return a.x * b.y - a.y * b.x;
    758 }
    759 
    760 // Check if the winding of the initial edges is flipped, requiring us to swap
    761 // the edges to avoid spans having negative lengths. Assume that l0.y == r0.y
    762 // due to the initial edge scan in draw_quad/perspective_spans.
    763 template <typename T>
    764 static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) {
    765  // If the starting point of the left edge is to the right of the starting
    766  // point of the right edge, then just assume the edges are flipped. If the
    767  // left and right starting points are the same, then check the sign of the
    768  // cross-product of the edges to see if the edges are flipped. Otherwise,
    769  // if the left starting point is actually just to the left of the right
    770  // starting point, then assume no edge flip.
    771  return l0.x > r0.x || (l0.x == r0.x && perpDot(l1 - l0, r1 - r0) > 0.0f);
    772 }
    773 
// Draw spans for each row of a given quad (or triangle) with a constant Z
// value. The quad is assumed convex. It is clipped to fall within the given
// clip rect. In short, this function rasterizes a quad by first finding a
// top most starting point and then from there tracing down the left and right
// sides of this quad until it hits the bottom, outputting a span between the
// current left and right positions at each row along the way. Points are
// assumed to be ordered in either CW or CCW to support this, but currently
// both orders (CW and CCW) are supported and equivalent.
// Note: the NEXT_POINT/PREV_POINT/STEP_EDGE macros defined inside this
// function are also used by draw_perspective_spans below.
template <typename P>
static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z,
                                   Interpolants interp_outs[4],
                                   Texture& colortex, Texture& depthtex,
                                   const ClipRect& clipRect) {
  // Only triangles and convex quads supported.
  assert(nump == 3 || nump == 4);

  Point2D l0, r0, l1, r1;
  int l0i, r0i, l1i, r1i;
  {
    // Find the index of the top-most (smallest Y) point from which
    // rasterization can start.
    int top = nump > 3 && p[3].y < p[2].y
                  ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3)
                                     : (p[1].y < p[3].y ? 1 : 3))
                  : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2)
                                     : (p[1].y < p[2].y ? 1 : 2));
    // Helper to find next index in the points array, walking forward.
#define NEXT_POINT(idx)   \
  ({                      \
    int cur = (idx) + 1;  \
    cur < nump ? cur : 0; \
  })
    // Helper to find the previous index in the points array, walking backward.
#define PREV_POINT(idx)        \
  ({                           \
    int cur = (idx) - 1;       \
    cur >= 0 ? cur : nump - 1; \
  })
    // Start looking for "left"-side and "right"-side descending edges starting
    // from the determined top point.
    int next = NEXT_POINT(top);
    int prev = PREV_POINT(top);
    if (p[top].y == p[next].y) {
      // If the next point is on the same row as the top, then advance one more
      // time to the next point and use that as the "left" descending edge.
      l0i = next;
      l1i = NEXT_POINT(next);
      // Assume top and prev form a descending "right" edge, as otherwise this
      // will be a collapsed polygon and harmlessly bail out down below.
      r0i = top;
      r1i = prev;
    } else if (p[top].y == p[prev].y) {
      // If the prev point is on the same row as the top, then advance to the
      // prev again and use that as the "right" descending edge.
      // Assume top and next form a non-empty descending "left" edge.
      l0i = top;
      l1i = next;
      r0i = prev;
      r1i = PREV_POINT(prev);
    } else {
      // Both next and prev are on distinct rows from top, so both "left" and
      // "right" edges are non-empty/descending.
      l0i = r0i = top;
      l1i = next;
      r1i = prev;
    }
    // Load the points from the indices.
    l0 = p[l0i];  // Start of left edge
    r0 = p[r0i];  // Start of right edge
    l1 = p[l1i];  // End of left edge
    r1 = p[r1i];  // End of right edge
    //    debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1:
    //    %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i,
    //    r1.x, r1.y);
  }

  // Tracks one descending edge of the polygon: its current X intercept and
  // interpolant values at the current row, along with per-row slopes.
  struct Edge {
    float yScale;
    float xSlope;
    float x;
    Interpolants interpSlope;
    Interpolants interp;
    bool edgeMask;

    Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0,
         const Interpolants& i1, int edgeIndex)
        :  // Inverse Y scale for slope calculations. Avoid divide on 0-length
           // edge. Later checks below ensure that Y <= p1.y, or otherwise we
           // don't use this edge. We just need to guard against Y == p1.y ==
           // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes
           // below, except if yScale is Inf for some reason (or worse, NaN),
           // which 1/(p1.y-p0.y) might produce if we don't bound it.
          yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
          // Calculate dX/dY slope
          xSlope((p1.x - p0.x) * yScale),
          // Initialize current X based on Y and slope
          x(p0.x + (y - p0.y) * xSlope),
          // Calculate change in interpolants per change in Y
          interpSlope((i1 - i0) * yScale),
          // Initialize current interpolants based on Y and slope
          interp(i0 + (y - p0.y) * interpSlope),
          // Extract the edge mask status for this edge
          edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {}

    void nextRow() {
      // step current X and interpolants to next row from slope
      x += xSlope;
      interp += interpSlope;
    }

    float cur_x() const { return x; }
    float x_slope() const { return xSlope; }
  };

  // Vertex selection above should result in equal left and right start rows
  assert(l0.y == r0.y);
  // Find the start y, clip to within the clip rect, and round to row center.
  // If AA is enabled, round out conservatively rather than round to nearest.
  float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f;
  float y = floor(max(min(l0.y, clipRect.y1), clipRect.y0) + aaRound) + 0.5f;
  // Initialize left and right edges from end points and start Y
  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
  // WR does not use backface culling, so check if edges are flipped.
  bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1);
  if (flipped) swap(left, right);
  // Get pointer to color buffer and depth buffer at current Y
  P* fbuf = (P*)colortex.sample_ptr(0, int(y));
  DepthRun* fdepth = depthtex.buf != nullptr
                         ? (DepthRun*)depthtex.sample_ptr(0, int(y))
                         : nullptr;
  // Loop along advancing Ys, rasterizing spans at each row
  float checkY = min(min(l1.y, r1.y), clipRect.y1);
  // Ensure we don't rasterize out edge bounds
  FloatRange clipSpan =
      clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
  for (;;) {
    // Check if we maybe passed edge ends or outside clip rect...
    if (y > checkY) {
      // If we're outside the clip rect, we're done.
      if (y > clipRect.y1) break;
      // Helper to find the next non-duplicate vertex that doesn't loop back.
#define STEP_EDGE(y, e0i, e0, e1i, e1, STEP_POINT, end)     \
  do {                                                      \
    /* Set new start of edge to be end of old edge */       \
    e0i = e1i;                                              \
    e0 = e1;                                                \
    /* Set new end of edge to next point */                 \
    e1i = STEP_POINT(e1i);                                  \
    e1 = p[e1i];                                            \
    /* If the edge crossed the end, we're done. */          \
    if (e0i == end) return;                                 \
    /* Otherwise, it doesn't advance, so keep searching. */ \
  } while (y > e1.y)
      // Check if Y advanced past the end of the left edge
      if (y > l1.y) {
        // Step to next left edge past Y and reset edge interpolants.
        STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i);
        // If the edges were flipped, "left" edge data lives in `right`.
        (flipped ? right : left) =
            Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
      }
      // Check if Y advanced past the end of the right edge
      if (y > r1.y) {
        // Step to next right edge past Y and reset edge interpolants.
        STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i);
        (flipped ? left : right) =
            Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
      }
      // Reset the clip bounds for the new edges
      clipSpan =
          clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
      // Reset check condition for next time around.
      checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1);
    }

    // Calculate a potentially AA'd span and check if it is non-empty.
    IntRange span = aa_span(fbuf, left, right, clipSpan);
    if (span.len() > 0) {
      // If user clip planes are enabled, use them to bound the current span.
      if (vertex_shader->use_clip_distance()) {
        span = span.intersect(clip_distance_range(left, right));
        if (span.len() <= 0) goto next_span;
      }
      ctx->shaded_rows++;
      ctx->shaded_pixels += span.len();
      // Advance color/depth buffer pointers to the start of the span.
      P* buf = fbuf + span.start;
      // Check if we will need to use depth-buffer or discard on this span.
      DepthRun* depth =
          depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
      DepthCursor cursor;
      bool use_discard = fragment_shader->use_discard();
      if (use_discard) {
        if (depth) {
          // If we're using discard, we may have to unpredictably drop out some
          // samples. Flatten the depth run array here to allow this.
          if (!depth->is_flat()) {
            flatten_depth_runs(depth, depthtex.width);
          }
          // Advance to the depth sample at the start of the span.
          depth += span.start;
        }
      } else if (depth) {
        if (!depth->is_flat()) {
          // We're not using discard and the depth row is still organized into
          // runs. Skip past any runs that would fail the depth test so we
          // don't have to do any extra work to process them with the rest of
          // the span.
          cursor = DepthCursor(depth, depthtex.width, span.start, span.len());
          int skipped = cursor.skip_failed(z, ctx->depthfunc);
          // If we fell off the row, that means we couldn't find any passing
          // runs. We can just skip the entire span.
          if (skipped < 0) {
            goto next_span;
          }
          buf += skipped;
          span.start += skipped;
        } else {
          // The row is already flattened, so just advance to the span start.
          depth += span.start;
        }
      }

      if (colortex.delay_clear) {
        // Delayed clear is enabled for the color buffer. Check if needs clear.
        prepare_row<P>(colortex, int(y), span.start, span.end, use_discard,
                       depth, z, &cursor);
      }

      // Initialize fragment shader interpolants to current span position.
      fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1);
      fragment_shader->gl_FragCoord.y = y;
      {
        // Change in interpolants is difference between current right and left
        // edges per the change in right and left X. If the left and right X
        // positions are extremely close together, then avoid stepping the
        // interpolants.
        float stepScale = 1.0f / (right.x - left.x);
        if (!isfinite(stepScale)) stepScale = 0.0f;
        Interpolants step = (right.interp - left.interp) * stepScale;
        // Advance current interpolants to X at start of span.
        Interpolants o = left.interp + step * (span.start + 0.5f - left.x);
        fragment_shader->init_span(&o, &step);
      }
      clipRect.set_clip_mask(span.start, y, buf);
      if (!use_discard) {
        // Fast paths for the case where fragment discard is not used.
        if (depth) {
          // If depth is used, we want to process entire depth runs if depth is
          // not flattened.
          if (!depth->is_flat()) {
            draw_depth_span(z, buf, cursor);
            goto next_span;
          }
          // Otherwise, flattened depth must fall back to the slightly slower
          // per-chunk depth test path in draw_span below.
        } else {
          // Check if the fragment shader has an optimized draw specialization.
          if (span.len() >= 4 && fragment_shader->has_draw_span(buf)) {
            // Draw specialization expects 4-pixel chunks.
            int drawn = fragment_shader->draw_span(buf, span.len() & ~3);
            buf += drawn;
            span.start += drawn;
          }
        }
        // Z is constant for the whole primitive here, so pass it as-is.
        draw_span<false, false>(buf, depth, span.len(), [=] { return z; });
      } else {
        // If discard is used, then use slower fallbacks. This should be rare.
        // Just needs to work, doesn't need to be too fast yet...
        draw_span<true, false>(buf, depth, span.len(), [=] { return z; });
      }
    }
  next_span:
    // Advance Y and edge interpolants to next row.
    y++;
    left.nextRow();
    right.nextRow();
    // Advance buffers to next row.
    fbuf += colortex.stride() / sizeof(P);
    fdepth += depthtex.stride() / sizeof(DepthRun);
  }
}
   1056 
// Draw perspective-correct spans for a convex quad that has been clipped to
// the near and far Z planes, possibly producing a clipped convex polygon with
// more than 4 sides. This assumes the Z value will vary across the spans and
// requires interpolants to factor in W values. This tends to be slower than
// the simpler 2D draw_quad_spans above, especially since we can't optimize the
// depth test easily when Z values, and should be used only rarely if possible.
// Note: reuses the NEXT_POINT/PREV_POINT/STEP_EDGE macros defined inside
// draw_quad_spans above.
template <typename P>
static inline void draw_perspective_spans(int nump, Point3D* p,
                                          Interpolants* interp_outs,
                                          Texture& colortex, Texture& depthtex,
                                          const ClipRect& clipRect) {
  Point3D l0, r0, l1, r1;
  int l0i, r0i, l1i, r1i;
  {
    // Find the index of the top-most point (smallest Y) from which
    // rasterization can start.
    int top = 0;
    for (int i = 1; i < nump; i++) {
      if (p[i].y < p[top].y) {
        top = i;
      }
    }
    // Find left-most top point, the start of the left descending edge.
    // Advance forward in the points array, searching at most nump points
    // in case the polygon is flat.
    l0i = top;
    for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
      l0i = i;
    }
    if (l0i == nump - 1) {
      // The top row may wrap around the end of the array; continue the
      // search from the array start.
      for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
        l0i = i;
      }
    }
    // Find right-most top point, the start of the right descending edge.
    // Advance backward in the points array, searching at most nump points.
    r0i = top;
    for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
      r0i = i;
    }
    if (r0i == 0) {
      // The top row may wrap around the start of the array; continue the
      // search from the array end.
      for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
        r0i = i;
      }
    }
    // End of left edge is next point after left edge start.
    l1i = NEXT_POINT(l0i);
    // End of right edge is prev point after right edge start.
    r1i = PREV_POINT(r0i);
    l0 = p[l0i];  // Start of left edge
    r0 = p[r0i];  // Start of right edge
    l1 = p[l1i];  // End of left edge
    r1 = p[r1i];  // End of right edge
  }

  struct Edge {
    float yScale;
    // Current coordinates for edge. Where in the 2D case of draw_quad_spans,
    // it is enough to just track the X coordinate as we advance along the rows,
    // for the perspective case we also need to keep track of Z and W. For
    // simplicity, we just use the full 3D point to track all these coordinates.
    Point3D pSlope;
    Point3D p;
    Interpolants interpSlope;
    Interpolants interp;
    bool edgeMask;

    Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0,
         const Interpolants& i1, int edgeIndex)
        :  // Inverse Y scale for slope calculations. Avoid divide on 0-length
           // edge.
          yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
          // Calculate dX/dY slope
          pSlope((p1 - p0) * yScale),
          // Initialize current coords based on Y and slope
          p(p0 + (y - p0.y) * pSlope),
          // Crucially, these interpolants must be scaled by the point's 1/w
          // value, which allows linear interpolation in a perspective-correct
          // manner. This will be canceled out inside the fragment shader later.
          // Calculate change in interpolants per change in Y
          interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
          // Initialize current interpolants based on Y and slope
          interp(i0 * p0.w + (y - p0.y) * interpSlope),
          // Extract the edge mask status for this edge
          edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {}

    float x() const { return p.x; }
    vec2_scalar zw() const { return {p.z, p.w}; }

    void nextRow() {
      // step current coords and interpolants to next row from slope
      p += pSlope;
      interp += interpSlope;
    }

    float cur_x() const { return p.x; }
    float x_slope() const { return pSlope.x; }
  };

  // Vertex selection above should result in equal left and right start rows
  assert(l0.y == r0.y);
  // Find the start y, clip to within the clip rect, and round to row center.
  // If AA is enabled, round out conservatively rather than round to nearest.
  float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f;
  float y = floor(max(min(l0.y, clipRect.y1), clipRect.y0) + aaRound) + 0.5f;
  // Initialize left and right edges from end points and start Y
  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
  // WR does not use backface culling, so check if edges are flipped.
  bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1);
  if (flipped) swap(left, right);
  // Get pointer to color buffer and depth buffer at current Y
  P* fbuf = (P*)colortex.sample_ptr(0, int(y));
  DepthRun* fdepth = depthtex.buf != nullptr
                         ? (DepthRun*)depthtex.sample_ptr(0, int(y))
                         : nullptr;
  // Loop along advancing Ys, rasterizing spans at each row
  float checkY = min(min(l1.y, r1.y), clipRect.y1);
  // Ensure we don't rasterize out edge bounds
  FloatRange clipSpan =
      clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
  for (;;) {
    // Check if we maybe passed edge ends or outside clip rect...
    if (y > checkY) {
      // If we're outside the clip rect, we're done.
      if (y > clipRect.y1) break;
      // Check if Y advanced past the end of the left edge
      if (y > l1.y) {
        // Step to next left edge past Y and reset edge interpolants.
        STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i);
        // If the edges were flipped, "left" edge data lives in `right`.
        (flipped ? right : left) =
            Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
      }
      // Check if Y advanced past the end of the right edge
      if (y > r1.y) {
        // Step to next right edge past Y and reset edge interpolants.
        STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i);
        (flipped ? left : right) =
            Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
      }
      // Reset the clip bounds for the new edges
      clipSpan =
          clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
      // Reset check condition for next time around.
      checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1);
    }

    // Calculate a potentially AA'd span and check if it is non-empty.
    IntRange span = aa_span(fbuf, left, right, clipSpan);
    if (span.len() > 0) {
      // If user clip planes are enabled, use them to bound the current span.
      if (vertex_shader->use_clip_distance()) {
        span = span.intersect(clip_distance_range(left, right));
        if (span.len() <= 0) goto next_span;
      }
      ctx->shaded_rows++;
      ctx->shaded_pixels += span.len();
      // Advance color/depth buffer pointers to the start of the span.
      P* buf = fbuf + span.start;
      // Check if we will need to use depth-buffer or discard on this span.
      DepthRun* depth =
          depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
      bool use_discard = fragment_shader->use_discard();
      if (depth) {
        // Perspective may cause the depth value to vary on a per sample basis.
        // Ensure the depth row is flattened to allow testing of individual
        // samples
        if (!depth->is_flat()) {
          flatten_depth_runs(depth, depthtex.width);
        }
        // Advance to the depth sample at the start of the span.
        depth += span.start;
      }
      if (colortex.delay_clear) {
        // Delayed clear is enabled for the color buffer. Check if needs clear.
        prepare_row<P>(colortex, int(y), span.start, span.end, use_discard,
                       depth);
      }
      // Initialize fragment shader interpolants to current span position.
      fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1);
      fragment_shader->gl_FragCoord.y = y;
      {
        // Calculate the fragment Z and W change per change in fragment X step.
        // If the left and right X positions are extremely close together, then
        // avoid stepping.
        float stepScale = 1.0f / (right.x() - left.x());
        if (!isfinite(stepScale)) stepScale = 0.0f;
        vec2_scalar stepZW = (right.zw() - left.zw()) * stepScale;
        // Calculate initial Z and W values for span start.
        vec2_scalar zw = left.zw() + stepZW * (span.start + 0.5f - left.x());
        // Set fragment shader's Z and W values so that it can use them to
        // cancel out the 1/w baked into the interpolants.
        fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
        fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
        fragment_shader->swgl_StepZW = stepZW;
        // Change in interpolants is difference between current right and left
        // edges per the change in right and left X. The left and right
        // interpolant values were previously multiplied by 1/w, so the step and
        // initial span values take this into account.
        Interpolants step = (right.interp - left.interp) * stepScale;
        // Advance current interpolants to X at start of span.
        Interpolants o = left.interp + step * (span.start + 0.5f - left.x());
        fragment_shader->init_span<true>(&o, &step);
      }
      clipRect.set_clip_mask(span.start, y, buf);
      if (!use_discard) {
        // No discard is used. Common case.
        draw_span<false, true>(buf, depth, span.len(), packDepth);
      } else {
        // Discard is used. Rare.
        draw_span<true, true>(buf, depth, span.len(), packDepth);
      }
    }
  next_span:
    // Advance Y and edge interpolants to next row.
    y++;
    left.nextRow();
    right.nextRow();
    // Advance buffers to next row.
    fbuf += colortex.stride() / sizeof(P);
    fdepth += depthtex.stride() / sizeof(DepthRun);
  }
}
   1280 
   1281 // Clip a primitive against both sides of a view-frustum axis, producing
   1282 // intermediate vertexes with interpolated attributes that will no longer
   1283 // intersect the selected axis planes. This assumes the primitive is convex
   1284 // and should produce at most N+2 vertexes for each invocation (only in the
   1285 // worst case where one point falls outside on each of the opposite sides
   1286 // with the rest of the points inside). The supplied AA edge mask will be
   1287 // modified such that it corresponds to the clipped polygon edges.
template <XYZW AXIS>
static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
                     Interpolants* outInterp, int& outEdgeMask) {
  // Potential mask bits of which side of a plane a coordinate falls on.
  enum SIDE { POSITIVE = 1, NEGATIVE = 2 };
  // Number of vertexes written to the output arrays so far.
  int numClip = 0;
  int edgeMask = outEdgeMask;
  // Seed the edge walk with the last vertex so the first loop iteration
  // processes the closing edge (last vertex -> first vertex).
  Point3D prev = p[nump - 1];
  Interpolants prevInterp = interp[nump - 1];
  float prevCoord = prev.select(AXIS);
  // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
  // if so, remember which side it is outside of. In the special case that W is
  // negative and |C| < |W|, both -W <= C and C <= W will be false, such that
  // we must consider the coordinate as falling outside of both plane sides
  // simultaneously. We test each condition separately and combine them to form
  // a mask of which plane sides we exceeded. If we neglect to consider both
  // sides simultaneously, points can erroneously oscillate from one plane side
  // to the other and exceed the supported maximum number of clip outputs.
  int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) |
                 (prevCoord > prev.w ? POSITIVE : 0);
  // Loop through points, finding edges that cross the planes by evaluating
  // the side at each point. edgeMask is shifted each iteration so that bit 0
  // is always the AA mask bit of the current edge.
  outEdgeMask = 0;
  for (int i = 0; i < nump; i++, edgeMask >>= 1) {
    Point3D cur = p[i];
    Interpolants curInterp = interp[i];
    float curCoord = cur.select(AXIS);
    int curMask =
        (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0);
    // Check if the previous and current end points are on different sides. If
    // the masks of sides intersect, then we consider them to be on the same
    // side. So in the case the masks do not intersect, we then consider them
    // to fall on different sides.
    if (!(curMask & prevMask)) {
      // One of the edge's end points is outside the plane with the other
      // inside the plane. Find the offset where it crosses the plane and
      // adjust the point and interpolants to there.
      if (prevMask) {
        // Edge that was previously outside crosses inside.
        // Evaluate plane equation for previous and current end-point
        // based on previous side and calculate relative offset.
        if (numClip >= nump + 2) {
          // If for some reason we produced more vertexes than we support, just
          // bail out.
          assert(false);
          return 0;
        }
        // The positive plane is assigned the sign 1, and the negative plane is
        // assigned -1. If the point falls outside both planes, that means W is
        // negative. To compensate for this, we must interpolate the coordinate
        // till W=0, at which point we can choose a single plane side for the
        // coordinate to fall on since W will no longer be negative. To compute
        // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and
        // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be
        // the side of the plane we need to consider. Substituting K into the
        // comparison C < 0, we can then avoid the division in K with a
        // cross-multiplication.
        float prevSide =
            (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) ||
                                      prevCoord * (cur.w - prev.w) <
                                          prev.w * (curCoord - prevCoord))
                ? -1
                : 1;
        // Signed distances from each end point to the chosen plane.
        float prevDist = prevCoord - prevSide * prev.w;
        float curDist = curCoord - prevSide * cur.w;
        // It may happen that after we interpolate by the weight k that due to
        // floating point rounding we've underestimated the value necessary to
        // push it over the clipping boundary. Just in case, nudge the mantissa
        // by a single increment so that we essentially round it up and move it
        // further inside the clipping boundary. We use nextafter to do this in
        // a portable fashion.
        float k = prevDist / (prevDist - curDist);
        Point3D clipped = prev + (cur - prev) * k;
        if (prevSide * clipped.select(AXIS) > clipped.w) {
          k = nextafterf(k, 1.0f);
          clipped = prev + (cur - prev) * k;
        }
        outP[numClip] = clipped;
        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
        // Don't output the current edge mask since start point was outside.
        numClip++;
      }
      if (curMask) {
        // Edge that was previously inside crosses outside.
        // Evaluate plane equation for previous and current end-point
        // based on current side and calculate relative offset.
        if (numClip >= nump + 2) {
          assert(false);
          return 0;
        }
        // In the case the coordinate falls on both plane sides, the computation
        // here is much the same as for prevSide, but since we are going from a
        // previous W that is positive to current W that is negative, then the
        // sign of cur.w - prev.w will flip in the equation. The resulting sign
        // is negated to compensate for this.
        float curSide =
            (curMask & POSITIVE) && (!(curMask & NEGATIVE) ||
                                     prevCoord * (cur.w - prev.w) <
                                         prev.w * (curCoord - prevCoord))
                ? 1
                : -1;
        float prevDist = prevCoord - curSide * prev.w;
        float curDist = curCoord - curSide * cur.w;
        // Calculate the interpolation weight k and then nudge it inside the
        // clipping boundary with nextafter. Note that since we were previously
        // inside and now crossing outside, we have to flip the nudge direction
        // for the weight towards 0 instead of 1.
        float k = prevDist / (prevDist - curDist);
        Point3D clipped = prev + (cur - prev) * k;
        if (curSide * clipped.select(AXIS) > clipped.w) {
          k = nextafterf(k, 0.0f);
          clipped = prev + (cur - prev) * k;
        }
        outP[numClip] = clipped;
        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
        // Output the current edge mask since the end point is inside.
        outEdgeMask |= (edgeMask & 1) << numClip;
        numClip++;
      }
    }
    if (!curMask) {
      // The current end point is inside the plane, so output point unmodified.
      if (numClip >= nump + 2) {
        assert(false);
        return 0;
      }
      outP[numClip] = cur;
      outInterp[numClip] = curInterp;
      // Output the current edge mask since the end point is inside.
      outEdgeMask |= (edgeMask & 1) << numClip;
      numClip++;
    }
    // Advance to the next edge of the polygon.
    prev = cur;
    prevInterp = curInterp;
    prevCoord = curCoord;
    prevMask = curMask;
  }
  // Return the number of clipped vertexes written to outP/outInterp.
  return numClip;
}
   1427 
   1428 // Helper function to dispatch to perspective span drawing with points that
   1429 // have already been transformed and clipped.
   1430 static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
   1431                                            Interpolants* interp_clip,
   1432                                            Texture& colortex,
   1433                                            Texture& depthtex) {
   1434  // If polygon is ouside clip rect, nothing to draw.
   1435  ClipRect clipRect(colortex);
   1436  if (!clipRect.overlaps(nump, p_clip)) {
   1437    return;
   1438  }
   1439 
   1440  // Finally draw perspective-correct spans for the polygon.
   1441  if (colortex.internal_format == GL_RGBA8) {
   1442    draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex,
   1443                                     depthtex, clipRect);
   1444  } else if (colortex.internal_format == GL_R8) {
   1445    draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex,
   1446                                    depthtex, clipRect);
   1447  } else {
   1448    assert(false);
   1449  }
   1450 }
   1451 
   1452 // Draws a perspective-correct 3D primitive with varying Z value, as opposed
   1453 // to a simple 2D planar primitive with a constant Z value that could be
   1454 // trivially Z rejected. This requires clipping the primitive against the near
   1455 // and far planes to ensure it stays within the valid Z-buffer range. The Z
   1456 // and W of each fragment of the primitives are interpolated across the
   1457 // generated spans and then depth-tested as appropriate.
   1458 // Additionally, vertex attributes must be interpolated with perspective-
   1459 // correction by dividing by W before interpolation, and then later multiplied
   1460 // by W again to produce the final correct attribute value for each fragment.
   1461 // This process is expensive and should be avoided if possible for primitive
   1462 // batches that are known ahead of time to not need perspective-correction.
static void draw_perspective(int nump, Interpolants interp_outs[4],
                             Texture& colortex, Texture& depthtex) {
  // Lines are not supported with perspective.
  assert(nump >= 3);
  // Convert output of vertex shader to screen space.
  vec4 pos = vertex_shader->gl_Position;
  // Scale/offset mapping normalized device coordinates to the viewport,
  // adjusted for the color buffer's origin offset.
  vec3_scalar scale =
      vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f;
  vec3_scalar offset =
      make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) +
      scale;
  // Verify if point is between near and far planes, rejecting NaN.
  if (test_all(pos.z > -pos.w && pos.z < pos.w)) {
    // No points cross the near or far planes, so no clipping required.
    // Just divide coords by W and convert to viewport. We assume the W
    // coordinate is non-zero and the reciprocal is finite since it would
    // otherwise fail the test_all condition above.
    Float w = 1.0f / pos.w;
    vec3 screen = pos.sel(X, Y, Z) * w * scale + offset;
    // Transpose the SIMD lanes into an array of per-vertex screen points,
    // keeping 1/W for later perspective-correct interpolation.
    Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x},
                    {screen.x.y, screen.y.y, screen.z.y, w.y},
                    {screen.x.z, screen.y.z, screen.z.z, w.z},
                    {screen.x.w, screen.y.w, screen.z.w, w.w}};
    draw_perspective_clipped(nump, p, interp_outs, colortex, depthtex);
  } else {
    // Points cross the near or far planes, so we need to clip.
    // Start with the original 3 or 4 points...
    Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x},
                    {pos.x.y, pos.y.y, pos.z.y, pos.w.y},
                    {pos.x.z, pos.y.z, pos.z.z, pos.w.z},
                    {pos.x.w, pos.y.w, pos.z.w, pos.w.w}};
    // Clipping can expand the points by 1 for each of 6 view frustum planes.
    Point3D p_clip[4 + 6];
    Interpolants interp_clip[4 + 6];
    // Clip against near and far Z planes.
    nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip,
                        swgl_AAEdgeMask);
    // If no points are left inside the view frustum, there's nothing to draw.
    if (nump < 3) {
      return;
    }
    // After clipping against only the near and far planes, we might still
    // produce points where W = 0, exactly at the camera plane. OpenGL specifies
    // that for clip coordinates, points must satisfy:
    //   -W <= X <= W
    //   -W <= Y <= W
    //   -W <= Z <= W
    // When Z = W = 0, this is trivially satisfied, but when we transform and
    // divide by W below it will produce a divide by 0. Usually we want to only
    // clip Z to avoid the extra work of clipping X and Y. We can still project
    // points that fall outside the view frustum X and Y so long as Z is valid.
    // The span drawing code will then ensure X and Y are clamped to viewport
    // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y,
    // will push W further inside the view frustum so that it is no longer 0,
    // allowing us to finally proceed to projecting the points to the screen.
    for (int i = 0; i < nump; i++) {
      // Found an invalid W, so need to clip against X and Y...
      if (p_clip[i].w <= 0.0f) {
        // Ping-pong p_clip -> p_tmp -> p_clip.
        Point3D p_tmp[4 + 6];
        Interpolants interp_tmp[4 + 6];
        nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp,
                            swgl_AAEdgeMask);
        if (nump < 3) return;
        nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip,
                            swgl_AAEdgeMask);
        if (nump < 3) return;
        // After clipping against X and Y planes, there's still points left
        // to draw, so proceed to trying projection now...
        break;
      }
    }
    // Divide coords by W and convert to viewport.
    for (int i = 0; i < nump; i++) {
      float w = 1.0f / p_clip[i].w;
      // If the W coord is essentially zero, small enough that division would
      // result in Inf/NaN, then just set the point to all zeroes, as the only
      // point that satisfies -W <= X/Y/Z <= W is all zeroes.
      p_clip[i] = isfinite(w)
                      ? Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w)
                      : Point3D(0.0f);
    }
    draw_perspective_clipped(nump, p_clip, interp_clip, colortex, depthtex);
  }
}
   1548 
static void draw_quad(int nump, Texture& colortex, Texture& depthtex) {
  // Run vertex shader once for the primitive's vertices.
  // Interpolant outputs for up to 4 vertexes. The perspective path reserves
  // its own expanded arrays when it needs to clip against the view frustum.
  Interpolants interp_outs[4];
  swgl_ClipFlags = 0;
  vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants));
  vec4 pos = vertex_shader->gl_Position;
  // Check if any vertex W is different from another. If so, use perspective.
  if (test_any(pos.w != pos.w.x)) {
    draw_perspective(nump, interp_outs, colortex, depthtex);
    return;
  }

  // Convert output of vertex shader to screen space.
  // Divide coords by W and convert to viewport. W is uniform across vertexes,
  // so the first lane's W suffices.
  float w = 1.0f / pos.w.x;
  // If the W coord is essentially zero, small enough that division would
  // result in Inf/NaN, then just set the reciprocal itself to zero so that
  // the coordinates becomes zeroed out, as the only valid point that
  // satisfies -W <= X/Y/Z <= W is all zeroes.
  if (!isfinite(w)) w = 0.0f;
  vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f *
                    vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
                make_vec2(ctx->viewport.origin() - colortex.offset);
  // Transpose the SIMD lanes into an array of per-vertex 2D points.
  Point2D p[4] = {{screen.x.x, screen.y.x},
                  {screen.x.y, screen.y.y},
                  {screen.x.z, screen.y.z},
                  {screen.x.w, screen.y.w}};

  // If quad is outside clip rect, nothing to draw.
  ClipRect clipRect(colortex);
  if (!clipRect.overlaps(nump, p)) {
    return;
  }

  // Since the quad is assumed 2D, Z is constant across the quad.
  float screenZ = (pos.z.x * w + 1) * 0.5f;
  if (screenZ < 0 || screenZ > 1) {
    // Z values would cross the near or far plane, so just bail.
    return;
  }
  // Since Z doesn't need to be interpolated, just set the fragment shader's
  // Z and W values here, once and for all fragment shader invocations.
  uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ);
  fragment_shader->gl_FragCoord.z = screenZ;
  fragment_shader->gl_FragCoord.w = w;

  // If supplied a line, adjust it so that it is a quad at least 1 pixel thick.
  // Assume that for a line that all 4 SIMD lanes were actually filled with
  // vertexes 0, 1, 1, 0.
  if (nump == 2) {
    // Nudge Y height to span at least 1 pixel by advancing to next pixel
    // boundary so that we step at least 1 row when drawing spans.
    if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) {
      p[2].y = 1 + int(p[1].y + 0.5f);
      p[3].y = p[2].y;
      // Nudge X width to span at least 1 pixel so that rounded coords fall on
      // separate pixels.
      if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) {
        p[1].x += 1.0f;
        p[2].x += 1.0f;
      }
    } else {
      // If the line already spans at least 1 row, then assume line is vertical
      // or diagonal and just needs to be dilated horizontally.
      p[2].x += 1.0f;
      p[3].x += 1.0f;
    }
    // Pretend that it's a quad now...
    nump = 4;
  }

  // Finally draw 2D spans for the quad. Currently only supports drawing to
  // RGBA8 and R8 color buffers.
  if (colortex.internal_format == GL_RGBA8) {
    draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, depthtex,
                              clipRect);
  } else if (colortex.internal_format == GL_R8) {
    draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, depthtex,
                             clipRect);
  } else {
    assert(false);
  }
}
   1634 
// Draws primitives from the currently bound element array buffer, once per
// instance. INDEX is the element type of the index buffer, chosen by the
// caller from the GL index type. Runs of 3 sequential indexes are drawn as
// triangles, and pairs of triangles matching the quad index pattern are
// merged and drawn as a single quad.
template <typename INDEX>
static inline void draw_elements(GLsizei count, GLsizei instancecount,
                                 size_t offset, VertexArray& v,
                                 Texture& colortex, Texture& depthtex) {
  Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding];
  // Bail out if there is no index buffer or the start offset is out of range.
  if (!indices_buf.buf || offset >= indices_buf.size) {
    return;
  }
  // The byte offset must be aligned to the index element size.
  assert((offset & (sizeof(INDEX) - 1)) == 0);
  INDEX* indices = (INDEX*)(indices_buf.buf + offset);
  // Clamp count so we never read indexes past the end of the buffer.
  count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
  // Triangles must be indexed at offsets 0, 1, 2.
  // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
  if (count == 6 && indices[1] == indices[0] + 1 &&
      indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
    assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
    // Fast path - since there is only a single quad, we only load per-vertex
    // attribs once for all instances, as they won't change across instances
    // or within an instance.
    vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
    draw_quad(4, colortex, depthtex);
    for (GLsizei instance = 1; instance < instancecount; instance++) {
      // NOTE(review): the vertex count of 0 here presumably reloads only
      // per-instance attribs while skipping per-vertex attribs — confirm
      // against load_attribs.
      vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
      draw_quad(4, colortex, depthtex);
    }
  } else {
    for (GLsizei instance = 0; instance < instancecount; instance++) {
      for (GLsizei i = 0; i + 3 <= count; i += 3) {
        // Skip any triangle whose indexes are not sequential; nothing is
        // drawn for it.
        if (indices[i + 1] != indices[i] + 1 ||
            indices[i + 2] != indices[i] + 2) {
          continue;
        }
        if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
          // The next triangle completes this one into a quad, so draw both
          // as one quad and skip over the second triangle's indexes.
          assert(indices[i + 3] == indices[i] + 2 &&
                 indices[i + 4] == indices[i] + 1);
          vertex_shader->load_attribs(v.attribs, indices[i], instance, 4);
          draw_quad(4, colortex, depthtex);
          i += 3;
        } else {
          // Lone sequential triangle.
          vertex_shader->load_attribs(v.attribs, indices[i], instance, 3);
          draw_quad(3, colortex, depthtex);
        }
      }
    }
  }
}