tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

composite.h (54346B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 // Converts a pixel from a source format to a destination format. By default,
      6 // just return the value unchanged as for a simple copy.
      7 template <typename P, typename U>
      8 static ALWAYS_INLINE P convert_pixel(U src) {
      9  return src;
     10 }
     11 
     12 // R8 format maps to BGRA value 0,0,R,1. The byte order is endian independent,
     13 // but the shifts unfortunately depend on endianness.
     14 template <>
     15 ALWAYS_INLINE uint32_t convert_pixel<uint32_t>(uint8_t src) {
     16 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     17  return (uint32_t(src) << 16) | 0xFF000000;
     18 #else
     19  return (uint32_t(src) << 8) | 0x000000FF;
     20 #endif
     21 }
     22 
     23 // RG8 format maps to BGRA value 0,G,R,1.
     24 template <>
     25 ALWAYS_INLINE uint32_t convert_pixel<uint32_t>(uint16_t src) {
     26  uint32_t rg = src;
     27 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     28  return ((rg & 0x00FF) << 16) | (rg & 0xFF00) | 0xFF000000;
     29 #else
     30  return (rg & 0xFF00) | ((rg & 0x00FF) << 16) | 0x000000FF;
     31 #endif
     32 }
     33 
     34 // RGBA8 format maps to R.
     35 template <>
     36 ALWAYS_INLINE uint8_t convert_pixel<uint8_t>(uint32_t src) {
     37 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     38  return (src >> 16) & 0xFF;
     39 #else
     40  return (src >> 8) & 0xFF;
     41 #endif
     42 }
     43 
     44 // RGBA8 formats maps to R,G.
     45 template <>
     46 ALWAYS_INLINE uint16_t convert_pixel<uint16_t>(uint32_t src) {
     47 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     48  return ((src >> 16) & 0x00FF) | (src & 0xFF00);
     49 #else
     50  return (src & 0xFF00) | ((src >> 16) & 0x00FF);
     51 #endif
     52 }
     53 
     54 // R8 format maps to R,0.
     55 template <>
     56 ALWAYS_INLINE uint16_t convert_pixel<uint16_t>(uint8_t src) {
     57 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     58  return src;
     59 #else
     60  return uint16_t(src) << 8;
     61 #endif
     62 }
     63 
     64 // RG8 format maps to R.
     65 template <>
     66 ALWAYS_INLINE uint8_t convert_pixel<uint8_t>(uint16_t src) {
     67 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     68  return src & 0xFF;
     69 #else
     70  return src >> 8;
     71 #endif
     72 }
     73 
     74 // Apply a u8 alpha mask to a u32 texture row
     75 static inline void mask_row(uint32_t* dst, const uint8_t* mask, int span) {
     76  auto* end = dst + span;
     77  while (dst + 4 <= end) {
     78    WideRGBA8 maskpx = expand_mask(dst, unpack(unaligned_load<PackedR8>(mask)));
     79    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
     80    PackedRGBA8 r = pack(muldiv255(dstpx, maskpx));
     81    unaligned_store(dst, r);
     82    mask += 4;
     83    dst += 4;
     84  }
     85  if (dst < end) {
     86    WideRGBA8 maskpx =
     87        expand_mask(dst, unpack(partial_load_span<PackedR8>(mask, end - dst)));
     88    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
     89    auto r = pack(maskpx + dstpx - muldiv255(dstpx, maskpx));
     90    partial_store_span(dst, r, end - dst);
     91  }
     92 }
     93 
     94 // Apply a R8 alpha mask to a RGBA8 texture
     95 static NO_INLINE void mask_blit(Texture& masktex, Texture& dsttex) {
     96  int maskStride = masktex.stride();
     97  int destStride = dsttex.stride();
     98  char* dest = dsttex.sample_ptr(0, 0);
     99  char* mask = masktex.sample_ptr(0, 0);
    100  int span = dsttex.width;
    101 
    102  for (int rows = dsttex.height; rows > 0; rows--) {
    103    mask_row((uint32_t*)dest, (uint8_t*)mask, span);
    104    dest += destStride;
    105    mask += maskStride;
    106  }
    107 }
    108 
    109 template <bool COMPOSITE, typename P>
    110 static inline void copy_row(P* dst, const P* src, int span) {
    111  // No scaling, so just do a fast copy.
    112  memcpy(dst, src, span * sizeof(P));
    113 }
    114 
    115 template <>
    116 void copy_row<true, uint32_t>(uint32_t* dst, const uint32_t* src, int span) {
    117  // No scaling, so just do a fast composite.
    118  auto* end = dst + span;
    119  while (dst + 4 <= end) {
    120    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
    121    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
    122    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    123    unaligned_store(dst, r);
    124    src += 4;
    125    dst += 4;
    126  }
    127  if (dst < end) {
    128    WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - dst));
    129    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
    130    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    131    partial_store_span(dst, r, end - dst);
    132  }
    133 }
    134 
    135 template <bool COMPOSITE, typename P, typename U>
    136 static inline void scale_row(P* dst, int dstWidth, const U* src, int srcWidth,
    137                             int span, int frac) {
    138  // Do scaling with different source and dest widths.
    139  for (P* end = dst + span; dst < end; dst++) {
    140    *dst = convert_pixel<P>(*src);
    141    // Step source according to width ratio.
    142    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
    143      src++;
    144    }
    145  }
    146 }
    147 
// Scaled RGBA8 rows with compositing: gather scaled source pixels and blend
// them over the destination with the src-over operator.
template <>
void scale_row<true, uint32_t, uint32_t>(uint32_t* dst, int dstWidth,
                                         const uint32_t* src, int srcWidth,
                                         int span, int frac) {
  // Do scaling with different source and dest widths.
  // Gather source pixels four at a time for better packing.
  auto* end = dst + span;
  for (; dst + 4 <= end; dst += 4) {
    U32 srcn;
    srcn.x = *src;
    // Integer DDA: accumulate srcWidth per dest pixel and advance the source
    // each time the accumulator overflows dstWidth, so no float error accrues.
    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
      src++;
    }
    srcn.y = *src;
    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
      src++;
    }
    srcn.z = *src;
    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
      src++;
    }
    srcn.w = *src;
    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
      src++;
    }
    // Blend the gathered chunk over the destination: src-over.
    WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn));
    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    unaligned_store(dst, r);
  }
  if (dst < end) {
    // Process any remaining pixels. Try to gather as many pixels as possible
    // into a single source chunk for compositing.
    U32 srcn = {*src, 0, 0, 0};
    if (end - dst > 1) {
      for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
        src++;
      }
      srcn.y = *src;
      if (end - dst > 2) {
        for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
          src++;
        }
        srcn.z = *src;
      }
    }
    // Partial load/store keeps the blend from touching memory past the row.
    WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn));
    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    partial_store_span(dst, r, end - dst);
  }
}
    200 
// Nearest-neighbor blit from srcReq in srctex to dstReq in dsttex, optionally
// converting between 1/2/4 bpp formats, flipping Y, and (when COMPOSITE)
// blending RGBA8 source over RGBA8 destination. clipRect is expressed
// relative to dstReq.
template <bool COMPOSITE = false>
static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq,
                                 Texture& dsttex, const IntRect& dstReq,
                                 bool invertY, const IntRect& clipRect) {
  // Compositing is only supported between RGBA8 textures.
  assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 &&
                        dsttex.internal_format == GL_RGBA8));
  // Cache scaling ratios
  int srcWidth = srcReq.width();
  int srcHeight = srcReq.height();
  int dstWidth = dstReq.width();
  int dstHeight = dstReq.height();
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq).intersect(clipRect);
  // Compute valid source bounds
  IntRect srcBounds = srctex.sample_bounds(srcReq, invertY);
  // If srcReq is outside the source texture, we need to clip the sampling
  // bounds so that we never sample outside valid source bounds. Get texture
  // bounds relative to srcReq and scale to dest-space rounding inward, using
  // this rect to limit the dest bounds further.
  IntRect srcClip = srctex.bounds() - srcReq.origin();
  if (invertY) {
    srcClip.invert_y(srcReq.height());
  }
  srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true);
  dstBounds.intersect(srcClip);
  // Check if clipped sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }

  // Calculate source and dest pointers from clamped offsets
  int srcStride = srctex.stride();
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
  // Clip the source bounds by the destination offset.
  // fracX/fracY are integer DDA accumulators: the whole part of
  // (srcExtent * dstOffset) / dstExtent positions the source start, and the
  // remainder seeds the per-pixel/per-row stepping below.
  int fracX = srcWidth * dstBounds.x0;
  int fracY = srcHeight * dstBounds.y0;
  srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0);
  srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0);
  fracX %= dstWidth;
  fracY %= dstHeight;
  char* src = srctex.sample_ptr(srcReq, srcBounds, invertY);
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcStride = -srcStride;
  }
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    // Dispatch per row on the source/dest pixel sizes. When both sides share
    // a bpp and the row is unscaled, fall through to a fast copy; otherwise
    // convert and/or scale pixel by pixel.
    switch (srctex.bpp()) {
      case 1:
        switch (dsttex.bpp()) {
          case 2:
            scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint8_t*)src,
                                 srcWidth, span, fracX);
            break;
          case 4:
            scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint8_t*)src,
                                 srcWidth, span, fracX);
            break;
          default:
            if (srcWidth == dstWidth)
              copy_row<COMPOSITE>((uint8_t*)dest, (uint8_t*)src, span);
            else
              scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint8_t*)src,
                                   srcWidth, span, fracX);
            break;
        }
        break;
      case 2:
        switch (dsttex.bpp()) {
          case 1:
            scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint16_t*)src,
                                 srcWidth, span, fracX);
            break;
          case 4:
            scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint16_t*)src,
                                 srcWidth, span, fracX);
            break;
          default:
            if (srcWidth == dstWidth)
              copy_row<COMPOSITE>((uint16_t*)dest, (uint16_t*)src, span);
            else
              scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint16_t*)src,
                                   srcWidth, span, fracX);
            break;
        }
        break;
      case 4:
        switch (dsttex.bpp()) {
          case 1:
            scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint32_t*)src,
                                 srcWidth, span, fracX);
            break;
          case 2:
            scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint32_t*)src,
                                 srcWidth, span, fracX);
            break;
          default:
            if (srcWidth == dstWidth)
              copy_row<COMPOSITE>((uint32_t*)dest, (uint32_t*)src, span);
            else
              scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint32_t*)src,
                                   srcWidth, span, fracX);
            break;
        }
        break;
      default:
        assert(false);
        break;
    }
    dest += destStride;
    // Step source according to height ratio.
    for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) {
      src += srcStride;
    }
  }
}
    318 
    319 template <bool COMPOSITE>
    320 static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
    321                            float srcDU, sampler2D sampler) {
    322  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
    323  for (; span >= 4; span -= 4) {
    324    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv));
    325    unaligned_store(dest, srcpx);
    326    dest += 4;
    327    uv.x += 4 * srcDU;
    328  }
    329  if (span > 0) {
    330    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv));
    331    partial_store_span(dest, srcpx, span);
    332  }
    333 }
    334 
    335 template <>
    336 void linear_row_blit<true>(uint32_t* dest, int span, const vec2_scalar& srcUV,
    337                           float srcDU, sampler2D sampler) {
    338  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
    339  for (; span >= 4; span -= 4) {
    340    WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv));
    341    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
    342    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    343    unaligned_store(dest, r);
    344 
    345    dest += 4;
    346    uv.x += 4 * srcDU;
    347  }
    348  if (span > 0) {
    349    WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv));
    350    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
    351    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
    352    partial_store_span(dest, r, span);
    353  }
    354 }
    355 
    356 template <bool COMPOSITE>
    357 static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
    358                            float srcDU, sampler2D sampler) {
    359  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
    360  for (; span >= 4; span -= 4) {
    361    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv));
    362    unaligned_store(dest, srcpx);
    363    dest += 4;
    364    uv.x += 4 * srcDU;
    365  }
    366  if (span > 0) {
    367    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv));
    368    partial_store_span(dest, srcpx, span);
    369  }
    370 }
    371 
    372 template <bool COMPOSITE>
    373 static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
    374                            float srcDU, sampler2D sampler) {
    375  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
    376  for (; span >= 4; span -= 4) {
    377    auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv));
    378    unaligned_store(dest, srcpx);
    379    dest += 4;
    380    uv.x += 4 * srcDU;
    381  }
    382  if (span > 0) {
    383    auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv));
    384    partial_store_span(dest, srcpx, span);
    385  }
    386 }
    387 
// Linearly filtered blit from srcReq in srctex to dstReq in dsttex. Supports
// R8/RG8/RGBA8 sources, optional X and Y flips, and (when COMPOSITE) src-over
// blending of RGBA8 onto RGBA8. clipRect is relative to dstReq.
template <bool COMPOSITE = false>
static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq,
                                  Texture& dsttex, const IntRect& dstReq,
                                  bool invertX, bool invertY,
                                  const IntRect& clipRect) {
  assert(srctex.internal_format == GL_RGBA8 ||
         srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8);
  assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 &&
                        dsttex.internal_format == GL_RGBA8));
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq);
  dstBounds.intersect(clipRect);
  // Check if sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }
  // Initialize sampler for source texture
  sampler2D_impl sampler;
  init_sampler(&sampler, srctex);
  sampler.filter = TextureFilter::LINEAR;
  // Compute source UVs
  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
                     float(srcReq.height()) / dstReq.height());
  if (invertX) {
    // Advance to the end of the row and flip the step.
    srcUV.x += srcReq.width();
    srcDUV.x = -srcDUV.x;
  }
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcUV.y += srcReq.height();
    srcDUV.y = -srcDUV.y;
  }
  // Skip to clamped source start; the 0.5 offset samples at pixel centers.
  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
  // Scale UVs by lerp precision (7 bits of subpixel precision).
  srcUV = linearQuantize(srcUV, 128);
  srcDUV *= 128.0f;
  // Calculate dest pointer from clamped offsets
  int bpp = dsttex.bpp();
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    // Dispatch on destination pixel size; the row blitters pick the matching
    // sampling routine.
    switch (bpp) {
      case 1:
        linear_row_blit<COMPOSITE>((uint8_t*)dest, span, srcUV, srcDUV.x,
                                   &sampler);
        break;
      case 2:
        linear_row_blit<COMPOSITE>((uint16_t*)dest, span, srcUV, srcDUV.x,
                                   &sampler);
        break;
      case 4:
        linear_row_blit<COMPOSITE>((uint32_t*)dest, span, srcUV, srcDUV.x,
                                   &sampler);
        break;
      default:
        assert(false);
        break;
    }
    dest += destStride;
    srcUV.y += srcDUV.y;
  }
}
    454 
    455 // Whether the blit format is renderable.
    456 static inline bool is_renderable_format(GLenum format) {
    457  switch (format) {
    458    case GL_R8:
    459    case GL_RG8:
    460    case GL_RGBA8:
    461      return true;
    462    default:
    463      return false;
    464  }
    465 }
    466 
    467 extern "C" {
    468 
    469 void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    470                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
    471                     GLbitfield mask, GLenum filter) {
    472  assert(mask == GL_COLOR_BUFFER_BIT);
    473  Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
    474  if (!srcfb) return;
    475  Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
    476  if (!dstfb) return;
    477  Texture& srctex = ctx->textures[srcfb->color_attachment];
    478  if (!srctex.buf) return;
    479  Texture& dsttex = ctx->textures[dstfb->color_attachment];
    480  if (!dsttex.buf) return;
    481  assert(!dsttex.locked);
    482  if (srctex.internal_format != dsttex.internal_format &&
    483      (!is_renderable_format(srctex.internal_format) ||
    484       !is_renderable_format(dsttex.internal_format))) {
    485    // If the internal formats don't match, then we may have to convert. Require
    486    // that the format is a simple renderable format to limit combinatoric
    487    // explosion for now.
    488    assert(false);
    489    return;
    490  }
    491  // Force flipped Y onto dest coordinates
    492  if (srcY1 < srcY0) {
    493    swap(srcY0, srcY1);
    494    swap(dstY0, dstY1);
    495  }
    496  bool invertY = dstY1 < dstY0;
    497  if (invertY) {
    498    swap(dstY0, dstY1);
    499  }
    500  IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset;
    501  IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset;
    502  if (srcReq.is_empty() || dstReq.is_empty()) {
    503    return;
    504  }
    505  IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()};
    506  prepare_texture(srctex);
    507  prepare_texture(dsttex, &dstReq);
    508  if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR &&
    509      srctex.internal_format == dsttex.internal_format &&
    510      is_renderable_format(srctex.internal_format)) {
    511    linear_blit(srctex, srcReq, dsttex, dstReq, false, invertY, dstReq);
    512  } else {
    513    scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect);
    514  }
    515 }
    516 
    517 // Get the underlying buffer for a locked resource
    518 void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
    519                        int32_t* height, int32_t* stride) {
    520  *width = resource->width;
    521  *height = resource->height;
    522  *stride = resource->stride();
    523  return resource->buf;
    524 }
    525 
    526 // Extension for optimized compositing of textures or framebuffers that may be
    527 // safely used across threads. The source and destination must be locked to
    528 // ensure that they can be safely accessed while the SWGL context might be used
    529 // by another thread. Band extents along the Y axis may be used to clip the
    530 // destination rectangle without effecting the integer scaling ratios.
    531 void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX,
    532               GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
    533               GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
    534               GLboolean opaque, GLboolean flipX, GLboolean flipY,
    535               GLenum filter, GLint clipX, GLint clipY, GLsizei clipWidth,
    536               GLsizei clipHeight) {
    537  if (!lockedDst || !lockedSrc) {
    538    return;
    539  }
    540  Texture& srctex = *lockedSrc;
    541  Texture& dsttex = *lockedDst;
    542  assert(srctex.bpp() == 4);
    543  assert(dsttex.bpp() == 4);
    544 
    545  IntRect srcReq =
    546      IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset;
    547  IntRect dstReq =
    548      IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
    549  if (srcReq.is_empty() || dstReq.is_empty()) {
    550    return;
    551  }
    552 
    553  // Compute clip rect as relative to the dstReq, as that's the same coords
    554  // as used for the sampling bounds.
    555  IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
    556                      clipY - dstY + clipHeight};
    557  // Ensure we have rows of at least 2 pixels when using the linear filter to
    558  // avoid overreading the row. Force X flips onto the linear filter for now
    559  // until scale_blit supports it.
    560  bool useLinear =
    561      srctex.width >= 2 &&
    562      (flipX || (!srcReq.same_size(dstReq) && filter == GL_LINEAR));
    563 
    564  if (opaque) {
    565    if (useLinear) {
    566      linear_blit<false>(srctex, srcReq, dsttex, dstReq, flipX, flipY,
    567                         clipRect);
    568    } else {
    569      scale_blit<false>(srctex, srcReq, dsttex, dstReq, flipY, clipRect);
    570    }
    571  } else {
    572    if (useLinear) {
    573      linear_blit<true>(srctex, srcReq, dsttex, dstReq, flipX, flipY, clipRect);
    574    } else {
    575      scale_blit<true>(srctex, srcReq, dsttex, dstReq, flipY, clipRect);
    576    }
    577  }
    578 }
    579 
    580 // Extension used by the SWGL compositor to apply an alpha mask
    581 // to a texture. The textures must be the same size. The mask
    582 // must be R8, the texture must be RGBA8.
    583 void ApplyMask(LockedTexture* lockedDst, LockedTexture* lockedMask) {
    584  assert(lockedDst);
    585  assert(lockedMask);
    586 
    587  Texture& masktex = *lockedMask;
    588  Texture& dsttex = *lockedDst;
    589 
    590  assert(masktex.bpp() == 1);
    591  assert(dsttex.bpp() == 4);
    592 
    593  assert(masktex.width == dsttex.width);
    594  assert(masktex.height == dsttex.height);
    595 
    596  mask_blit(masktex, dsttex);
    597 }
    598 
    599 }  // extern "C"
    600 
// Saturated add helper for YUV conversion. Supported platforms have intrinsics
// to do this natively, but support a slower generic fallback just in case.
static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) {
#if USE_SSE2
  return _mm_adds_epi16(x, y);
#elif USE_NEON
  return vqaddq_s16(x, y);
#else
  auto r = x + y;
  // An overflow occurred if the signs of both inputs x and y did not differ
  // but yet the sign of the result did differ. The arithmetic shift by 15
  // smears the sign bit so each lane becomes an all-ones or all-zeros mask.
  auto overflow = (~(x ^ y) & (r ^ x)) >> 15;
  // If there was an overflow, we need to choose the appropriate limit to clamp
  // to depending on whether or not the inputs are negative: 0x7FFF for
  // positive overflow, 0x8000 for negative.
  auto limit = (x >> 15) ^ 0x7FFF;
  // If we didn't overflow, just use the result, and otherwise, use the limit.
  return (~overflow & r) | (overflow & limit);
#endif
}
    620 
    621 // Interleave and packing helper for YUV conversion. During transform by the
    622 // color matrix, the color components are de-interleaved as this format is
    623 // usually what comes out of the planar YUV textures. The components thus need
    624 // to be interleaved before finally getting packed to BGRA format. Alpha is
    625 // forced to be opaque.
    626 static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) {
    627  return pack(bit_cast<WideRGBA8>(zip(br, gg))) |
    628         PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
    629 }
    630 
    631 // clang-format off
    632 // Supports YUV color matrixes of the form:
    633 // [R]   [1.1643835616438356,  0.0,  rv ]   [Y -  16]
    634 // [G] = [1.1643835616438358, -gu,  -gv ] x [U - 128]
    635 // [B]   [1.1643835616438356,  bu,  0.0 ]   [V - 128]
    636 // We must be able to multiply a YUV input by a matrix coefficient ranging as
    637 // high as ~2.2 in the U/V cases, where U/V can be signed values between -128
    638 // and 127. The largest fixed-point representation we can thus support without
    639 // overflowing 16 bit integers leaves us 6 bits of fractional precision while
    640 // also supporting a sign bit. The closest representation of the Y coefficient
    641 // ~1.164 in this precision is 74.5/2^6 which is common to all color spaces
    642 // we support. Conversions can still sometimes overflow the precision and
    643 // require clamping back into range, so we use saturated additions to do this
    644 // efficiently at no extra cost.
    645 // clang-format on
    646 struct YUVMatrix {
    647  // These constants are loaded off the "this" pointer via relative addressing
    648  // modes and should be about as quick to load as directly addressed SIMD
    649  // constant memory.
    650 
    651  V8<int16_t> br_uvCoeffs;  // biased by 6 bits [b_from_u, r_from_v, repeats]
    652  V8<int16_t> gg_uvCoeffs;  // biased by 6 bits [g_from_u, g_from_v, repeats]
    653  V8<uint16_t> yCoeffs;     // biased by 7 bits
    654  V8<int16_t> yBias;        // 0 or 16
    655  V8<int16_t> uvBias;       // 128
    656  V8<int16_t> br_yMask;
    657 
  // E.g. rec709-narrow:
  // [ 1.16,     0,  1.79, -0.97 ]
  // [ 1.16, -0.21, -0.53,  0.30 ]
  // [ 1.16,  2.11,     0, -1.13 ]
  // =
  // [ yScale,        0, r_from_v ]   ([Y ]              )
  // [ yScale, g_from_u, g_from_v ] x ([cb] - ycbcr_bias )
  // [ yScale, b_from_u,        0 ]   ([cr]              )
  //
  // Validates the sparsity pattern of the supplied column-major matrix and
  // extracts the five non-trivial coefficients for the fixed-point ctor.
  static YUVMatrix From(const vec3_scalar& ycbcr_bias,
                        const mat3_scalar& rgb_from_debiased_ycbcr,
                        int rescale_factor = 0) {
    // Cb and Cr must share the same bias (typically 128/255 for video range).
    assert(ycbcr_bias.z == ycbcr_bias.y);

    const auto rgb_from_y = rgb_from_debiased_ycbcr[0].y;
    assert(rgb_from_debiased_ycbcr[0].x == rgb_from_debiased_ycbcr[0].z);

    // When the first column is zero, this is the GBR identity arrangement and
    // the Y coefficient must not feed the B/R channels; the mask disables it.
    int16_t br_from_y_mask = -1;
    if (rgb_from_debiased_ycbcr[0].x == 0.0) {
      // gbr-identity matrix?
      assert(rgb_from_debiased_ycbcr[0].x == 0);
      assert(rgb_from_debiased_ycbcr[0].y >= 1);
      assert(rgb_from_debiased_ycbcr[0].z == 0);

      assert(rgb_from_debiased_ycbcr[1].x == 0);
      assert(rgb_from_debiased_ycbcr[1].y == 0);
      assert(rgb_from_debiased_ycbcr[1].z >= 1);

      assert(rgb_from_debiased_ycbcr[2].x >= 1);
      assert(rgb_from_debiased_ycbcr[2].y == 0);
      assert(rgb_from_debiased_ycbcr[2].z == 0);

      assert(ycbcr_bias.x == 0);
      assert(ycbcr_bias.y == 0);
      assert(ycbcr_bias.z == 0);

      br_from_y_mask = 0;
    } else {
      assert(rgb_from_debiased_ycbcr[0].x == rgb_from_y);
    }

    // Column 1 (from Cb): only G and B may have non-zero coefficients.
    assert(rgb_from_debiased_ycbcr[1].x == 0.0);
    const auto g_from_u = rgb_from_debiased_ycbcr[1].y;
    const auto b_from_u = rgb_from_debiased_ycbcr[1].z;

    // Column 2 (from Cr): only R and G may have non-zero coefficients.
    const auto r_from_v = rgb_from_debiased_ycbcr[2].x;
    const auto g_from_v = rgb_from_debiased_ycbcr[2].y;
    assert(rgb_from_debiased_ycbcr[2].z == 0.0);

    return YUVMatrix({ycbcr_bias.x, ycbcr_bias.y}, rgb_from_y, br_from_y_mask,
                     r_from_v, g_from_u, g_from_v, b_from_u, rescale_factor);
  }
    709 
  // Convert matrix coefficients to fixed-point representation. If the matrix
  // has a rescaling applied to it, then we need to take care to undo the
  // scaling so that we can convert the coefficients to fixed-point range. The
  // bias still requires shifting to apply the rescaling. The rescaling will be
  // applied to the actual YCbCr sample data later by manually shifting it
  // before applying this matrix.
  //
  // Coefficients are stored with 6 fractional bits (7 for yCoeffs, which gets
  // an extra bit of precision and is applied via an unsigned multiply).
  YUVMatrix(vec2_scalar yuv_bias, double yCoeff, int16_t br_yMask_, double rv,
            double gu, double gv, double bu, int rescale_factor = 0)
      : br_uvCoeffs(zip(I16(int16_t(bu * (1 << (6 - rescale_factor)) + 0.5)),
                        I16(int16_t(rv * (1 << (6 - rescale_factor)) + 0.5)))),
        gg_uvCoeffs(
            zip(I16(-int16_t(-gu * (1 << (6 - rescale_factor)) +
                             0.5)),  // These are negative coeffs, so
                                     // round them away from zero
                I16(-int16_t(-gv * (1 << (6 - rescale_factor)) + 0.5)))),
        yCoeffs(uint16_t(yCoeff * (1 << (6 + 1 - rescale_factor)) + 0.5)),
        // We have a +0.5 fudge-factor for -ybias.
        // Without this, we get white=254 not 255.
        // This approximates rounding rather than truncation during `gg >>= 6`.
        yBias(int16_t(((yuv_bias.x * 255 * yCoeff) - 0.5) * (1 << 6))),
        uvBias(int16_t(yuv_bias.y * (255 << rescale_factor) + 0.5)),
        br_yMask(br_yMask_) {
    // Sanity-check the coefficient signs assumed by the rounding above.
    assert(yuv_bias.x >= 0);
    assert(yuv_bias.y >= 0);
    assert(yCoeff > 0);
    assert(br_yMask_ == 0 || br_yMask_ == -1);
    assert(bu > 0);
    assert(rv > 0);
    assert(gu <= 0);
    assert(gv <= 0);
    assert(rescale_factor <= 6);
  }
    742 
  // Transform 4 Y samples (duplicated into adjacent pairs) and 4 interleaved
  // U/V sample pairs into packed opaque BGRA pixels using the fixed-point
  // coefficients prepared by the constructor.
  ALWAYS_INLINE PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) const {
    // We gave ourselves an extra bit (7 instead of 6) of bias to give us some
    // extra precision for the more-sensitive y scaling.
    // Note that we have to use an unsigned multiply with a 2x scale to
    // represent a fractional scale and to avoid shifting with the sign bit.

    // Note: if you subtract the bias before multiplication, we see more
    // underflows. This could be fixed by an unsigned subsat.
    yy = bit_cast<V8<int16_t>>((bit_cast<V8<uint16_t>>(yy) * yCoeffs) >> 1);
    yy -= yBias;

    // Compute [B] = [yCoeff*Y + bu*U +  0*V]
    //         [R]   [yCoeff*Y +  0*U + rv*V]
    uv -= uvBias;
    auto br = br_uvCoeffs * uv;
    // br_yMask zeroes out the Y term entirely when the matrix doesn't use it.
    br = addsat(yy & br_yMask, br);
    br >>= 6;

    // Compute G = yCoeff*Y + gu*U + gv*V
    // First calc [gu*U, gv*V, ...]:
    auto gg = gg_uvCoeffs * uv;
    // Then cross the streams to get `gu*U + gv*V`:
    gg = addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16));
    // Add the other parts:
    gg = addsat(yy, gg);  // This is the part that needs the most headroom
                          // usually. In particular, ycbcr(255,255,255) hugely
                          // saturates.
    gg >>= 6;

    // Interleave B/R and G values. Force alpha (high-gg half) to opaque.
    return packYUV(gg, br);
  }
    775 };
    776 
// Helper function for textureLinearRowR8 that samples horizontal taps and
// combines them based on Y fraction with next row.
//
// ix holds the integer x offsets of 4 sample positions within the row; at
// each position a pair of adjacent texels is loaded so the caller can do the
// horizontal blend afterwards. fracy is a 7-bit fraction used to lerp between
// the row at offsety and the row at offsety + stridey (stridey may be 0 at
// the vertical edge, degenerating to a single-row sample).
template <typename S>
static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
                                                 int32_t offsety,
                                                 int32_t stridey,
                                                 int16_t fracy) {
  uint8_t* buf = (uint8_t*)sampler->buf + offsety;
  // Load adjacent texel pairs at each of the 4 positions on the first row
  // and widen to 16 bits for the blend math.
  auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);
  // Load the same taps from the next row.
  buf += stridey;
  auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);
  // Vertical lerp: row0 + (row1 - row0) * fracy, with a 7-bit fraction.
  abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
  return abcd0;
}
    799 
// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes
// constant Y and returns a duplicate of the result interleaved with itself
// to aid in later YUV transformation.
template <typename S>
static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety,
                                             int32_t stridey, int16_t fracy) {
  assert(sampler->format == TextureFormat::R8);

  // Calculate X fraction and clamp X offset into range.
  // ix arrives with 7 bits of fractional precision: the low 7 bits are the
  // X lerp fraction, the high bits the integer texel coordinate.
  I32 fracx = ix;
  ix >>= 7;
  // Force the fraction to 0 when sampling off the left edge (ix < 0), and to
  // the max (0x7F) when the second tap would fall past the last texel, so
  // the lerp collapses (approximately) onto the clamped in-range tap.
  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
  ix = clampCoord(ix, sampler->width - 1);

  // Load the sample taps and combine rows.
  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);

  // Unzip the result and do final horizontal multiply-add base on X fraction.
  auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6);
  auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7);
  abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7;

  // The final result is the packed values interleaved with a duplicate of
  // themselves.
  return abcdl;
}
    826 
// Optimized version of textureLinearPackedR8 for paired U/V R8 textures.
// Since the two textures have the same dimensions and stride, the addressing
// math can be shared between both samplers. This also allows a coalesced
// multiply in the final stage by packing both U/V results into a single
// operation.
template <typename S>
static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
                                                   I32 ix, int32_t offsety,
                                                   int32_t stridey,
                                                   int16_t fracy) {
  // The shared addressing below is only valid if both planes really do have
  // identical layout.
  assert(sampler->format == TextureFormat::R8 &&
         sampler2->format == TextureFormat::R8);
  assert(sampler->width == sampler2->width &&
         sampler->height == sampler2->height);
  assert(sampler->stride == sampler2->stride);

  // Calculate X fraction and clamp X offset into range.
  // Same edge handling as textureLinearRowR8: zero the fraction off the left
  // edge, max it out when the right tap would overrun the row.
  I32 fracx = ix;
  ix >>= 7;
  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
  ix = clampCoord(ix, sampler->width - 1);

  // Load the sample taps for the first sampler and combine rows.
  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);

  // Load the sample taps for the second sampler and combine rows.
  auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy);

  // We are left with a result vector for each sampler with values for adjacent
  // pixels interleaved together in each. We need to unzip these values so that
  // we can do the final horizontal multiply-add based on the X fraction.
  auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14);
  auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15);
  abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7;

  // The final result is the packed values for the first sampler interleaved
  // with the packed values for the second sampler.
  return abcdxyzwl;
}
    866 
// Casting to int loses some precision while stepping that can offset the
// image, so shift the values by some extra bits of precision to minimize
// this. We support up to 16 bits of image size, 7 bits of quantization,
// and 1 bit for sign, which leaves 8 bits left for extra precision.
// Interpolated X coordinates below are thus fixed-point values carrying
// STEP_BITS + 7 fractional bits.
const int STEP_BITS = 8;
    872 
// Optimized version of textureLinearPackedR8 for Y R8 texture with
// half-resolution paired U/V R8 textures. This allows us to more efficiently
// pack YUV samples into vectors to substantially reduce math operations even
// further.
//
// Processes `span` destination pixels (a multiple of 4) starting at `dest`,
// reading Y from yRow and chroma from cRow1/cRow2. yU/cU are fixed-point X
// coordinates with STEP_BITS+7 fractional bits, stepped by yDU/cDU per
// 4-pixel chunk; the *StrideV/*FracV pairs drive the vertical lerp. The
// caller guarantees that all taps stay within the rows (see linear_row_yuv).
template <bool BLEND>
static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow,
                                  I32 yU, int32_t yDU, int32_t yStrideV,
                                  int16_t yFracV, uint8_t* cRow1,
                                  uint8_t* cRow2, I32 cU, int32_t cDU,
                                  int32_t cStrideV, int16_t cFracV,
                                  const YUVMatrix& colorSpace) {
  // As much as possible try to utilize the fact that we're only using half
  // the UV samples to combine Y and UV samples into single vectors. Here we
  // need to initialize several useful vector quantities for stepping fractional
  // offsets. For the UV samples, we take the average of the first+second and
  // third+fourth samples in a chunk which conceptually correspond to offsets
  // 0.5 and 1.5 (in 0..2 range). This allows us to reconstruct intermediate
  // samples 0.25, 0.75, 1.25, and 1.75 later. X fraction is shifted over into
  // the top 7 bits of an unsigned short so that we can mask off the exact
  // fractional bits we need to blend merely by right shifting them into
  // position.
  cU = (cU.xzxz + cU.ywyw) >> 1;
  auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>)
                 << (16 - (STEP_BITS + 7));
  auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7));
  auto ycFracV = combine(I16(yFracV), I16(cFracV));
  // Integer texel offsets of the current chunk on the Y and chroma rows.
  I32 yI = yU >> (STEP_BITS + 7);
  I32 cI = cU >> (STEP_BITS + 7);
  // Load initial combined YUV samples for each row and blend them.
  auto ycSrc0 =
      CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]),
                      combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]),
                              unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))),
              V8<int16_t>);
  auto ycSrc1 = CONVERT(
      combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]),
              combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]),
                      unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))),
      V8<int16_t>);
  auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7);

  // Here we shift in results from the next sample while caching results from
  // the previous sample. This allows us to reduce the multiplications in the
  // inner loop down to only two since we just need to blend the new samples
  // horizontally and then vertically once each.
  for (uint32_t* end = dest + span; dest < end; dest += 4) {
    yU += yDU;
    I32 yIn = yU >> (STEP_BITS + 7);
    cU += cDU;
    I32 cIn = cU >> (STEP_BITS + 7);
    // Load combined YUV samples for the next chunk on each row and blend them.
    auto ycSrc0n =
        CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]),
                        combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]),
                                unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))),
                V8<int16_t>);
    auto ycSrc1n = CONVERT(
        combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]),
                combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]),
                        unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))),
        V8<int16_t>);
    auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7);

    // The source samples for the chunk may not match the actual tap offsets.
    // Since we're upscaling, we know the tap offsets fall within all the
    // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar,
    // instead we do laborious shuffling here for the Y samples and then the UV
    // samples.
    auto yshuf = lowHalf(ycSrc);
    auto yshufn =
        SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn),
                1, 2, 3, 4);
    // Duplicate taps wherever adjacent integer coordinates repeat (upscale).
    if (yI.y == yI.x) {
      yshuf = yshuf.xxyz;
      yshufn = yshufn.xxyz;
    }
    if (yI.z == yI.y) {
      yshuf = yshuf.xyyz;
      yshufn = yshufn.xyyz;
    }
    if (yI.w == yI.z) {
      yshuf = yshuf.xyzz;
      yshufn = yshufn.xyzz;
    }

    auto cshuf = highHalf(ycSrc);
    auto cshufn =
        SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn),
                1, 4, 3, 6);
    if (cI.y == cI.x) {
      cshuf = cshuf.xxzz;
      cshufn = cshufn.xxzz;
    }

    // After shuffling, combine the Y and UV samples back into a single vector
    // for blending. Shift X fraction into position as unsigned to mask off top
    // bits and get rid of low bits to avoid multiplication overflow.
    auto yuvPx = combine(yshuf, cshuf);
    yuvPx += ((combine(yshufn, cshufn) - yuvPx) *
              bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >>
             7;

    // Cache the new samples as the current samples on the next iteration.
    ycSrc = ycSrcn;
    ycFracX += ycFracDX;
    yI = yIn;
    cI = cIn;

    // De-interleave the Y and UV results. We need to average the UV results
    // to produce values for intermediate samples. Taps for UV were collected at
    // offsets 0.5 and 1.5, such that if we take a quarter of the difference
    // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples,
    // we can estimate samples 0.25, 0.75, 1.25, and 1.75.
    auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3);
    auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) +
                ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) -
                  SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >>
                 2);

    commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
  }
}
    995 
// This is the inner loop driver of CompositeYUV that processes an axis-aligned
// YUV span, dispatching based on appropriate format and scaling. This is also
// reused by blendYUV to accelerate some cases of texture sampling in the
// shader.
//
// srcUV/chromaUV are the starting sample coordinates for the luma and chroma
// planes, pre-quantized by the caller to 7 bits of fraction when linear
// filtering is possible; srcDU/chromaDU are the per-pixel X steps in the same
// fixed-point scale. colorDepth selects the 8-bit (R8) or HDR (R16) path.
template <bool BLEND = false>
static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY,
                           const vec2_scalar& srcUV, float srcDU,
                           sampler2DRect samplerU, sampler2DRect samplerV,
                           const vec2_scalar& chromaUV, float chromaDU,
                           int colorDepth, const YUVMatrix& colorSpace) {
  // Calculate varying and constant interp data for Y plane.
  I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
  int32_t yV = int32_t(srcUV.y);

  // Calculate varying and constant interp data for chroma planes.
  I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS));
  int32_t cV = int32_t(chromaUV.y);

  // We need to skip 4 pixels per chunk.
  int32_t yDU = int32_t((4 << STEP_BITS) * srcDU);
  int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU);

  if (samplerY->width < 2 || samplerU->width < 2) {
    // If the source row has less than 2 pixels, it's not safe to use a linear
    // filter because it may overread the row. Just convert the single pixel
    // with nearest filtering and fill the row with it.
    Float yuvF = {texelFetch(samplerY, ivec2(srcUV)).x.x,
                  texelFetch(samplerU, ivec2(chromaUV)).x.x,
                  texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f};
    // If this is an HDR LSB format, we need to renormalize the result.
    if (colorDepth > 8) {
      int rescaleFactor = 16 - colorDepth;
      yuvF *= float(1 << rescaleFactor);
    }
    I16 yuv = CONVERT(round_pixel(yuvF), I16);
    // Broadcast the single converted pixel across the whole span.
    commit_solid_span<BLEND>(
        dest,
        unpack(colorSpace.convert(V8<int16_t>(yuv.x),
                                  zip(I16(yuv.y), I16(yuv.z)))),
        span);
  } else if (samplerY->format == TextureFormat::R16) {
    // Sample each YUV plane, rescale it to fit in low 8 bits of word, and
    // then transform them by the appropriate color space.
    assert(colorDepth > 8);
    // Need to right shift the sample by the amount of bits over 8 it
    // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
    // of precision at the low end already, hence 1 is subtracted from the
    // color depth.
    int rescaleBits = (colorDepth - 1) - 8;
    for (; span >= 4; span -= 4) {
      auto yPx =
          textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
          rescaleBits;
      auto uPx =
          textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      auto vPx =
          textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      commit_blend_span<BLEND>(
          dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)));
      dest += 4;
      yU += yDU;
      cU += cDU;
    }
    if (span > 0) {
      // Handle any remaining pixels... same as above, but with a partial
      // span length so only the valid tail is written.
      auto yPx =
          textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
          rescaleBits;
      auto uPx =
          textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      auto vPx =
          textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      commit_blend_span<BLEND>(
          dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span);
    }
  } else {
    assert(samplerY->format == TextureFormat::R8);
    assert(colorDepth == 8);

    // Calculate varying and constant interp data for Y plane.
    // Split the Y coordinate into a 7-bit vertical fraction and an integer
    // row; the row stride is zeroed at the vertical edges so the second tap
    // re-reads the same row instead of overrunning the texture.
    int16_t yFracV = yV & 0x7F;
    yV >>= 7;
    int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride;
    int32_t yStrideV =
        yV >= 0 && yV < int32_t(samplerY->height) - 1 ? samplerY->stride : 0;

    // Calculate varying and constant interp data for chroma planes.
    int16_t cFracV = cV & 0x7F;
    cV >>= 7;
    int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride;
    int32_t cStrideV =
        cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0;

    // If we're sampling the UV planes at half the resolution of the Y plane,
    // then try to use half resolution fast-path.
    if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) &&
        cDU <= (2 << (STEP_BITS + 7))) {
      // Ensure that samples don't fall outside of the valid bounds of each
      // planar texture. Step until the initial X coordinates are positive.
      for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) {
        auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
                                      yStrideV, yFracV);
        auto uvPx = textureLinearRowPairedR8(
            samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV);
        commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
        dest += 4;
        yU += yDU;
        cU += cDU;
      }
      // Calculate the number of aligned chunks that we can step inside the
      // bounds of each planar texture without overreading.
      int inside = min(
          min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU,
              (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) *
              4,
          span & ~3);
      if (inside > 0) {
        // The fast path reads rows directly, so resolve the row pointers
        // here once instead of per sample.
        uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV;
        uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV;
        uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV;
        upscaleYUV42R8<BLEND>(dest, inside, yRow, yU, yDU, yStrideV, yFracV,
                              cRow1, cRow2, cU, cDU, cStrideV, cFracV,
                              colorSpace);
        span -= inside;
        dest += inside;
        yU += (inside / 4) * yDU;
        cU += (inside / 4) * cDU;
      }
      // If there are any remaining chunks that weren't inside, handle them
      // below.
    }
    for (; span >= 4; span -= 4) {
      // Sample each YUV plane and then transform them by the appropriate
      // color space.
      auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
                                    yStrideV, yFracV);
      auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
                                           cOffsetV, cStrideV, cFracV);
      commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
      dest += 4;
      yU += yDU;
      cU += cDU;
    }
    if (span > 0) {
      // Handle any remaining pixels...
      auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
                                    yStrideV, yFracV);
      auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
                                           cOffsetV, cStrideV, cFracV);
      commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx), span);
    }
  }
}
   1153 
// Sets up per-plane sampling state (UV start/step, flips, chroma scaling,
// lerp quantization) for converting the srcReq region of the YUV planes into
// the dstReq region of the BGRA destination, then runs linear_row_yuv over
// each destination row inside the clip.
static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex,
                               const YUVMatrix& rgbFromYcbcr, int colorDepth,
                               const IntRect& srcReq, Texture& dsttex,
                               const IntRect& dstReq, bool invertX,
                               bool invertY, const IntRect& clipRect) {
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq);
  dstBounds.intersect(clipRect);
  // Check if sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }
  // Initialize samplers for source textures
  sampler2DRect_impl sampler[3];
  init_sampler(&sampler[0], ytex);
  init_sampler(&sampler[1], utex);
  init_sampler(&sampler[2], vtex);

  // Compute source UVs
  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
                     float(srcReq.height()) / dstReq.height());
  if (invertX) {
    // Advance to the end of the row and flip the step.
    srcUV.x += srcReq.width();
    srcDUV.x = -srcDUV.x;
  }
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcUV.y += srcReq.height();
    srcDUV.y = -srcDUV.y;
  }
  // Skip to clamped source start. The +0.5 centers sampling on each dest
  // pixel.
  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
  // Calculate separate chroma UVs for chroma planes with different scale
  // (e.g. half-size U/V planes for 4:2:0 content).
  vec2_scalar chromaScale(float(utex.width) / ytex.width,
                          float(utex.height) / ytex.height);
  vec2_scalar chromaUV = srcUV * chromaScale;
  vec2_scalar chromaDUV = srcDUV * chromaScale;
  // Scale UVs by lerp precision (7 fraction bits => 128). If the row has only
  // 1 pixel, then don't quantize so that we can use nearest filtering instead
  // to avoid overreads.
  if (ytex.width >= 2 && utex.width >= 2) {
    srcUV = linearQuantize(srcUV, 128);
    srcDUV *= 128.0f;
    chromaUV = linearQuantize(chromaUV, 128);
    chromaDUV *= 128.0f;
  }
  // Calculate dest pointer from clamped offsets
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x,
                   &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth,
                   rgbFromYcbcr);
    dest += destStride;
    srcUV.y += srcDUV.y;
    chromaUV.y += chromaDUV.y;
  }
}
   1214 
   1215 // -
   1216 // This section must match gfx/2d/Types.h
   1217 
// YUV color space plus range, as a single enum. Numeric values must stay in
// sync with the YUVRangedColorSpace definition in gfx/2d/Types.h.
enum class YUVRangedColorSpace : uint8_t {
  BT601_Narrow = 0,  // Narrow ("studio") range: luma 16..235, chroma 16..240.
  BT601_Full,        // Full range: the whole code range is used.
  BT709_Narrow,
  BT709_Full,
  BT2020_Narrow,
  BT2020_Full,
  GbrIdentity,  // Planes are already RGB; identity mapping, no range scaling.
};
   1227 
   1228 // -
   1229 // This section must match yuv.glsl
   1230 
   1231 vec4_scalar get_ycbcr_zeros_ones(const YUVRangedColorSpace color_space,
   1232                                 const GLuint color_depth) {
   1233  // For SWGL's 8bpc-only pipeline, our extra care here probably doesn't matter.
   1234  // However, technically e.g. 10-bit achromatic zero for cb and cr is
   1235  // (128 << 2) / ((1 << 10) - 1) = 512 / 1023, which != 128 / 255, and affects
   1236  // our matrix values subtly. Maybe not enough to matter? But it's the most
   1237  // correct thing to do.
   1238  // Unlike the glsl version, our texture samples are u8([0,255]) not
   1239  // u16([0,1023]) though.
   1240  switch (color_space) {
   1241    case YUVRangedColorSpace::BT601_Narrow:
   1242    case YUVRangedColorSpace::BT709_Narrow:
   1243    case YUVRangedColorSpace::BT2020_Narrow: {
   1244      auto extra_bit_count = color_depth - 8;
   1245      vec4_scalar zo = {
   1246          float(16 << extra_bit_count),
   1247          float(128 << extra_bit_count),
   1248          float(235 << extra_bit_count),
   1249          float(240 << extra_bit_count),
   1250      };
   1251      float all_bits = (1 << color_depth) - 1;
   1252      zo /= all_bits;
   1253      return zo;
   1254    }
   1255 
   1256    case YUVRangedColorSpace::BT601_Full:
   1257    case YUVRangedColorSpace::BT709_Full:
   1258    case YUVRangedColorSpace::BT2020_Full: {
   1259      const auto narrow =
   1260          get_ycbcr_zeros_ones(YUVRangedColorSpace::BT601_Narrow, color_depth);
   1261      return {0.0, narrow.y, 1.0, 1.0};
   1262    }
   1263 
   1264    case YUVRangedColorSpace::GbrIdentity:
   1265      break;
   1266  }
   1267  return {0.0, 0.0, 1.0, 1.0};
   1268 }
   1269 
// RGB-from-YUV conversion matrices for each supported set of primaries.
// NOTE(review): these appear to be the standard BT matrices with the chroma
// terms at half the textbook magnitude (e.g. 0.701 vs the usual 1.402 for
// Rec601 R-from-V), presumably compensated by the chroma scale folded in by
// get_ycbcr_info below — confirm against yuv.glsl, which this section must
// match.
constexpr mat3_scalar RgbFromYuv_Rec601 = {
    {1.00000, 1.00000, 1.00000},
    {0.00000, -0.17207, 0.88600},
    {0.70100, -0.35707, 0.00000},
};
constexpr mat3_scalar RgbFromYuv_Rec709 = {
    {1.00000, 1.00000, 1.00000},
    {0.00000, -0.09366, 0.92780},
    {0.78740, -0.23406, 0.00000},
};
constexpr mat3_scalar RgbFromYuv_Rec2020 = {
    {1.00000, 1.00000, 1.00000},
    {0.00000, -0.08228, 0.94070},
    {0.73730, -0.28568, 0.00000},
};
// Identity mapping for content whose planes are already G, B, R.
constexpr mat3_scalar RgbFromYuv_GbrIdentity = {
    {0, 1, 0},
    {0, 0, 1},
    {1, 0, 0},
};
   1290 
   1291 inline mat3_scalar get_rgb_from_yuv(const YUVRangedColorSpace color_space) {
   1292  switch (color_space) {
   1293    case YUVRangedColorSpace::BT601_Narrow:
   1294    case YUVRangedColorSpace::BT601_Full:
   1295      return RgbFromYuv_Rec601;
   1296    case YUVRangedColorSpace::BT709_Narrow:
   1297    case YUVRangedColorSpace::BT709_Full:
   1298      return RgbFromYuv_Rec709;
   1299    case YUVRangedColorSpace::BT2020_Narrow:
   1300    case YUVRangedColorSpace::BT2020_Full:
   1301      return RgbFromYuv_Rec2020;
   1302    case YUVRangedColorSpace::GbrIdentity:
   1303      break;
   1304  }
   1305  return RgbFromYuv_GbrIdentity;
   1306 }
   1307 
// Bias/matrix pair describing a complete YCbCr -> RGB transform: subtract
// ycbcr_bias from the raw samples, then multiply by rgb_from_debiased_ycbcr.
struct YcbcrInfo final {
  vec3_scalar ycbcr_bias;            // Per-channel zero levels (Y, Cb, Cr).
  mat3_scalar rgb_from_debiased_ycbcr;  // Range scaling folded into the matrix.
};
   1312 
   1313 inline YcbcrInfo get_ycbcr_info(const YUVRangedColorSpace color_space,
   1314                                GLuint color_depth) {
   1315  // SWGL always does 8bpc math, so don't scale the matrix for 10bpc!
   1316  color_depth = 8;
   1317 
   1318  const auto zeros_ones = get_ycbcr_zeros_ones(color_space, color_depth);
   1319  const auto zeros = vec2_scalar{zeros_ones.x, zeros_ones.y};
   1320  const auto ones = vec2_scalar{zeros_ones.z, zeros_ones.w};
   1321  const auto scale = 1.0f / (ones - zeros);
   1322 
   1323  const auto rgb_from_yuv = get_rgb_from_yuv(color_space);
   1324  const mat3_scalar yuv_from_debiased_ycbcr = {
   1325      {scale.x, 0, 0},
   1326      {0, scale.y, 0},
   1327      {0, 0, scale.y},
   1328  };
   1329 
   1330  YcbcrInfo ret;
   1331  ret.ycbcr_bias = {zeros.x, zeros.y, zeros.y};
   1332  ret.rgb_from_debiased_ycbcr = rgb_from_yuv * yuv_from_debiased_ycbcr;
   1333  return ret;
   1334 }
   1335 
   1336 // -
   1337 
   1338 extern "C" {
   1339 
// Extension for compositing a YUV surface represented by separate YUV planes
// to a BGRA destination. The supplied color space is used to determine the
// transform from YUV to BGRA after sampling.
//
// src* select the region of the Y plane to sample (chroma coordinates are
// derived from the plane-size ratio); dst* select the destination region;
// flipX/flipY mirror the source; clip* is a destination-space scissor.
void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY,
                  LockedTexture* lockedU, LockedTexture* lockedV,
                  YUVRangedColorSpace colorSpace, GLuint colorDepth, GLint srcX,
                  GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
                  GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
                  GLboolean flipX, GLboolean flipY, GLint clipX, GLint clipY,
                  GLsizei clipWidth, GLsizei clipHeight) {
  // All four textures must be supplied; otherwise silently no-op.
  if (!lockedDst || !lockedY || !lockedU || !lockedV) {
    return;
  }
  // Reject color space values beyond the known enum range.
  if (colorSpace > YUVRangedColorSpace::GbrIdentity) {
    assert(false);
    return;
  }
  // Precompute the fixed-point YCbCr -> RGB transform for this color space.
  const auto ycbcrInfo = get_ycbcr_info(colorSpace, colorDepth);
  const auto rgbFromYcbcr =
      YUVMatrix::From(ycbcrInfo.ycbcr_bias, ycbcrInfo.rgb_from_debiased_ycbcr);

  Texture& ytex = *lockedY;
  Texture& utex = *lockedU;
  Texture& vtex = *lockedV;
  Texture& dsttex = *lockedDst;
  // All YUV planes must currently be represented by R8 or R16 textures.
  // The chroma (U/V) planes must have matching dimensions.
  assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp());
  assert((ytex.bpp() == 1 && colorDepth == 8) ||
         (ytex.bpp() == 2 && colorDepth > 8));
  // assert(ytex.width == utex.width && ytex.height == utex.height);
  assert(utex.width == vtex.width && utex.height == vtex.height);
  assert(ytex.offset == utex.offset && ytex.offset == vtex.offset);
  assert(dsttex.bpp() == 4);

  // Translate the supplied rects into texture-local coordinates.
  IntRect srcReq =
      IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset;
  IntRect dstReq =
      IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
  if (srcReq.is_empty() || dstReq.is_empty()) {
    return;
  }

  // Compute clip rect as relative to the dstReq, as that's the same coords
  // as used for the sampling bounds.
  IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
                      clipY - dstY + clipHeight};
  // For now, always use a linear filter path that would be required for
  // scaling. Further fast-paths for non-scaled video might be desirable in the
  // future.
  linear_convert_yuv(ytex, utex, vtex, rgbFromYcbcr, colorDepth, srcReq, dsttex,
                     dstReq, flipX, flipY, clipRect);
}
   1393 
   1394 }  // extern "C"