composite.h (54346B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// Converts a pixel from a source format to a destination format. By default,
// just return the value unchanged as for a simple copy.
template <typename P, typename U>
static ALWAYS_INLINE P convert_pixel(U src) {
  return src;
}

// R8 format maps to BGRA value 0,0,R,1. The byte order is endian independent,
// but the shifts unfortunately depend on endianness.
template <>
ALWAYS_INLINE uint32_t convert_pixel<uint32_t>(uint8_t src) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Memory order B,G,R,A: R lands in bits 16..23 and alpha in the top byte.
  return (uint32_t(src) << 16) | 0xFF000000;
#else
  // Same B,G,R,A memory order read big-endian: R is bits 8..15, alpha low.
  return (uint32_t(src) << 8) | 0x000000FF;
#endif
}

// RG8 format maps to BGRA value 0,G,R,1.
template <>
ALWAYS_INLINE uint32_t convert_pixel<uint32_t>(uint16_t src) {
  uint32_t rg = src;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  // Low byte is R (moves to bits 16..23); G already sits in bits 8..15.
  return ((rg & 0x00FF) << 16) | (rg & 0xFF00) | 0xFF000000;
#else
  // High byte is R (stays at bits 8..15); G moves up to bits 16..23.
  return (rg & 0xFF00) | ((rg & 0x00FF) << 16) | 0x000000FF;
#endif
}

// RGBA8 format maps to R.
template <>
ALWAYS_INLINE uint8_t convert_pixel<uint8_t>(uint32_t src) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return (src >> 16) & 0xFF;
#else
  return (src >> 8) & 0xFF;
#endif
}

// RGBA8 formats maps to R,G.
template <>
ALWAYS_INLINE uint16_t convert_pixel<uint16_t>(uint32_t src) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return ((src >> 16) & 0x00FF) | (src & 0xFF00);
#else
  return (src & 0xFF00) | ((src >> 16) & 0x00FF);
#endif
}

// R8 format maps to R,0.
template <>
ALWAYS_INLINE uint16_t convert_pixel<uint16_t>(uint8_t src) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return src;
#else
  return uint16_t(src) << 8;
#endif
}

// RG8 format maps to R.
65 template <> 66 ALWAYS_INLINE uint8_t convert_pixel<uint8_t>(uint16_t src) { 67 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 68 return src & 0xFF; 69 #else 70 return src >> 8; 71 #endif 72 } 73 74 // Apply a u8 alpha mask to a u32 texture row 75 static inline void mask_row(uint32_t* dst, const uint8_t* mask, int span) { 76 auto* end = dst + span; 77 while (dst + 4 <= end) { 78 WideRGBA8 maskpx = expand_mask(dst, unpack(unaligned_load<PackedR8>(mask))); 79 WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); 80 PackedRGBA8 r = pack(muldiv255(dstpx, maskpx)); 81 unaligned_store(dst, r); 82 mask += 4; 83 dst += 4; 84 } 85 if (dst < end) { 86 WideRGBA8 maskpx = 87 expand_mask(dst, unpack(partial_load_span<PackedR8>(mask, end - dst))); 88 WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); 89 auto r = pack(maskpx + dstpx - muldiv255(dstpx, maskpx)); 90 partial_store_span(dst, r, end - dst); 91 } 92 } 93 94 // Apply a R8 alpha mask to a RGBA8 texture 95 static NO_INLINE void mask_blit(Texture& masktex, Texture& dsttex) { 96 int maskStride = masktex.stride(); 97 int destStride = dsttex.stride(); 98 char* dest = dsttex.sample_ptr(0, 0); 99 char* mask = masktex.sample_ptr(0, 0); 100 int span = dsttex.width; 101 102 for (int rows = dsttex.height; rows > 0; rows--) { 103 mask_row((uint32_t*)dest, (uint8_t*)mask, span); 104 dest += destStride; 105 mask += maskStride; 106 } 107 } 108 109 template <bool COMPOSITE, typename P> 110 static inline void copy_row(P* dst, const P* src, int span) { 111 // No scaling, so just do a fast copy. 112 memcpy(dst, src, span * sizeof(P)); 113 } 114 115 template <> 116 void copy_row<true, uint32_t>(uint32_t* dst, const uint32_t* src, int span) { 117 // No scaling, so just do a fast composite. 
118 auto* end = dst + span; 119 while (dst + 4 <= end) { 120 WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); 121 WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); 122 PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); 123 unaligned_store(dst, r); 124 src += 4; 125 dst += 4; 126 } 127 if (dst < end) { 128 WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - dst)); 129 WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); 130 auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); 131 partial_store_span(dst, r, end - dst); 132 } 133 } 134 135 template <bool COMPOSITE, typename P, typename U> 136 static inline void scale_row(P* dst, int dstWidth, const U* src, int srcWidth, 137 int span, int frac) { 138 // Do scaling with different source and dest widths. 139 for (P* end = dst + span; dst < end; dst++) { 140 *dst = convert_pixel<P>(*src); 141 // Step source according to width ratio. 142 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 143 src++; 144 } 145 } 146 } 147 148 template <> 149 void scale_row<true, uint32_t, uint32_t>(uint32_t* dst, int dstWidth, 150 const uint32_t* src, int srcWidth, 151 int span, int frac) { 152 // Do scaling with different source and dest widths. 153 // Gather source pixels four at a time for better packing. 
154 auto* end = dst + span; 155 for (; dst + 4 <= end; dst += 4) { 156 U32 srcn; 157 srcn.x = *src; 158 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 159 src++; 160 } 161 srcn.y = *src; 162 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 163 src++; 164 } 165 srcn.z = *src; 166 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 167 src++; 168 } 169 srcn.w = *src; 170 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 171 src++; 172 } 173 WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn)); 174 WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); 175 PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); 176 unaligned_store(dst, r); 177 } 178 if (dst < end) { 179 // Process any remaining pixels. Try to gather as many pixels as possible 180 // into a single source chunk for compositing. 181 U32 srcn = {*src, 0, 0, 0}; 182 if (end - dst > 1) { 183 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 184 src++; 185 } 186 srcn.y = *src; 187 if (end - dst > 2) { 188 for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { 189 src++; 190 } 191 srcn.z = *src; 192 } 193 } 194 WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn)); 195 WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); 196 auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); 197 partial_store_span(dst, r, end - dst); 198 } 199 } 200 201 template <bool COMPOSITE = false> 202 static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq, 203 Texture& dsttex, const IntRect& dstReq, 204 bool invertY, const IntRect& clipRect) { 205 assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && 206 dsttex.internal_format == GL_RGBA8)); 207 // Cache scaling ratios 208 int srcWidth = srcReq.width(); 209 int srcHeight = srcReq.height(); 210 int dstWidth = dstReq.width(); 211 int dstHeight = dstReq.height(); 212 // Compute valid dest bounds 213 IntRect dstBounds = 
dsttex.sample_bounds(dstReq).intersect(clipRect); 214 // Compute valid source bounds 215 IntRect srcBounds = srctex.sample_bounds(srcReq, invertY); 216 // If srcReq is outside the source texture, we need to clip the sampling 217 // bounds so that we never sample outside valid source bounds. Get texture 218 // bounds relative to srcReq and scale to dest-space rounding inward, using 219 // this rect to limit the dest bounds further. 220 IntRect srcClip = srctex.bounds() - srcReq.origin(); 221 if (invertY) { 222 srcClip.invert_y(srcReq.height()); 223 } 224 srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true); 225 dstBounds.intersect(srcClip); 226 // Check if clipped sampling bounds are empty 227 if (dstBounds.is_empty()) { 228 return; 229 } 230 231 // Calculate source and dest pointers from clamped offsets 232 int srcStride = srctex.stride(); 233 int destStride = dsttex.stride(); 234 char* dest = dsttex.sample_ptr(dstReq, dstBounds); 235 // Clip the source bounds by the destination offset. 
236 int fracX = srcWidth * dstBounds.x0; 237 int fracY = srcHeight * dstBounds.y0; 238 srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0); 239 srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0); 240 fracX %= dstWidth; 241 fracY %= dstHeight; 242 char* src = srctex.sample_ptr(srcReq, srcBounds, invertY); 243 // Inverted Y must step downward along source rows 244 if (invertY) { 245 srcStride = -srcStride; 246 } 247 int span = dstBounds.width(); 248 for (int rows = dstBounds.height(); rows > 0; rows--) { 249 switch (srctex.bpp()) { 250 case 1: 251 switch (dsttex.bpp()) { 252 case 2: 253 scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint8_t*)src, 254 srcWidth, span, fracX); 255 break; 256 case 4: 257 scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint8_t*)src, 258 srcWidth, span, fracX); 259 break; 260 default: 261 if (srcWidth == dstWidth) 262 copy_row<COMPOSITE>((uint8_t*)dest, (uint8_t*)src, span); 263 else 264 scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint8_t*)src, 265 srcWidth, span, fracX); 266 break; 267 } 268 break; 269 case 2: 270 switch (dsttex.bpp()) { 271 case 1: 272 scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint16_t*)src, 273 srcWidth, span, fracX); 274 break; 275 case 4: 276 scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint16_t*)src, 277 srcWidth, span, fracX); 278 break; 279 default: 280 if (srcWidth == dstWidth) 281 copy_row<COMPOSITE>((uint16_t*)dest, (uint16_t*)src, span); 282 else 283 scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint16_t*)src, 284 srcWidth, span, fracX); 285 break; 286 } 287 break; 288 case 4: 289 switch (dsttex.bpp()) { 290 case 1: 291 scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint32_t*)src, 292 srcWidth, span, fracX); 293 break; 294 case 2: 295 scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint32_t*)src, 296 srcWidth, span, fracX); 297 break; 298 default: 299 if (srcWidth == dstWidth) 300 copy_row<COMPOSITE>((uint32_t*)dest, (uint32_t*)src, span); 301 else 302 scale_row<COMPOSITE>((uint32_t*)dest, 
dstWidth, (uint32_t*)src, 303 srcWidth, span, fracX); 304 break; 305 } 306 break; 307 default: 308 assert(false); 309 break; 310 } 311 dest += destStride; 312 // Step source according to height ratio. 313 for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) { 314 src += srcStride; 315 } 316 } 317 } 318 319 template <bool COMPOSITE> 320 static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV, 321 float srcDU, sampler2D sampler) { 322 vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); 323 for (; span >= 4; span -= 4) { 324 auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); 325 unaligned_store(dest, srcpx); 326 dest += 4; 327 uv.x += 4 * srcDU; 328 } 329 if (span > 0) { 330 auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); 331 partial_store_span(dest, srcpx, span); 332 } 333 } 334 335 template <> 336 void linear_row_blit<true>(uint32_t* dest, int span, const vec2_scalar& srcUV, 337 float srcDU, sampler2D sampler) { 338 vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); 339 for (; span >= 4; span -= 4) { 340 WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); 341 WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); 342 PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); 343 unaligned_store(dest, r); 344 345 dest += 4; 346 uv.x += 4 * srcDU; 347 } 348 if (span > 0) { 349 WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); 350 WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span)); 351 PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); 352 partial_store_span(dest, r, span); 353 } 354 } 355 356 template <bool COMPOSITE> 357 static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV, 358 float srcDU, sampler2D sampler) { 359 vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); 360 for (; span >= 4; span -= 4) { 361 auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); 362 unaligned_store(dest, srcpx); 
363 dest += 4; 364 uv.x += 4 * srcDU; 365 } 366 if (span > 0) { 367 auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); 368 partial_store_span(dest, srcpx, span); 369 } 370 } 371 372 template <bool COMPOSITE> 373 static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV, 374 float srcDU, sampler2D sampler) { 375 vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); 376 for (; span >= 4; span -= 4) { 377 auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); 378 unaligned_store(dest, srcpx); 379 dest += 4; 380 uv.x += 4 * srcDU; 381 } 382 if (span > 0) { 383 auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); 384 partial_store_span(dest, srcpx, span); 385 } 386 } 387 388 template <bool COMPOSITE = false> 389 static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq, 390 Texture& dsttex, const IntRect& dstReq, 391 bool invertX, bool invertY, 392 const IntRect& clipRect) { 393 assert(srctex.internal_format == GL_RGBA8 || 394 srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8); 395 assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && 396 dsttex.internal_format == GL_RGBA8)); 397 // Compute valid dest bounds 398 IntRect dstBounds = dsttex.sample_bounds(dstReq); 399 dstBounds.intersect(clipRect); 400 // Check if sampling bounds are empty 401 if (dstBounds.is_empty()) { 402 return; 403 } 404 // Initialize sampler for source texture 405 sampler2D_impl sampler; 406 init_sampler(&sampler, srctex); 407 sampler.filter = TextureFilter::LINEAR; 408 // Compute source UVs 409 vec2_scalar srcUV(srcReq.x0, srcReq.y0); 410 vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), 411 float(srcReq.height()) / dstReq.height()); 412 if (invertX) { 413 // Advance to the end of the row and flip the step. 
414 srcUV.x += srcReq.width(); 415 srcDUV.x = -srcDUV.x; 416 } 417 // Inverted Y must step downward along source rows 418 if (invertY) { 419 srcUV.y += srcReq.height(); 420 srcDUV.y = -srcDUV.y; 421 } 422 // Skip to clamped source start 423 srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); 424 // Scale UVs by lerp precision 425 srcUV = linearQuantize(srcUV, 128); 426 srcDUV *= 128.0f; 427 // Calculate dest pointer from clamped offsets 428 int bpp = dsttex.bpp(); 429 int destStride = dsttex.stride(); 430 char* dest = dsttex.sample_ptr(dstReq, dstBounds); 431 int span = dstBounds.width(); 432 for (int rows = dstBounds.height(); rows > 0; rows--) { 433 switch (bpp) { 434 case 1: 435 linear_row_blit<COMPOSITE>((uint8_t*)dest, span, srcUV, srcDUV.x, 436 &sampler); 437 break; 438 case 2: 439 linear_row_blit<COMPOSITE>((uint16_t*)dest, span, srcUV, srcDUV.x, 440 &sampler); 441 break; 442 case 4: 443 linear_row_blit<COMPOSITE>((uint32_t*)dest, span, srcUV, srcDUV.x, 444 &sampler); 445 break; 446 default: 447 assert(false); 448 break; 449 } 450 dest += destStride; 451 srcUV.y += srcDUV.y; 452 } 453 } 454 455 // Whether the blit format is renderable. 
456 static inline bool is_renderable_format(GLenum format) { 457 switch (format) { 458 case GL_R8: 459 case GL_RG8: 460 case GL_RGBA8: 461 return true; 462 default: 463 return false; 464 } 465 } 466 467 extern "C" { 468 469 void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, 470 GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, 471 GLbitfield mask, GLenum filter) { 472 assert(mask == GL_COLOR_BUFFER_BIT); 473 Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER); 474 if (!srcfb) return; 475 Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER); 476 if (!dstfb) return; 477 Texture& srctex = ctx->textures[srcfb->color_attachment]; 478 if (!srctex.buf) return; 479 Texture& dsttex = ctx->textures[dstfb->color_attachment]; 480 if (!dsttex.buf) return; 481 assert(!dsttex.locked); 482 if (srctex.internal_format != dsttex.internal_format && 483 (!is_renderable_format(srctex.internal_format) || 484 !is_renderable_format(dsttex.internal_format))) { 485 // If the internal formats don't match, then we may have to convert. Require 486 // that the format is a simple renderable format to limit combinatoric 487 // explosion for now. 
488 assert(false); 489 return; 490 } 491 // Force flipped Y onto dest coordinates 492 if (srcY1 < srcY0) { 493 swap(srcY0, srcY1); 494 swap(dstY0, dstY1); 495 } 496 bool invertY = dstY1 < dstY0; 497 if (invertY) { 498 swap(dstY0, dstY1); 499 } 500 IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset; 501 IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset; 502 if (srcReq.is_empty() || dstReq.is_empty()) { 503 return; 504 } 505 IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()}; 506 prepare_texture(srctex); 507 prepare_texture(dsttex, &dstReq); 508 if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR && 509 srctex.internal_format == dsttex.internal_format && 510 is_renderable_format(srctex.internal_format)) { 511 linear_blit(srctex, srcReq, dsttex, dstReq, false, invertY, dstReq); 512 } else { 513 scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect); 514 } 515 } 516 517 // Get the underlying buffer for a locked resource 518 void* GetResourceBuffer(LockedTexture* resource, int32_t* width, 519 int32_t* height, int32_t* stride) { 520 *width = resource->width; 521 *height = resource->height; 522 *stride = resource->stride(); 523 return resource->buf; 524 } 525 526 // Extension for optimized compositing of textures or framebuffers that may be 527 // safely used across threads. The source and destination must be locked to 528 // ensure that they can be safely accessed while the SWGL context might be used 529 // by another thread. Band extents along the Y axis may be used to clip the 530 // destination rectangle without effecting the integer scaling ratios. 
531 void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX, 532 GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, 533 GLint dstY, GLsizei dstWidth, GLsizei dstHeight, 534 GLboolean opaque, GLboolean flipX, GLboolean flipY, 535 GLenum filter, GLint clipX, GLint clipY, GLsizei clipWidth, 536 GLsizei clipHeight) { 537 if (!lockedDst || !lockedSrc) { 538 return; 539 } 540 Texture& srctex = *lockedSrc; 541 Texture& dsttex = *lockedDst; 542 assert(srctex.bpp() == 4); 543 assert(dsttex.bpp() == 4); 544 545 IntRect srcReq = 546 IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset; 547 IntRect dstReq = 548 IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; 549 if (srcReq.is_empty() || dstReq.is_empty()) { 550 return; 551 } 552 553 // Compute clip rect as relative to the dstReq, as that's the same coords 554 // as used for the sampling bounds. 555 IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, 556 clipY - dstY + clipHeight}; 557 // Ensure we have rows of at least 2 pixels when using the linear filter to 558 // avoid overreading the row. Force X flips onto the linear filter for now 559 // until scale_blit supports it. 560 bool useLinear = 561 srctex.width >= 2 && 562 (flipX || (!srcReq.same_size(dstReq) && filter == GL_LINEAR)); 563 564 if (opaque) { 565 if (useLinear) { 566 linear_blit<false>(srctex, srcReq, dsttex, dstReq, flipX, flipY, 567 clipRect); 568 } else { 569 scale_blit<false>(srctex, srcReq, dsttex, dstReq, flipY, clipRect); 570 } 571 } else { 572 if (useLinear) { 573 linear_blit<true>(srctex, srcReq, dsttex, dstReq, flipX, flipY, clipRect); 574 } else { 575 scale_blit<true>(srctex, srcReq, dsttex, dstReq, flipY, clipRect); 576 } 577 } 578 } 579 580 // Extension used by the SWGL compositor to apply an alpha mask 581 // to a texture. The textures must be the same size. The mask 582 // must be R8, the texture must be RGBA8. 
void ApplyMask(LockedTexture* lockedDst, LockedTexture* lockedMask) {
  // Both resources must be locked before use.
  assert(lockedDst);
  assert(lockedMask);

  Texture& masktex = *lockedMask;
  Texture& dsttex = *lockedDst;

  // The mask must be R8 (1bpp) and the destination RGBA8 (4bpp).
  assert(masktex.bpp() == 1);
  assert(dsttex.bpp() == 4);

  // The textures must have identical dimensions.
  assert(masktex.width == dsttex.width);
  assert(masktex.height == dsttex.height);

  mask_blit(masktex, dsttex);
}

}  // extern "C"

// Saturated add helper for YUV conversion. Supported platforms have intrinsics
// to do this natively, but support a slower generic fallback just in case.
static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) {
#if USE_SSE2
  return _mm_adds_epi16(x, y);
#elif USE_NEON
  return vqaddq_s16(x, y);
#else
  auto r = x + y;
  // An overflow occurred if the signs of both inputs x and y did not differ
  // but yet the sign of the result did differ.
  // The arithmetic shift by 15 smears the overflow bit into an all-ones or
  // all-zeros lane mask.
  auto overflow = (~(x ^ y) & (r ^ x)) >> 15;
  // If there was an overflow, we need to choose the appropriate limit to clamp
  // to depending on whether or not the inputs are negative.
  // (x >> 15) is all-ones for negative x, so the XOR yields INT16_MIN
  // (0x8000) for negative inputs and INT16_MAX (0x7FFF) otherwise.
  auto limit = (x >> 15) ^ 0x7FFF;
  // If we didn't overflow, just use the result, and otherwise, use the limit.
  return (~overflow & r) | (overflow & limit);
#endif
}

// Interleave and packing helper for YUV conversion. During transform by the
// color matrix, the color components are de-interleaved as this format is
// usually what comes out of the planar YUV textures. The components thus need
// to be interleaved before finally getting packed to BGRA format. Alpha is
// forced to be opaque.
static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) {
  // zip() interleaves the B/R and G/G lanes back into BGRA order; the OR
  // forces every alpha byte to 255 (opaque).
  return pack(bit_cast<WideRGBA8>(zip(br, gg))) |
         PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
}

// clang-format off
// Supports YUV color matrixes of the form:
// [R]   [1.1643835616438356,  0.0,  rv ]   [Y -  16]
// [G] = [1.1643835616438358, -gu,  -gv ] x [U - 128]
// [B]   [1.1643835616438356,  bu,  0.0 ]   [V - 128]
// We must be able to multiply a YUV input by a matrix coefficient ranging as
// high as ~2.2 in the U/V cases, where U/V can be signed values between -128
// and 127. The largest fixed-point representation we can thus support without
// overflowing 16 bit integers leaves us 6 bits of fractional precision while
// also supporting a sign bit. The closest representation of the Y coefficient
// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces
// we support. Conversions can still sometimes overflow the precision and
// require clamping back into range, so we use saturated additions to do this
// efficiently at no extra cost.
// clang-format on
struct YUVMatrix {
  // These constants are loaded off the "this" pointer via relative addressing
  // modes and should be about as quick to load as directly addressed SIMD
  // constant memory.

  V8<int16_t> br_uvCoeffs;  // biased by 6 bits [b_from_u, r_from_v, repeats]
  V8<int16_t> gg_uvCoeffs;  // biased by 6 bits [g_from_u, g_from_v, repeats]
  V8<uint16_t> yCoeffs;     // biased by 7 bits
  V8<int16_t> yBias;        // 0 or 16
  V8<int16_t> uvBias;       // 128
  V8<int16_t> br_yMask;     // all-ones normally; zero for gbr-identity input

  // Build a YUVMatrix from a bias vector and a (possibly rescaled) RGB-from-
  // debiased-YCbCr matrix, validating that it has the supported shape.
  // E.g. rec709-narrow:
  // [ 1.16,     0,  1.79, -0.97 ]
  // [ 1.16, -0.21, -0.53,  0.30 ]
  // [ 1.16,  2.11,     0, -1.13 ]
  // =
  // [ yScale,        0, r_from_v ]   ([Y ]              )
  // [ yScale, g_from_u, g_from_v ] x ([cb] - ycbcr_bias )
  // [ yScale, b_from_u,        0 ]   ([cr]              )
  static YUVMatrix From(const vec3_scalar& ycbcr_bias,
                        const mat3_scalar& rgb_from_debiased_ycbcr,
                        int rescale_factor = 0) {
    assert(ycbcr_bias.z == ycbcr_bias.y);

    const auto rgb_from_y = rgb_from_debiased_ycbcr[0].y;
    assert(rgb_from_debiased_ycbcr[0].x == rgb_from_debiased_ycbcr[0].z);

    int16_t br_from_y_mask = -1;
    if (rgb_from_debiased_ycbcr[0].x == 0.0) {
      // gbr-identity matrix?
      assert(rgb_from_debiased_ycbcr[0].x == 0);
      assert(rgb_from_debiased_ycbcr[0].y >= 1);
      assert(rgb_from_debiased_ycbcr[0].z == 0);

      assert(rgb_from_debiased_ycbcr[1].x == 0);
      assert(rgb_from_debiased_ycbcr[1].y == 0);
      assert(rgb_from_debiased_ycbcr[1].z >= 1);

      assert(rgb_from_debiased_ycbcr[2].x >= 1);
      assert(rgb_from_debiased_ycbcr[2].y == 0);
      assert(rgb_from_debiased_ycbcr[2].z == 0);

      assert(ycbcr_bias.x == 0);
      assert(ycbcr_bias.y == 0);
      assert(ycbcr_bias.z == 0);

      br_from_y_mask = 0;
    } else {
      assert(rgb_from_debiased_ycbcr[0].x == rgb_from_y);
    }

    assert(rgb_from_debiased_ycbcr[1].x == 0.0);
    const auto g_from_u = rgb_from_debiased_ycbcr[1].y;
    const auto b_from_u = rgb_from_debiased_ycbcr[1].z;

    const auto r_from_v = rgb_from_debiased_ycbcr[2].x;
    const auto g_from_v = rgb_from_debiased_ycbcr[2].y;
    assert(rgb_from_debiased_ycbcr[2].z == 0.0);

    return YUVMatrix({ycbcr_bias.x, ycbcr_bias.y}, rgb_from_y, br_from_y_mask,
                     r_from_v, g_from_u, g_from_v, b_from_u, rescale_factor);
  }

  // Convert matrix coefficients to fixed-point representation. If the matrix
  // has a rescaling applied to it, then we need to take care to undo the
  // scaling so that we can convert the coefficients to fixed-point range. The
  // bias still requires shifting to apply the rescaling. The rescaling will be
  // applied to the actual YCbCr sample data later by manually shifting it
  // before applying this matrix.
  YUVMatrix(vec2_scalar yuv_bias, double yCoeff, int16_t br_yMask_, double rv,
            double gu, double gv, double bu, int rescale_factor = 0)
      : br_uvCoeffs(zip(I16(int16_t(bu * (1 << (6 - rescale_factor)) + 0.5)),
                        I16(int16_t(rv * (1 << (6 - rescale_factor)) + 0.5)))),
        gg_uvCoeffs(
            zip(I16(-int16_t(-gu * (1 << (6 - rescale_factor)) +
                             0.5)),  // These are negative coeffs, so
                                     // round them away from zero
                I16(-int16_t(-gv * (1 << (6 - rescale_factor)) + 0.5)))),
        yCoeffs(uint16_t(yCoeff * (1 << (6 + 1 - rescale_factor)) + 0.5)),
        // We have a +0.5 fudge-factor for -ybias.
        // Without this, we get white=254 not 255.
        // This approximates rounding rather than truncation during `gg >>= 6`.
        yBias(int16_t(((yuv_bias.x * 255 * yCoeff) - 0.5) * (1 << 6))),
        uvBias(int16_t(yuv_bias.y * (255 << rescale_factor) + 0.5)),
        br_yMask(br_yMask_) {
    assert(yuv_bias.x >= 0);
    assert(yuv_bias.y >= 0);
    assert(yCoeff > 0);
    assert(br_yMask_ == 0 || br_yMask_ == -1);
    assert(bu > 0);
    assert(rv > 0);
    assert(gu <= 0);
    assert(gv <= 0);
    assert(rescale_factor <= 6);
  }

  // Transform a de-interleaved pair of Y and U/V sample vectors into packed
  // opaque BGRA pixels using the fixed-point coefficients above.
  ALWAYS_INLINE PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) const {
    // We gave ourselves an extra bit (7 instead of 6) of bias to give us some
    // extra precision for the more-sensitive y scaling.
    // Note that we have to use an unsigned multiply with a 2x scale to
    // represent a fractional scale and to avoid shifting with the sign bit.

    // Note: if you subtract the bias before multiplication, we see more
    // underflows. This could be fixed by an unsigned subsat.
    yy = bit_cast<V8<int16_t>>((bit_cast<V8<uint16_t>>(yy) * yCoeffs) >> 1);
    yy -= yBias;

    // Compute [B] = [yCoeff*Y + bu*U + 0*V]
    //         [R]   [yCoeff*Y +  0*U + rv*V]
    uv -= uvBias;
    auto br = br_uvCoeffs * uv;
    br = addsat(yy & br_yMask, br);
    br >>= 6;

    // Compute G = yCoeff*Y + gu*U + gv*V
    // First calc [gu*U, gv*V, ...]:
    auto gg = gg_uvCoeffs * uv;
    // Then cross the streams to get `gu*U + gv*V`:
    gg = addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16));
    // Add the other parts:
    gg = addsat(yy, gg);  // This is the part that needs the most headroom
                          // usually. In particular, ycbcr(255,255,255) hugely
                          // saturates.
    gg >>= 6;

    // Interleave B/R and G values. Force alpha (high-gg half) to opaque.
    return packYUV(gg, br);
  }
};

// Helper function for textureLinearRowR8 that samples horizontal taps and
// combines them based on Y fraction with next row.
template <typename S>
static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
                                                 int32_t offsety,
                                                 int32_t stridey,
                                                 int16_t fracy) {
  uint8_t* buf = (uint8_t*)sampler->buf + offsety;
  // Load the two horizontal taps for each of the four X offsets on this row.
  auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);
  // Then the same taps on the next row down.
  buf += stridey;
  auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);
  // Lerp between the two rows by the 7-bit Y fraction.
  abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
  return abcd0;
}
// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes
// constant Y and returns a duplicate of the result interleaved with itself
// to aid in later YUV transformation.
template <typename S>
static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety,
                                             int32_t stridey, int16_t fracy) {
  assert(sampler->format == TextureFormat::R8);

  // Calculate X fraction and clamp X offset into range.
  I32 fracx = ix;
  ix >>= 7;
  // Zero the fraction for lanes that clamp at either texture edge so the
  // lerp degenerates to the nearest texel there.
  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
  ix = clampCoord(ix, sampler->width - 1);

  // Load the sample taps and combine rows.
  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);

  // Unzip the result and do final horizontal multiply-add base on X fraction.
  auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6);
  auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7);
  abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7;

  // The final result is the packed values interleaved with a duplicate of
  // themselves.
  return abcdl;
}

// Optimized version of textureLinearPackedR8 for paired U/V R8 textures.
// Since the two textures have the same dimensions and stride, the addressing
// math can be shared between both samplers. This also allows a coalesced
// multiply in the final stage by packing both U/V results into a single
// operation.
template <typename S>
static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
                                                   I32 ix, int32_t offsety,
                                                   int32_t stridey,
                                                   int16_t fracy) {
  assert(sampler->format == TextureFormat::R8 &&
         sampler2->format == TextureFormat::R8);
  assert(sampler->width == sampler2->width &&
         sampler->height == sampler2->height);
  assert(sampler->stride == sampler2->stride);

  // Calculate X fraction and clamp X offset into range.
  I32 fracx = ix;
  ix >>= 7;
  // Zero the fraction for lanes that clamp at either texture edge so the
  // lerp degenerates to the nearest texel there.
  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
  ix = clampCoord(ix, sampler->width - 1);

  // Load the sample taps for the first sampler and combine rows.
  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);

  // Load the sample taps for the second sampler and combine rows.
  auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy);

  // We are left with a result vector for each sampler with values for adjacent
  // pixels interleaved together in each. We need to unzip these values so that
  // we can do the final horizontal multiply-add based on the X fraction.
  auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14);
  auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15);
  abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7;

  // The final result is the packed values for the first sampler interleaved
  // with the packed values for the second sampler.
  return abcdxyzwl;
}

// Casting to int loses some precision while stepping that can offset the
// image, so shift the values by some extra bits of precision to minimize
// this. We support up to 16 bits of image size, 7 bits of quantization,
// and 1 bit for sign, which leaves 8 bits left for extra precision.
const int STEP_BITS = 8;

// Optimized version of textureLinearPackedR8 for Y R8 texture with
// half-resolution paired U/V R8 textures. This allows us to more efficiently
// pack YUV samples into vectors to substantially reduce math operations even
// further.
// Inner loop for the half-resolution-chroma (4:2:0-style) fast path. Walks
// `span` destination pixels in chunks of 4, sampling the Y row and the two
// chroma rows with fixed-point linear filtering and converting through
// `colorSpace`. Callers guarantee (see linear_row_yuv) that all taps stay
// inside the rows passed in.
template <bool BLEND>
static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow,
                                  I32 yU, int32_t yDU, int32_t yStrideV,
                                  int16_t yFracV, uint8_t* cRow1,
                                  uint8_t* cRow2, I32 cU, int32_t cDU,
                                  int32_t cStrideV, int16_t cFracV,
                                  const YUVMatrix& colorSpace) {
  // As much as possible try to utilize the fact that we're only using half
  // the UV samples to combine Y and UV samples into single vectors. Here we
  // need to initialize several useful vector quantities for stepping
  // fractional offsets. For the UV samples, we take the average of the
  // first+second and third+fourth samples in a chunk which conceptually
  // correspond to offsets 0.5 and 1.5 (in 0..2 range). This allows us to
  // reconstruct intermediate samples 0.25, 0.75, 1.25, and 1.75 later. X
  // fraction is shifted over into the top 7 bits of an unsigned short so that
  // we can mask off the exact fractional bits we need to blend merely by
  // right shifting them into position.
  cU = (cU.xzxz + cU.ywyw) >> 1;
  auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>)
                 << (16 - (STEP_BITS + 7));
  auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7));
  auto ycFracV = combine(I16(yFracV), I16(cFracV));
  I32 yI = yU >> (STEP_BITS + 7);
  I32 cI = cU >> (STEP_BITS + 7);
  // Load initial combined YUV samples for each row and blend them
  // vertically by the Y/chroma row fractions.
  auto ycSrc0 =
      CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]),
                      combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]),
                              unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))),
              V8<int16_t>);
  auto ycSrc1 = CONVERT(
      combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]),
              combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]),
                      unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))),
      V8<int16_t>);
  auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7);

  // Here we shift in results from the next sample while caching results from
  // the previous sample. This allows us to reduce the multiplications in the
  // inner loop down to only two since we just need to blend the new samples
  // horizontally and then vertically once each.
  for (uint32_t* end = dest + span; dest < end; dest += 4) {
    yU += yDU;
    I32 yIn = yU >> (STEP_BITS + 7);
    cU += cDU;
    I32 cIn = cU >> (STEP_BITS + 7);
    // Load combined YUV samples for the next chunk on each row and blend them.
    auto ycSrc0n =
        CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]),
                        combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]),
                                unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))),
                V8<int16_t>);
    auto ycSrc1n = CONVERT(
        combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]),
                combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]),
                        unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))),
        V8<int16_t>);
    auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7);

    // The source samples for the chunk may not match the actual tap offsets.
    // Since we're upscaling, we know the tap offsets fall within all the
    // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar,
    // instead we do laborious shuffling here for the Y samples and then the UV
    // samples.
    auto yshuf = lowHalf(ycSrc);
    auto yshufn =
        SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn),
                1, 2, 3, 4);
    if (yI.y == yI.x) {
      yshuf = yshuf.xxyz;
      yshufn = yshufn.xxyz;
    }
    if (yI.z == yI.y) {
      yshuf = yshuf.xyyz;
      yshufn = yshufn.xyyz;
    }
    if (yI.w == yI.z) {
      yshuf = yshuf.xyzz;
      yshufn = yshufn.xyzz;
    }

    auto cshuf = highHalf(ycSrc);
    auto cshufn =
        SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn),
                1, 4, 3, 6);
    if (cI.y == cI.x) {
      cshuf = cshuf.xxzz;
      cshufn = cshufn.xxzz;
    }

    // After shuffling, combine the Y and UV samples back into a single vector
    // for blending. Shift X fraction into position as unsigned to mask off top
    // bits and get rid of low bits to avoid multiplication overflow.
    auto yuvPx = combine(yshuf, cshuf);
    yuvPx += ((combine(yshufn, cshufn) - yuvPx) *
              bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >>
             7;

    // Cache the new samples as the current samples on the next iteration.
    ycSrc = ycSrcn;
    ycFracX += ycFracDX;
    yI = yIn;
    cI = cIn;

    // De-interleave the Y and UV results. We need to average the UV results
    // to produce values for intermediate samples. Taps for UV were collected
    // at offsets 0.5 and 1.5, such that if we take a quarter of the difference
    // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples,
    // we can estimate samples 0.25, 0.75, 1.25, and 1.75.
    auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3);
    auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) +
                ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) -
                  SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >>
                 2);

    commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
  }
}

// This is the inner loop driver of CompositeYUV that processes an axis-aligned
// YUV span, dispatching based on appropriate format and scaling.
// This is also
// reused by blendYUV to accelerate some cases of texture sampling in the
// shader.
//
// Converts one destination row of `span` BGRA pixels from the three YUV
// planes, linearly filtering where safe and dispatching between R8/R16
// plane formats. `colorSpace` supplies the YCbCr->RGB transform.
template <bool BLEND = false>
static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY,
                           const vec2_scalar& srcUV, float srcDU,
                           sampler2DRect samplerU, sampler2DRect samplerV,
                           const vec2_scalar& chromaUV, float chromaDU,
                           int colorDepth, const YUVMatrix& colorSpace) {
  // Calculate varying and constant interp data for Y plane.
  I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
  int32_t yV = int32_t(srcUV.y);

  // Calculate varying and constant interp data for chroma planes.
  I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS));
  int32_t cV = int32_t(chromaUV.y);

  // We need to skip 4 pixels per chunk.
  int32_t yDU = int32_t((4 << STEP_BITS) * srcDU);
  int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU);

  if (samplerY->width < 2 || samplerU->width < 2) {
    // If the source row has less than 2 pixels, it's not safe to use a linear
    // filter because it may overread the row. Just convert the single pixel
    // with nearest filtering and fill the row with it.
    Float yuvF = {texelFetch(samplerY, ivec2(srcUV)).x.x,
                  texelFetch(samplerU, ivec2(chromaUV)).x.x,
                  texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f};
    // If this is an HDR LSB format, we need to renormalize the result.
    if (colorDepth > 8) {
      int rescaleFactor = 16 - colorDepth;
      yuvF *= float(1 << rescaleFactor);
    }
    I16 yuv = CONVERT(round_pixel(yuvF), I16);
    commit_solid_span<BLEND>(
        dest,
        unpack(colorSpace.convert(V8<int16_t>(yuv.x),
                                  zip(I16(yuv.y), I16(yuv.z)))),
        span);
  } else if (samplerY->format == TextureFormat::R16) {
    // Sample each YUV plane, rescale it to fit in low 8 bits of word, and
    // then transform them by the appropriate color space.
    assert(colorDepth > 8);
    // Need to right shift the sample by the amount of bits over 8 it
    // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
    // of precision at the low end already, hence 1 is subtracted from the
    // color depth.
    int rescaleBits = (colorDepth - 1) - 8;
    for (; span >= 4; span -= 4) {
      auto yPx =
          textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
          rescaleBits;
      auto uPx =
          textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      auto vPx =
          textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      commit_blend_span<BLEND>(
          dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)));
      dest += 4;
      yU += yDU;
      cU += cDU;
    }
    if (span > 0) {
      // Handle any remaining pixels...
      auto yPx =
          textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
          rescaleBits;
      auto uPx =
          textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      auto vPx =
          textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
          rescaleBits;
      commit_blend_span<BLEND>(
          dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span);
    }
  } else {
    assert(samplerY->format == TextureFormat::R8);
    assert(colorDepth == 8);

    // Calculate varying and constant interp data for Y plane. The low 7 bits
    // of the Y coordinate are the vertical lerp fraction.
    int16_t yFracV = yV & 0x7F;
    yV >>= 7;
    int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride;
    // Stride is zeroed when the second row tap would fall outside the
    // texture, so both taps read the same (clamped) row.
    int32_t yStrideV =
        yV >= 0 && yV < int32_t(samplerY->height) - 1 ? samplerY->stride : 0;

    // Calculate varying and constant interp data for chroma planes.
    int16_t cFracV = cV & 0x7F;
    cV >>= 7;
    int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride;
    int32_t cStrideV =
        cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0;

    // If we're sampling the UV planes at half the resolution of the Y plane,
    // then try to use half resolution fast-path.
    if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) &&
        cDU <= (2 << (STEP_BITS + 7))) {
      // Ensure that samples don't fall outside of the valid bounds of each
      // planar texture. Step until the initial X coordinates are positive.
      for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) {
        auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
                                      yStrideV, yFracV);
        auto uvPx = textureLinearRowPairedR8(
            samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV);
        commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
        dest += 4;
        yU += yDU;
        cU += cDU;
      }
      // Calculate the number of aligned chunks that we can step inside the
      // bounds of each planar texture without overreading.
      int inside = min(
          min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU,
              (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) *
              4,
          span & ~3);
      if (inside > 0) {
        uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV;
        uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV;
        uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV;
        upscaleYUV42R8<BLEND>(dest, inside, yRow, yU, yDU, yStrideV, yFracV,
                              cRow1, cRow2, cU, cDU, cStrideV, cFracV,
                              colorSpace);
        span -= inside;
        dest += inside;
        yU += (inside / 4) * yDU;
        cU += (inside / 4) * cDU;
      }
      // If there are any remaining chunks that weren't inside, handle them
      // below.
    }
    for (; span >= 4; span -= 4) {
      // Sample each YUV plane and then transform them by the appropriate
      // color space.
      auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
                                    yStrideV, yFracV);
      auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
                                           cOffsetV, cStrideV, cFracV);
      commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
      dest += 4;
      yU += yDU;
      cU += cDU;
    }
    if (span > 0) {
      // Handle any remaining pixels...
      auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
                                    yStrideV, yFracV);
      auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
                                           cOffsetV, cStrideV, cFracV);
      commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx), span);
    }
  }
}

// Clips the requested composite to the destination, sets up per-plane
// samplers and fixed-point source coordinates (including X/Y inversion and
// separate chroma-plane scaling), then converts row by row via
// linear_row_yuv.
static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex,
                               const YUVMatrix& rgbFromYcbcr, int colorDepth,
                               const IntRect& srcReq, Texture& dsttex,
                               const IntRect& dstReq, bool invertX,
                               bool invertY, const IntRect& clipRect) {
  // Compute valid dest bounds
  IntRect dstBounds = dsttex.sample_bounds(dstReq);
  dstBounds.intersect(clipRect);
  // Check if sampling bounds are empty
  if (dstBounds.is_empty()) {
    return;
  }
  // Initialize samplers for source textures
  sampler2DRect_impl sampler[3];
  init_sampler(&sampler[0], ytex);
  init_sampler(&sampler[1], utex);
  init_sampler(&sampler[2], vtex);

  // Compute source UVs
  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
                     float(srcReq.height()) / dstReq.height());
  if (invertX) {
    // Advance to the end of the row and flip the step.
    srcUV.x += srcReq.width();
    srcDUV.x = -srcDUV.x;
  }
  // Inverted Y must step downward along source rows
  if (invertY) {
    srcUV.y += srcReq.height();
    srcDUV.y = -srcDUV.y;
  }
  // Skip to clamped source start
  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
  // Calculate separate chroma UVs for chroma planes with different scale
  vec2_scalar chromaScale(float(utex.width) / ytex.width,
                          float(utex.height) / ytex.height);
  vec2_scalar chromaUV = srcUV * chromaScale;
  vec2_scalar chromaDUV = srcDUV * chromaScale;
  // Scale UVs by lerp precision. If the row has only 1 pixel, then don't
  // quantize so that we can use nearest filtering instead to avoid overreads.
  if (ytex.width >= 2 && utex.width >= 2) {
    srcUV = linearQuantize(srcUV, 128);
    srcDUV *= 128.0f;
    chromaUV = linearQuantize(chromaUV, 128);
    chromaDUV *= 128.0f;
  }
  // Calculate dest pointer from clamped offsets
  int destStride = dsttex.stride();
  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
  int span = dstBounds.width();
  for (int rows = dstBounds.height(); rows > 0; rows--) {
    linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x,
                   &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth,
                   rgbFromYcbcr);
    dest += destStride;
    srcUV.y += srcDUV.y;
    chromaUV.y += chromaDUV.y;
  }
}

// -
// This section must match gfx/2d/Types.h

enum class YUVRangedColorSpace : uint8_t {
  BT601_Narrow = 0,
  BT601_Full,
  BT709_Narrow,
  BT709_Full,
  BT2020_Narrow,
  BT2020_Full,
  GbrIdentity,
};

// -
// This section must match yuv.glsl

// Returns {achromatic-zero Y, achromatic-zero CbCr, full-scale Y,
// full-scale CbCr} normalized to [0,1] for the given color space and depth.
vec4_scalar get_ycbcr_zeros_ones(const YUVRangedColorSpace color_space,
                                 const GLuint color_depth) {
  // For SWGL's 8bpc-only pipeline, our extra care here probably doesn't
  // matter. However, technically e.g. 10-bit achromatic zero for cb and cr is
  // (128 << 2) / ((1 << 10) - 1) = 512 / 1023, which != 128 / 255, and affects
  // our matrix values subtly. Maybe not enough to matter? But it's the most
  // correct thing to do.
  // Unlike the glsl version, our texture samples are u8([0,255]) not
  // u16([0,1023]) though.
  switch (color_space) {
    case YUVRangedColorSpace::BT601_Narrow:
    case YUVRangedColorSpace::BT709_Narrow:
    case YUVRangedColorSpace::BT2020_Narrow: {
      // Narrow-range limits scale with bit depth: 16/128/235/240 at 8bpc.
      auto extra_bit_count = color_depth - 8;
      vec4_scalar zo = {
          float(16 << extra_bit_count),
          float(128 << extra_bit_count),
          float(235 << extra_bit_count),
          float(240 << extra_bit_count),
      };
      float all_bits = (1 << color_depth) - 1;
      zo /= all_bits;
      return zo;
    }

    case YUVRangedColorSpace::BT601_Full:
    case YUVRangedColorSpace::BT709_Full:
    case YUVRangedColorSpace::BT2020_Full: {
      // Full range uses the whole code range; only the chroma zero point is
      // taken from the narrow computation.
      const auto narrow =
          get_ycbcr_zeros_ones(YUVRangedColorSpace::BT601_Narrow, color_depth);
      return {0.0, narrow.y, 1.0, 1.0};
    }

    case YUVRangedColorSpace::GbrIdentity:
      break;
  }
  return {0.0, 0.0, 1.0, 1.0};
}

constexpr mat3_scalar RgbFromYuv_Rec601 = {
    {1.00000, 1.00000, 1.00000},
    {0.00000, -0.17207, 0.88600},
    {0.70100, -0.35707, 0.00000},
};
constexpr mat3_scalar RgbFromYuv_Rec709 = {
    {1.00000, 1.00000, 1.00000},
    {0.00000, -0.09366, 0.92780},
    {0.78740, -0.23406, 0.00000},
};
constexpr mat3_scalar RgbFromYuv_Rec2020 = {
    {1.00000, 1.00000, 1.00000},
    {0.00000, -0.08228, 0.94070},
    {0.73730, -0.28568, 0.00000},
};
constexpr mat3_scalar RgbFromYuv_GbrIdentity = {
    {0, 1, 0},
    {0, 0, 1},
    {1, 0, 0},
};

// Selects the RGB-from-YUV matrix for a color space; narrow and full ranges
// share the same matrix since range scaling is handled separately.
inline mat3_scalar get_rgb_from_yuv(const YUVRangedColorSpace color_space) {
  switch (color_space) {
    case YUVRangedColorSpace::BT601_Narrow:
    case YUVRangedColorSpace::BT601_Full:
      return RgbFromYuv_Rec601;
    case YUVRangedColorSpace::BT709_Narrow:
    case YUVRangedColorSpace::BT709_Full:
      return RgbFromYuv_Rec709;
    case YUVRangedColorSpace::BT2020_Narrow:
    case YUVRangedColorSpace::BT2020_Full:
      return RgbFromYuv_Rec2020;
    case YUVRangedColorSpace::GbrIdentity:
      break;
  }
  return RgbFromYuv_GbrIdentity;
}

// Bias plus matrix that together map debiased YCbCr samples to RGB.
struct YcbcrInfo final {
  vec3_scalar ycbcr_bias;
  mat3_scalar rgb_from_debiased_ycbcr;
};

// Builds the bias vector and combined range-scale + color matrix for a color
// space. The incoming color_depth is deliberately ignored (forced to 8).
inline YcbcrInfo get_ycbcr_info(const YUVRangedColorSpace color_space,
                                GLuint color_depth) {
  // SWGL always does 8bpc math, so don't scale the matrix for 10bpc!
  color_depth = 8;

  const auto zeros_ones = get_ycbcr_zeros_ones(color_space, color_depth);
  const auto zeros = vec2_scalar{zeros_ones.x, zeros_ones.y};
  const auto ones = vec2_scalar{zeros_ones.z, zeros_ones.w};
  const auto scale = 1.0f / (ones - zeros);

  const auto rgb_from_yuv = get_rgb_from_yuv(color_space);
  // Diagonal range-expansion matrix; cb and cr share the same scale (y).
  const mat3_scalar yuv_from_debiased_ycbcr = {
      {scale.x, 0, 0},
      {0, scale.y, 0},
      {0, 0, scale.y},
  };

  YcbcrInfo ret;
  ret.ycbcr_bias = {zeros.x, zeros.y, zeros.y};
  ret.rgb_from_debiased_ycbcr = rgb_from_yuv * yuv_from_debiased_ycbcr;
  return ret;
}

// -

extern "C" {

// Extension for compositing a YUV surface represented by separate YUV planes
// to a BGRA destination. The supplied color space is used to determine the
// transform from YUV to BGRA after sampling.
1343 void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY, 1344 LockedTexture* lockedU, LockedTexture* lockedV, 1345 YUVRangedColorSpace colorSpace, GLuint colorDepth, GLint srcX, 1346 GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, 1347 GLint dstY, GLsizei dstWidth, GLsizei dstHeight, 1348 GLboolean flipX, GLboolean flipY, GLint clipX, GLint clipY, 1349 GLsizei clipWidth, GLsizei clipHeight) { 1350 if (!lockedDst || !lockedY || !lockedU || !lockedV) { 1351 return; 1352 } 1353 if (colorSpace > YUVRangedColorSpace::GbrIdentity) { 1354 assert(false); 1355 return; 1356 } 1357 const auto ycbcrInfo = get_ycbcr_info(colorSpace, colorDepth); 1358 const auto rgbFromYcbcr = 1359 YUVMatrix::From(ycbcrInfo.ycbcr_bias, ycbcrInfo.rgb_from_debiased_ycbcr); 1360 1361 Texture& ytex = *lockedY; 1362 Texture& utex = *lockedU; 1363 Texture& vtex = *lockedV; 1364 Texture& dsttex = *lockedDst; 1365 // All YUV planes must currently be represented by R8 or R16 textures. 1366 // The chroma (U/V) planes must have matching dimensions. 1367 assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp()); 1368 assert((ytex.bpp() == 1 && colorDepth == 8) || 1369 (ytex.bpp() == 2 && colorDepth > 8)); 1370 // assert(ytex.width == utex.width && ytex.height == utex.height); 1371 assert(utex.width == vtex.width && utex.height == vtex.height); 1372 assert(ytex.offset == utex.offset && ytex.offset == vtex.offset); 1373 assert(dsttex.bpp() == 4); 1374 1375 IntRect srcReq = 1376 IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset; 1377 IntRect dstReq = 1378 IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; 1379 if (srcReq.is_empty() || dstReq.is_empty()) { 1380 return; 1381 } 1382 1383 // Compute clip rect as relative to the dstReq, as that's the same coords 1384 // as used for the sampling bounds. 
1385 IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, 1386 clipY - dstY + clipHeight}; 1387 // For now, always use a linear filter path that would be required for 1388 // scaling. Further fast-paths for non-scaled video might be desirable in the 1389 // future. 1390 linear_convert_yuv(ytex, utex, vtex, rgbFromYcbcr, colorDepth, srcReq, dsttex, 1391 dstReq, flipX, flipY, clipRect); 1392 } 1393 1394 } // extern "C"