commit 88ab8c9600f2ad4f1fa40fadffd83aa675f360d7 parent fda063630b13dfe14a41394a2eee3c22d66bfde8 Author: Serban Stanca <sstanca@mozilla.com> Date: Wed, 12 Nov 2025 19:10:49 +0200 Revert "Bug 1996818 - Store gpu addresses as a u32. r=gw" for causing mochitests-plain failures in test_ext_web_accessible_resources.html This reverts commit f04bdbe632dd6c596bce383ddc76e2b26ef0c3f9. This reverts commit c4c1d2a01faeae04eabd837aa393dc621c46304a. This reverts commit de3703a4ecf74306332c88cea762c265dda69b0e. This reverts commit 5e29586453931121d577a3c3fe7f80e8dbc5a304. This reverts commit 0f3a303c01170f22166784f30d2e1d135942296d. Diffstat:
72 files changed, 3116 insertions(+), 1058 deletions(-)
diff --git a/gfx/layers/ipc/CompositorBridgeParent.cpp b/gfx/layers/ipc/CompositorBridgeParent.cpp @@ -1860,6 +1860,11 @@ int32_t RecordContentFrameTime( .AccumulateSingleSample( static_cast<unsigned long long>(fracLatencyNorm)); + if (aStats) { + latencyMs -= (double(aStats->gpu_cache_upload_time) / 1000000.0); + latencyNorm = latencyMs / aVsyncRate.ToMilliseconds(); + fracLatencyNorm = lround(latencyNorm * 100.0); + } mozilla::glean::gfx_content_frame_time::without_resource_upload .AccumulateSingleSample( static_cast<unsigned long long>(fracLatencyNorm)); diff --git a/gfx/layers/ipc/PCompositorBridge.ipdl b/gfx/layers/ipc/PCompositorBridge.ipdl @@ -52,6 +52,7 @@ struct FrameStats { TimeStamp compositeEnd; int32_t contentFrameTime; double resourceUploadTime; + double gpuCacheUploadTime; TimeStamp transactionStart; TimeStamp refreshStart; TimeStamp fwdTime; diff --git a/gfx/layers/wr/WebRenderBridgeParent.cpp b/gfx/layers/wr/WebRenderBridgeParent.cpp @@ -2671,6 +2671,7 @@ void WebRenderBridgeParent::FlushTransactionIdsForEpoch( transactionId.mId, aCompositeStartTime, aRenderStartTime, aEndTime, contentFrameTime, aStats ? (double(aStats->resource_upload_time) / 1000000.0) : 0.0, + aStats ? (double(aStats->gpu_cache_upload_time) / 1000000.0) : 0.0, transactionId.mTxnStartTime, transactionId.mRefreshStartTime, transactionId.mFwdTime, transactionId.mSceneBuiltTime, transactionId.mSkippedComposites, transactionId.mTxnURL)); diff --git a/gfx/layers/wr/WebRenderMessageUtils.h b/gfx/layers/wr/WebRenderMessageUtils.h @@ -296,6 +296,8 @@ inline auto TiedFields<mozilla::wr::MemoryReport>( // clang-format off return std::tie( a.clip_stores, + a.gpu_cache_metadata, + a.gpu_cache_cpu_mirror, a.hit_testers, a.fonts, a.weak_fonts, @@ -308,6 +310,7 @@ inline auto TiedFields<mozilla::wr::MemoryReport>( a.swgl, a.frame_allocator, a.render_tasks, + a.gpu_cache_textures, a.vertex_data_textures, a.render_target_textures, a.picture_tile_textures, diff --git a/gfx/thebes/gfxPlatform.cpp b/gfx/thebes/gfxPlatform.cpp @@ -537,6 +537,7 @@ static void WebRenderDebugPrefChangeCallback(const char* aPrefName, void*) { GFX_WEBRENDER_DEBUG(".echo-driver-messages", wr::DebugFlags::ECHO_DRIVER_MESSAGES) GFX_WEBRENDER_DEBUG(".show-overdraw", wr::DebugFlags::SHOW_OVERDRAW) + GFX_WEBRENDER_DEBUG(".gpu-cache", wr::DebugFlags::GPU_CACHE_DBG) GFX_WEBRENDER_DEBUG(".texture-cache.clear-evicted", wr::DebugFlags::TEXTURE_CACHE_DBG_CLEAR_EVICTED) GFX_WEBRENDER_DEBUG(".picture-caching", wr::DebugFlags::PICTURE_CACHING_DBG) @@ -729,6 +730,8 @@ WebRenderMemoryReporter::CollectReports(nsIHandleReportCallback* aHandleReport, [=](wr::MemoryReport aReport) { // CPU Memory. helper.Report(aReport.clip_stores, "clip-stores"); + helper.Report(aReport.gpu_cache_metadata, "gpu-cache/metadata"); + helper.Report(aReport.gpu_cache_cpu_mirror, "gpu-cache/cpu-mirror"); helper.Report(aReport.hit_testers, "hit-testers"); helper.Report(aReport.fonts, "resource-cache/fonts"); helper.Report(aReport.weak_fonts, "resource-cache/weak-fonts"); @@ -748,6 +751,7 @@ WebRenderMemoryReporter::CollectReports(nsIHandleReportCallback* aHandleReport, WEBRENDER_FOR_EACH_INTERNER(REPORT_DATA_STORE, ); // GPU Memory. + helper.ReportTexture(aReport.gpu_cache_textures, "gpu-cache"); helper.ReportTexture(aReport.vertex_data_textures, "vertex-data"); helper.ReportTexture(aReport.render_target_textures, "render-targets"); helper.ReportTexture(aReport.depth_target_textures, "depth-targets"); @@ -3644,7 +3648,8 @@ void gfxPlatform::GetFrameStats(mozilla::widget::InfoObject& aObj) { "Frame %" PRIu64 "(%s) CONTENT_FRAME_TIME %d - Transaction start %f, main-thread time " "%f, full paint time %f, Skipped composites %u, Composite start %f, " - "Resource upload time %f, Render time %f, Composite time %f", + "Resource upload time %f, GPU cache upload time %f, Render time %f, " + "Composite time %f", f.id().mId, f.url().get(), f.contentFrameTime(), (f.transactionStart() - f.refreshStart()).ToMilliseconds(), (f.fwdTime() - f.transactionStart()).ToMilliseconds(), @@ -3653,7 +3658,7 @@ void gfxPlatform::GetFrameStats(mozilla::widget::InfoObject& aObj) { : 0.0, f.skippedComposites(), (f.compositeStart() - f.refreshStart()).ToMilliseconds(), - f.resourceUploadTime(), + f.resourceUploadTime(), f.gpuCacheUploadTime(), (f.compositeEnd() - f.renderStart()).ToMilliseconds(), (f.compositeEnd() - f.compositeStart()).ToMilliseconds()); aObj.DefineProperty(name.get(), value.get()); diff --git a/gfx/wr/webrender/res/blend.glsl b/gfx/wr/webrender/res/blend.glsl @@ -2,8 +2,6 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include gpu_buffer - #define COMPONENT_TRANSFER_IDENTITY 0 #define COMPONENT_TRANSFER_TABLE 1 #define COMPONENT_TRANSFER_DISCRETE 2 @@ -77,14 +75,14 @@ void SetupFilterParams( ); color_offset = vec4(0.0); } else if (op == FILTER_COLOR_MATRIX) { - vec4 mat_data[4] = fetch_from_gpu_buffer_4f(gpu_data_address); - vec4 offset_data = fetch_from_gpu_buffer_1f(gpu_data_address + 4); + vec4 mat_data[4] = fetch_from_gpu_cache_4(gpu_data_address); + vec4 offset_data = fetch_from_gpu_cache_1(gpu_data_address + 4); color_mat = mat4(mat_data[0], mat_data[1], mat_data[2], mat_data[3]); color_offset = offset_data; } else if (op == FILTER_COMPONENT_TRANSFER) { table_address = gpu_data_address; } else if (op == FILTER_FLOOD) { - color_offset = fetch_from_gpu_buffer_1f(gpu_data_address); + color_offset = fetch_from_gpu_cache_1(gpu_data_address); } } #endif @@ -156,7 +154,7 @@ vec4 ComponentTransfer(vec4 colora, vec4 vfuncs, highp int table_address) { case COMPONENT_TRANSFER_DISCRETE: { // fetch value from lookup table k = int(floor(colora[i]*255.0 + 0.5)); - texel = fetch_from_gpu_buffer_1f(table_address + offset + k/4); + texel = fetch_from_gpu_cache_1(table_address + offset + k/4); colora[i] = clamp(texel[k % 4], 0.0, 1.0); // offset plus 256/4 blocks offset = offset + 64; @@ -164,7 +162,7 @@ vec4 ComponentTransfer(vec4 colora, vec4 vfuncs, highp int table_address) { } case COMPONENT_TRANSFER_LINEAR: { // fetch the two values for use in the linear equation - texel = fetch_from_gpu_buffer_1f(table_address + offset); + texel = fetch_from_gpu_cache_1(table_address + offset); colora[i] = clamp(texel[0] * colora[i] + texel[1], 0.0, 1.0); // offset plus 1 block offset = offset + 1; @@ -172,7 +170,7 @@ vec4 ComponentTransfer(vec4 colora, vec4 vfuncs, highp int table_address) { } case COMPONENT_TRANSFER_GAMMA: { // fetch the three values for use in the gamma equation - texel = fetch_from_gpu_buffer_1f(table_address + offset); + texel = fetch_from_gpu_cache_1(table_address + offset); colora[i] = clamp(texel[0] * pow(colora[i], texel[1]) + texel[2], 0.0, 1.0); // offset plus 1 block offset = offset + 1; diff --git a/gfx/wr/webrender/res/brush.glsl b/gfx/wr/webrender/res/brush.glsl @@ -24,15 +24,15 @@ /// | z: flags | | | | local_clip_rect | +-----------------------+ | | /// | segment_index | | | +---------------------+ | | /// | w: resource_address +--+ | | | | -/// +----------------------------+ | | | (float gpu buffer) | | -/// | | | (float gpu buffer) +------------+ | | +/// +----------------------------+ | | | (sGpuCache) | | +/// | | | (sGpuCache) +------------+ | | /// | | | +---------------+ | Transform | <--------+ | -/// (float gpu buffer) | | +-> | Picture task | +------------+ | +/// (sGpuCache) | | +-> | Picture task | +------------+ | /// +-------------+ | | | | | /// | Resource | <---+ | | ... | | /// | | | +---------------+ +--------------------------------+ /// | | | | -/// +-------------+ | (float gpu buffer) v (float gpu buffer) +/// +-------------+ | (sGpuCache) v (sGpuCache) /// | +---------------+ +--------------+---------------+-+-+ /// +-----> | Clip area | | Brush data | Segment data | | | /// | | | | | | | @@ -113,7 +113,7 @@ void brush_shader_main_vs( VECS_PER_SPECIFIC_BRUSH + instance.segment_index * VECS_PER_SEGMENT; - vec4[2] segment_info = fetch_from_gpu_buffer_2f(segment_address); + vec4[2] segment_info = fetch_from_gpu_cache_2(segment_address); segment_rect = RectWithEndpoint(segment_info[0].xy, segment_info[0].zw); segment_rect.p0 += ph.local_rect.p0; segment_rect.p1 += ph.local_rect.p0; diff --git a/gfx/wr/webrender/res/brush_blend.glsl b/gfx/wr/webrender/res/brush_blend.glsl @@ -5,7 +5,7 @@ #define VECS_PER_SPECIFIC_BRUSH 3 #define WR_FEATURE_TEXTURE_2D -#include shared,prim_shared,brush,blend,image_source +#include shared,prim_shared,brush,blend // Interpolated UV coordinates to sample. varying highp vec2 v_uv; diff --git a/gfx/wr/webrender/res/brush_image.glsl b/gfx/wr/webrender/res/brush_image.glsl @@ -4,7 +4,7 @@ #define VECS_PER_SPECIFIC_BRUSH 3 -#include shared,prim_shared,brush,image_source +#include shared,prim_shared,brush // Interpolated UV coordinates to sample. varying highp vec2 v_uv; @@ -38,7 +38,7 @@ struct ImageBrushData { }; ImageBrushData fetch_image_data(int address) { - vec4[3] raw_data = fetch_from_gpu_buffer_3f(address); + vec4[3] raw_data = fetch_from_gpu_cache_3(address); ImageBrushData data = ImageBrushData( raw_data[0], raw_data[1], diff --git a/gfx/wr/webrender/res/brush_linear_gradient.glsl b/gfx/wr/webrender/res/brush_linear_gradient.glsl @@ -20,7 +20,7 @@ struct Gradient { }; Gradient fetch_gradient(int address) { - vec4 data[2] = fetch_from_gpu_buffer_2f(address); + vec4 data[2] = fetch_from_gpu_cache_2(address); return Gradient( data[0], int(data[1].x), diff --git a/gfx/wr/webrender/res/brush_mix_blend.glsl b/gfx/wr/webrender/res/brush_mix_blend.glsl @@ -5,7 +5,7 @@ #define VECS_PER_SPECIFIC_BRUSH 3 #define WR_FEATURE_TEXTURE_2D -#include shared,prim_shared,brush,image_source +#include shared,prim_shared,brush // UV and bounds for the source image varying highp vec2 v_src_uv; diff --git a/gfx/wr/webrender/res/brush_solid.glsl b/gfx/wr/webrender/res/brush_solid.glsl @@ -15,7 +15,7 @@ struct SolidBrush { }; SolidBrush fetch_solid_primitive(int address) { - vec4 data = fetch_from_gpu_buffer_1f(address); + vec4 data = fetch_from_gpu_cache_1(address); return SolidBrush(data); } diff --git a/gfx/wr/webrender/res/brush_yuv_image.glsl b/gfx/wr/webrender/res/brush_yuv_image.glsl @@ -4,7 +4,7 @@ #define VECS_PER_SPECIFIC_BRUSH 1 -#include shared,prim_shared,brush,yuv,image_source +#include shared,prim_shared,brush,yuv varying highp vec2 vUv_Y; flat varying highp vec4 vUvBounds_Y; @@ -28,7 +28,7 @@ flat varying mediump int vRescaleFactor; #ifdef WR_VERTEX_SHADER YuvPrimitive fetch_yuv_primitive(int address) { - vec4 data = fetch_from_gpu_buffer_1f(address); + vec4 data = fetch_from_gpu_cache_1(address); // From YuvImageData.write_prim_gpu_blocks: int channel_bit_depth = int(data.x); int color_space = int(data.y); diff --git a/gfx/wr/webrender/res/clip_shared.glsl b/gfx/wr/webrender/res/clip_shared.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include rect,render_task,transform +#include rect,render_task,gpu_cache,transform #ifdef WR_VERTEX_SHADER diff --git a/gfx/wr/webrender/res/cs_clip_box_shadow.glsl b/gfx/wr/webrender/res/cs_clip_box_shadow.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include shared,clip_shared,image_source +#include shared,clip_shared varying highp vec4 vLocalPos; varying highp vec2 vUv; @@ -17,7 +17,7 @@ flat varying mediump vec2 vClipMode; #ifdef WR_VERTEX_SHADER -PER_INSTANCE in int aClipDataResourceAddress; +PER_INSTANCE in ivec2 aClipDataResourceAddress; PER_INSTANCE in vec2 aClipSrcRectSize; PER_INSTANCE in int aClipMode; PER_INSTANCE in ivec2 aStretchMode; @@ -25,7 +25,7 @@ PER_INSTANCE in vec4 aClipDestRect; struct ClipMaskInstanceBoxShadow { ClipMaskInstanceCommon base; - int resource_address; + ivec2 resource_address; }; ClipMaskInstanceBoxShadow fetch_clip_item() { @@ -61,7 +61,7 @@ void main(void) { Transform clip_transform = fetch_transform(cmi.base.clip_transform_id); Transform prim_transform = fetch_transform(cmi.base.prim_transform_id); BoxShadowData bs_data = fetch_data(); - ImageSource res = fetch_image_source(cmi.resource_address); + ImageSource res = fetch_image_source_direct(cmi.resource_address); RectWithEndpoint dest_rect = bs_data.dest_rect; diff --git a/gfx/wr/webrender/res/cs_conic_gradient.glsl b/gfx/wr/webrender/res/cs_conic_gradient.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include shared,rect,render_task,gpu_buffer,gradient +#include shared,rect,render_task,gpu_cache,gpu_buffer,gradient #define PI 3.141592653589793 diff --git a/gfx/wr/webrender/res/cs_linear_gradient.glsl b/gfx/wr/webrender/res/cs_linear_gradient.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include shared,rect,render_task,gpu_buffer,gradient +#include shared,rect,render_task,gpu_cache,gpu_buffer,gradient varying highp vec2 v_pos; diff --git a/gfx/wr/webrender/res/cs_radial_gradient.glsl b/gfx/wr/webrender/res/cs_radial_gradient.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include shared,rect,render_task,gpu_buffer,gradient +#include shared,rect,render_task,gpu_cache,gpu_buffer,gradient varying highp vec2 v_pos; diff --git a/gfx/wr/webrender/res/cs_svg_filter.glsl b/gfx/wr/webrender/res/cs_svg_filter.glsl @@ -4,7 +4,7 @@ #define WR_FEATURE_TEXTURE_2D -#include shared,prim_shared,gpu_buffer +#include shared,prim_shared varying highp vec2 vInput1Uv; varying highp vec2 vInput2Uv; @@ -53,7 +53,7 @@ PER_INSTANCE in int aFilterInput2TaskAddress; PER_INSTANCE in int aFilterKind; PER_INSTANCE in int aFilterInputCount; PER_INSTANCE in int aFilterGenericInt; -PER_INSTANCE in int aFilterExtraDataAddress; +PER_INSTANCE in ivec2 aFilterExtraDataAddress; struct FilterTask { RectWithEndpoint task_rect; @@ -126,20 +126,18 @@ void main(void) { vData = ivec4(aFilterGenericInt, 0, 0, 0); break; case FILTER_FLOOD: - vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress); break; case FILTER_OPACITY: vFloat0.x = filter_task.user_data.x; break; - case FILTER_COLOR_MATRIX: { - ivec2 buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress); - vec4 mat_data[4] = fetch_from_gpu_buffer_4f_direct(buffer_uv); + case FILTER_COLOR_MATRIX: + vec4 mat_data[4] = fetch_from_gpu_cache_4_direct(aFilterExtraDataAddress); vColorMat = mat4(mat_data[0], mat_data[1], mat_data[2], mat_data[3]); - vFilterData0 = fetch_from_gpu_buffer_1f_direct(buffer_uv + ivec2(4, 0)); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress + ivec2(4, 0)); break; - } case FILTER_DROP_SHADOW: - vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress); break; case FILTER_OFFSET: vec2 texture_size = vec2(TEX_SIZE(sColor0).xy); @@ -150,15 +148,13 @@ void main(void) { clipRect /= texture_size.xyxy; vFilterData1 = clipRect; break; - case FILTER_COMPONENT_TRANSFER: { - ivec2 buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress); - vData = ivec4(buffer_uv, 0, 0); + case FILTER_COMPONENT_TRANSFER: + vData = ivec4(aFilterExtraDataAddress, 0, 0); break; - } case FILTER_COMPOSITE: vData = ivec4(aFilterGenericInt, 0, 0, 0); if (aFilterGenericInt == COMPOSITE_ARITHMETIC) { - vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress); } break; default: @@ -447,21 +443,21 @@ vec4 ComponentTransfer(vec4 colora) { case COMPONENT_TRANSFER_DISCRETE: // fetch value from lookup table k = int(floor(colora[i]*255.0 + 0.5)); - texel = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(offset + k/4, 0)); + texel = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(offset + k/4, 0)); colora[i] = clamp(texel[k % 4], 0.0, 1.0); // offset plus 256/4 blocks offset = offset + 64; break; case COMPONENT_TRANSFER_LINEAR: // fetch the two values for use in the linear equation - texel = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(offset, 0)); + texel = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(offset, 0)); colora[i] = clamp(texel[0] * colora[i] + texel[1], 0.0, 1.0); // offset plus 1 block offset = offset + 1; break; case COMPONENT_TRANSFER_GAMMA: // fetch the three values for use in the gamma equation - texel = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(offset, 0)); + texel = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(offset, 0)); colora[i] = clamp(texel[0] * pow(colora[i], texel[1]) + texel[2], 0.0, 1.0); // offset plus 1 block offset = offset + 1; diff --git a/gfx/wr/webrender/res/cs_svg_filter_node.glsl b/gfx/wr/webrender/res/cs_svg_filter_node.glsl @@ -38,7 +38,7 @@ Notes about specific filter kinds: #define WR_FEATURE_TEXTURE_2D -#include shared,prim_shared,gpu_buffer +#include shared,prim_shared varying highp vec2 vInput1Uv; varying highp vec2 vInput2Uv; @@ -172,7 +172,7 @@ PER_INSTANCE in int aFilterInput1TaskAddress; PER_INSTANCE in int aFilterInput2TaskAddress; PER_INSTANCE in int aFilterKind; PER_INSTANCE in int aFilterInputCount; -PER_INSTANCE in int aFilterExtraDataAddress; +PER_INSTANCE in ivec2 aFilterExtraDataAddress; // used for feFlood and feDropShadow colors // this is based on SrgbToLinear below, but that version hits SWGL compile @@ -270,23 +270,19 @@ void main(void) { case FILTER_BLEND_SOFT_LIGHT_CONVERTSRGB: break; case FILTER_COLOR_MATRIX: - case FILTER_COLOR_MATRIX_CONVERTSRGB: { - ivec2 gpu_buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress); - vec4 mat_data[4] = fetch_from_gpu_buffer_4f_direct(gpu_buffer_uv); + case FILTER_COLOR_MATRIX_CONVERTSRGB: + vec4 mat_data[4] = fetch_from_gpu_cache_4_direct(aFilterExtraDataAddress); vColorMat = mat4(mat_data[0], mat_data[1], mat_data[2], mat_data[3]); - vFilterData0 = fetch_from_gpu_buffer_1f_direct(gpu_buffer_uv + ivec2(4, 0)); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress + ivec2(4, 0)); break; - } case FILTER_COMPONENT_TRANSFER: - case FILTER_COMPONENT_TRANSFER_CONVERTSRGB: { - ivec2 gpu_buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress); - vData = ivec4(gpu_buffer_uv, 0, 0); + case FILTER_COMPONENT_TRANSFER_CONVERTSRGB: + vData = ivec4(aFilterExtraDataAddress, 0, 0); break; - } case FILTER_COMPOSITE_ARITHMETIC: case FILTER_COMPOSITE_ARITHMETIC_CONVERTSRGB: // arithmetic parameters - vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress); break; case FILTER_COMPOSITE_ATOP: case FILTER_COMPOSITE_ATOP_CONVERTSRGB: @@ -330,12 +326,12 @@ void main(void) { // TODO break; case FILTER_DROP_SHADOW: - vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress); // premultiply the color vFilterData0.rgb = vFilterData0.rgb * vFilterData0.a; break; case FILTER_DROP_SHADOW_CONVERTSRGB: - vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress); + vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress); // convert from sRGB to linearRGB and premultiply by alpha vFilterData0.rgb = vertexSrgbToLinear(vFilterData0.rgb); vFilterData0.rgb = vFilterData0.rgb * vFilterData0.a; @@ -605,7 +601,7 @@ void main(void) { vec4 result = vec4(1.0, 0.0, 0.0, 1.0); // This would produce more efficient code for swgl if we used a switch statement. - // However, the glsl-optimizer pass produces awful code for switch statements, + // However, the glsl-optimizer pass produces awful code for switch statements, // resulting in the optimized fragment shader taking half a minute to compile on // some Adreno devices. See bug 1929209. // We should fix the optimizer to produce more sensible output for switch @@ -686,10 +682,10 @@ void main(void) { result = floor(clamp(Ns * 255.0, vec4(0.0), vec4(255.0))); // SWGL doesn't have an intrinsic for ivec4(vec4) k = ivec4(int(result.r), int(result.g), int(result.b), int(result.a)); - result.r = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.r, 0)).r; - result.g = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.g, 0)).g; - result.b = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.b, 0)).b; - result.a = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.a, 0)).a; + result.r = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.r, 0)).r; + result.g = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.g, 0)).g; + result.b = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.b, 0)).b; + result.a = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.a, 0)).a; result.rgb = result.rgb * result.a; } else if (vFilterKind == FILTER_COMPOSITE_ARITHMETIC || vFilterKind == FILTER_COMPOSITE_ARITHMETIC_CONVERTSRGB) { result = Rs * Rb * vFilterData0.x + Rs * vFilterData0.y + Rb * vFilterData0.z + vec4(vFilterData0.w); diff --git a/gfx/wr/webrender/res/gpu_buffer.glsl b/gfx/wr/webrender/res/gpu_buffer.glsl @@ -10,14 +10,21 @@ ivec2 get_gpu_buffer_uv(HIGHP_FS_ADDRESS int address) { uint(address) / WR_MAX_VERTEX_TEXTURE_WIDTH); } -vec4[2] fetch_from_gpu_buffer_2f_direct(ivec2 uv) { +vec4 fetch_from_gpu_buffer_1f(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_buffer_uv(address); + return texelFetch(sGpuBufferF, uv, 0); +} + +vec4[2] fetch_from_gpu_buffer_2f(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_buffer_uv(address); return vec4[2]( TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)), TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)) ); } -vec4[3] fetch_from_gpu_buffer_3f_direct(ivec2 uv) { +vec4[3] fetch_from_gpu_buffer_3f(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_buffer_uv(address); return vec4[3]( TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)), TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)), @@ -25,7 +32,8 @@ vec4[3] fetch_from_gpu_buffer_3f_direct(ivec2 uv) { ); } -vec4[4] fetch_from_gpu_buffer_4f_direct(ivec2 uv) { +vec4[4] fetch_from_gpu_buffer_4f(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_buffer_uv(address); return vec4[4]( TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)), TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)), @@ -34,7 +42,8 @@ vec4[4] fetch_from_gpu_buffer_4f_direct(ivec2 uv) { ); } -vec4[5] fetch_from_gpu_buffer_5f_direct(ivec2 uv) { +vec4[5] fetch_from_gpu_buffer_5f(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_buffer_uv(address); return vec4[5]( TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)), TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)), @@ -44,35 +53,6 @@ vec4[5] fetch_from_gpu_buffer_5f_direct(ivec2 uv) { ); } -vec4 fetch_from_gpu_buffer_1f(HIGHP_FS_ADDRESS int address) { - ivec2 uv = get_gpu_buffer_uv(address); - return texelFetch(sGpuBufferF, uv, 0); -} - -vec4[2] fetch_from_gpu_buffer_2f(HIGHP_FS_ADDRESS int address) { - ivec2 uv = get_gpu_buffer_uv(address); - return fetch_from_gpu_buffer_2f_direct(uv); -} - -vec4[3] fetch_from_gpu_buffer_3f(HIGHP_FS_ADDRESS int address) { - ivec2 uv = get_gpu_buffer_uv(address); - return fetch_from_gpu_buffer_3f_direct(uv); -} - -vec4[4] fetch_from_gpu_buffer_4f(HIGHP_FS_ADDRESS int address) { - ivec2 uv = get_gpu_buffer_uv(address); - return fetch_from_gpu_buffer_4f_direct(uv); -} - -vec4[5] fetch_from_gpu_buffer_5f(HIGHP_FS_ADDRESS int address) { - ivec2 uv = get_gpu_buffer_uv(address); - return fetch_from_gpu_buffer_5f_direct(uv); -} - -vec4 fetch_from_gpu_buffer_1f_direct(ivec2 uv) { - return texelFetch(sGpuBufferF, uv, 0); -} - ivec4 fetch_from_gpu_buffer_1i(HIGHP_FS_ADDRESS int address) { ivec2 uv = get_gpu_buffer_uv(address); return texelFetch(sGpuBufferI, uv, 0); diff --git a/gfx/wr/webrender/res/gpu_cache.glsl b/gfx/wr/webrender/res/gpu_cache.glsl @@ -0,0 +1,137 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +uniform HIGHP_SAMPLER_FLOAT sampler2D sGpuCache; + +#define VECS_PER_IMAGE_RESOURCE 2 + +// TODO(gw): This is here temporarily while we have +// both GPU store and cache. When the GPU +// store code is removed, we can change the +// PrimitiveInstance instance structure to +// use 2x unsigned shorts as vertex attributes +// instead of an int, and encode the UV directly +// in the vertices. +ivec2 get_gpu_cache_uv(HIGHP_FS_ADDRESS int address) { + return ivec2(uint(address) % WR_MAX_VERTEX_TEXTURE_WIDTH, + uint(address) / WR_MAX_VERTEX_TEXTURE_WIDTH); +} + +vec4[2] fetch_from_gpu_cache_2_direct(ivec2 address) { + return vec4[2]( + TEXEL_FETCH(sGpuCache, address, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, address, 0, ivec2(1, 0)) + ); +} + +vec4[2] fetch_from_gpu_cache_2(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_cache_uv(address); + return vec4[2]( + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)) + ); +} + +vec4 fetch_from_gpu_cache_1_direct(ivec2 address) { + return texelFetch(sGpuCache, address, 0); +} + +vec4 fetch_from_gpu_cache_1(HIGHP_FS_ADDRESS int address) { + ivec2 uv = get_gpu_cache_uv(address); + return texelFetch(sGpuCache, uv, 0); +} + +#ifdef WR_VERTEX_SHADER + +vec4[8] fetch_from_gpu_cache_8(int address) { + ivec2 uv = get_gpu_cache_uv(address); + return vec4[8]( + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(2, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(3, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(4, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(5, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(6, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(7, 0)) + ); +} + +vec4[3] fetch_from_gpu_cache_3(int address) { + ivec2 uv = get_gpu_cache_uv(address); + return vec4[3]( + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(2, 0)) + ); +} + +vec4[3] fetch_from_gpu_cache_3_direct(ivec2 address) { + return vec4[3]( + TEXEL_FETCH(sGpuCache, address, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, address, 0, ivec2(1, 0)), + TEXEL_FETCH(sGpuCache, address, 0, ivec2(2, 0)) + ); +} + +vec4[4] fetch_from_gpu_cache_4_direct(ivec2 address) { + return vec4[4]( + TEXEL_FETCH(sGpuCache, address, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, address, 0, ivec2(1, 0)), + TEXEL_FETCH(sGpuCache, address, 0, ivec2(2, 0)), + TEXEL_FETCH(sGpuCache, address, 0, ivec2(3, 0)) + ); +} + +vec4[4] fetch_from_gpu_cache_4(int address) { + ivec2 uv = get_gpu_cache_uv(address); + return vec4[4]( + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(2, 0)), + TEXEL_FETCH(sGpuCache, uv, 0, ivec2(3, 0)) + ); +} + +//TODO: image resource is too specific for this module + +struct ImageSource { + RectWithEndpoint uv_rect; + vec4 user_data; +}; + +ImageSource fetch_image_source(int address) { + //Note: number of blocks has to match `renderer::BLOCKS_PER_UV_RECT` + vec4 data[2] = fetch_from_gpu_cache_2(address); + RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw); + return ImageSource(uv_rect, data[1]); +} + +ImageSource fetch_image_source_direct(ivec2 address) { + vec4 data[2] = fetch_from_gpu_cache_2_direct(address); + RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw); + return ImageSource(uv_rect, data[1]); +} + +// Fetch optional extra data for a texture cache resource. This can contain +// a polygon defining a UV rect within the texture cache resource. +// Note: the polygon coordinates are in homogeneous space. +struct ImageSourceExtra { + vec4 st_tl; + vec4 st_tr; + vec4 st_bl; + vec4 st_br; +}; + +ImageSourceExtra fetch_image_source_extra(int address) { + vec4 data[4] = fetch_from_gpu_cache_4(address + VECS_PER_IMAGE_RESOURCE); + return ImageSourceExtra( + data[0], + data[1], + data[2], + data[3] + ); +} + +#endif //WR_VERTEX_SHADER diff --git a/gfx/wr/webrender/res/gradient.glsl b/gfx/wr/webrender/res/gradient.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include dithering,gpu_buffer +#include dithering // Gradient GPU cache address. // Packed in to a vector to work around bug 1630356. diff --git a/gfx/wr/webrender/res/image_source.glsl b/gfx/wr/webrender/res/image_source.glsl @@ -1,51 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -#include gpu_buffer - -#define VECS_PER_IMAGE_RESOURCE 2 - -#ifdef WR_VERTEX_SHADER - -#include rect - -struct ImageSource { - RectWithEndpoint uv_rect; - vec4 user_data; -}; - -ImageSource fetch_image_source(int address) { - //Note: number of blocks has to match `renderer::BLOCKS_PER_UV_RECT` - vec4 data[2] = fetch_from_gpu_buffer_2f(address); - RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw); - return ImageSource(uv_rect, data[1]); -} - -ImageSource fetch_image_source_direct(ivec2 address) { - vec4 data[2] = fetch_from_gpu_buffer_2f_direct(address); - RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw); - return ImageSource(uv_rect, data[1]); -} - -// Fetch optional extra data for a texture cache resource. This can contain -// a polygon defining a UV rect within the texture cache resource. -// Note: the polygon coordinates are in homogeneous space. -struct ImageSourceExtra { - vec4 st_tl; - vec4 st_tr; - vec4 st_bl; - vec4 st_br; -}; - -ImageSourceExtra fetch_image_source_extra(int address) { - vec4 data[4] = fetch_from_gpu_buffer_4f(address + VECS_PER_IMAGE_RESOURCE); - return ImageSourceExtra( - data[0], - data[1], - data[2], - data[3] - ); -} - -#endif // WR_VERTEX_SHADER diff --git a/gfx/wr/webrender/res/prim_shared.glsl b/gfx/wr/webrender/res/prim_shared.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include rect,render_task,transform,image_source +#include rect,render_task,gpu_cache,transform #define EXTEND_MODE_CLAMP 0 #define EXTEND_MODE_REPEAT 1 diff --git a/gfx/wr/webrender/res/ps_quad.glsl b/gfx/wr/webrender/res/ps_quad.glsl @@ -10,14 +10,14 @@ /// ///```ascii /// (int gpu buffer) -/// +---------------+ (float gpu buffer) +/// +---------------+ (sGpuCache) /// (instance-step vertex attr) | Int header | +-----------+ /// +-----------------------------+ | | | Transform | /// | Quad instance (uvec4) | +--> | transform id +--> +-----------+ /// | | | | z id | /// | x: int prim address +---+ +---------------+ (float gpu buffer) /// | y: float prim address +--------------------------> +-----------+--------------+-+-+ -/// | z: quad flags | (float gpu buffer) | Quad Prim | Quad Segment | | | +/// | z: quad flags | (sGpuCache) | Quad Prim | Quad Segment | | | /// | edge flags | +--------------------+ | | | | | /// | part index | | Picture task | | bounds | rect | | | /// | segment index | | | | clip | uv rect | | | diff --git a/gfx/wr/webrender/res/ps_split_composite.glsl b/gfx/wr/webrender/res/ps_split_composite.glsl @@ -4,7 +4,7 @@ #define WR_FEATURE_TEXTURE_2D -#include shared,prim_shared,image_source +#include shared,prim_shared // interpolated UV coordinates to sample. varying highp vec2 vUv; @@ -21,14 +21,17 @@ struct SplitGeometry { }; SplitGeometry fetch_split_geometry(int address) { - vec4[2] data = fetch_from_gpu_buffer_2f(address); + ivec2 uv = get_gpu_cache_uv(address); + + vec4 data0 = TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)); + vec4 data1 = TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)); SplitGeometry geo; geo.local = vec2[4]( - data[0].xy, - data[0].zw, - data[1].xy, - data[1].zw + data0.xy, + data0.zw, + data1.xy, + data1.zw ); return geo; diff --git a/gfx/wr/webrender/res/ps_text_run.glsl b/gfx/wr/webrender/res/ps_text_run.glsl @@ -2,7 +2,7 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -#include shared,prim_shared,gpu_buffer +#include shared,prim_shared flat varying mediump vec4 v_color; flat varying mediump vec3 v_mask_swizzle; @@ -45,7 +45,7 @@ Glyph fetch_glyph(int specific_prim_address, int glyph_address = specific_prim_address + VECS_PER_TEXT_RUN + int(uint(glyph_index) / GLYPHS_PER_GPU_BLOCK); - vec4 data = fetch_from_gpu_buffer_1f(glyph_address); + vec4 data = fetch_from_gpu_cache_1(glyph_address); // Select XY or ZW based on glyph index. vec2 glyph = mix(data.xy, data.zw, bvec2(uint(glyph_index) % GLYPHS_PER_GPU_BLOCK == 1U)); @@ -60,7 +60,7 @@ struct GlyphResource { }; GlyphResource fetch_glyph_resource(int address) { - vec4 data[2] = fetch_from_gpu_buffer_2f(address); + vec4 data[2] = fetch_from_gpu_cache_2(address); return GlyphResource(data[0], data[1].xy, data[1].z); } @@ -69,7 +69,7 @@ struct TextRun { }; TextRun fetch_text_run(int address) { - vec4 data = fetch_from_gpu_buffer_1f(address); + vec4 data = fetch_from_gpu_cache_1(address); return TextRun(data); } diff --git a/gfx/wr/webrender/src/batch.rs b/gfx/wr/webrender/src/batch.rs @@ -11,7 +11,8 @@ use crate::composite::CompositorSurfaceKind; use crate::pattern::PatternKind; use crate::spatial_tree::{SpatialTree, SpatialNodeIndex, CoordinateSystemId}; use glyph_rasterizer::{GlyphFormat, SubpixelDirection}; -use crate::gpu_types::{BrushFlags, BrushInstance, ImageSource, PrimitiveHeaders, UvRectKind, ZBufferId, ZBufferIdGenerator}; +use crate::gpu_cache::{GpuBlockData, GpuCache, GpuCacheAddress}; +use crate::gpu_types::{BrushFlags, BrushInstance, PrimitiveHeaders, ZBufferId, ZBufferIdGenerator}; use crate::gpu_types::SplitCompositeInstance; use crate::gpu_types::{PrimitiveInstanceData, RasterizationSpace, GlyphInstance}; use crate::gpu_types::{PrimitiveHeader, PrimitiveHeaderIndex, TransformPaletteId, TransformPalette}; @@ -27,7 +28,7 @@ use crate::quad; use crate::render_target::RenderTargetContext; use crate::render_task_graph::{RenderTaskId, RenderTaskGraph}; use crate::render_task::{RenderTaskAddress, RenderTaskKind, SubPass}; -use crate::renderer::{BlendMode, GpuBufferAddress, GpuBufferBlockF, GpuBufferBuilder, ShaderColorMode}; +use crate::renderer::{BlendMode, GpuBufferBuilder, ShaderColorMode}; use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH; use crate::resource_cache::{GlyphFetchResult, ImageProperties}; use crate::space::SpaceMapper; @@ -37,7 +38,6 @@ use std::{f32, i32, usize}; use crate::util::{project_rect, MaxRect, TransformedRectKind, ScaleOffset}; use crate::segment::EdgeAaSegmentMask; - // Special sentinel value recognized by the shader. It is considered to be // a dummy task that doesn't mask out anything. const OPAQUE_TASK_ADDRESS: RenderTaskAddress = RenderTaskAddress(0x7fffffff); @@ -820,6 +820,7 @@ impl BatchBuilder { cmd: &PrimitiveCommand, prim_spatial_node_index: SpatialNodeIndex, ctx: &RenderTargetContext, + gpu_cache: &mut GpuCache, render_tasks: &RenderTaskGraph, prim_headers: &mut PrimitiveHeaders, transforms: &mut TransformPalette, @@ -991,7 +992,7 @@ impl BatchBuilder { } let blend_mode = BlendMode::PremultipliedAlpha; - let prim_cache_address = ctx.globals.default_image_data; + let prim_cache_address = gpu_cache.get_address(&ctx.globals.default_image_handle); match picture.raster_config { Some(ref raster_config) => { @@ -1039,7 +1040,7 @@ impl BatchBuilder { let picture_prim_header = PrimitiveHeader { local_rect: prim_rect, local_clip_rect, - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address, transform_id, z: z_id, render_task_address: self.batcher.render_task_address, @@ -1084,7 +1085,7 @@ impl BatchBuilder { let (uv_rect_address, texture) = render_tasks.resolve_location( pic_task_id, - + gpu_cache, ).unwrap(); // The set of input textures that most composite modes use, @@ -1146,7 +1147,7 @@ impl BatchBuilder { let shadow_textures = textures; let content_uv_rect_address = render_tasks[secondary_id] - .get_texture_address() + .get_texture_address(gpu_cache) .as_int(); // Build BatchTextures for shadow/content @@ -1159,12 +1160,15 @@ impl BatchBuilder { let shadow_key = BatchKey::new(kind, blend_mode, shadow_textures); let content_key = BatchKey::new(kind, blend_mode, content_textures); - for (shadow, shadow_prim_address) in shadows.iter().zip(picture.extra_gpu_data.iter()) { + for (shadow, shadow_gpu_data) in shadows.iter().zip(picture.extra_gpu_data_handles.iter()) { + // Get the GPU cache address of the extra data handle. + let shadow_prim_address = gpu_cache.get_address(shadow_gpu_data); + let shadow_rect = picture_prim_header.local_rect.translate(shadow.offset); let shadow_prim_header = PrimitiveHeader { local_rect: shadow_rect, - specific_prim_address: shadow_prim_address.as_int(), + specific_prim_address: shadow_prim_address, z: z_id, user_data: ImageBrushData { color_mode: ShaderColorMode::Alpha, @@ -1239,10 +1243,10 @@ impl BatchBuilder { (0.01745329251 * angle * 65536.0) as i32 } Filter::ColorMatrix(_) => { - picture.extra_gpu_data[0].as_int() + picture.extra_gpu_data_handles[0].as_int(gpu_cache) } Filter::Flood(_) => { - picture.extra_gpu_data[0].as_int() + picture.extra_gpu_data_handles[0].as_int(gpu_cache) } // These filters are handled via different paths. @@ -1293,7 +1297,7 @@ impl BatchBuilder { filter_data.data.b_func.to_int() << 20 | filter_data.data.a_func.to_int() << 16) as i32); - let user_data = filter_data.gpu_buffer_address.as_int(); + let user_data = filter_data.gpu_cache_handle.as_int(gpu_cache); let key = BatchKey::new( BatchKind::Brush(BrushBatchKind::Blend), @@ -1380,8 +1384,8 @@ impl BatchBuilder { clip_mask: clip_mask_texture_id, }, ); - let src_uv_address = render_tasks[pic_task_id].get_texture_address(); - let readback_uv_address = render_tasks[backdrop_id].get_texture_address(); + let src_uv_address = render_tasks[pic_task_id].get_texture_address(gpu_cache); + let readback_uv_address = render_tasks[backdrop_id].get_texture_address(gpu_cache); let prim_header = PrimitiveHeader { user_data: [ mode as u32 as i32, @@ -1489,7 +1493,7 @@ impl BatchBuilder { ); let prim_header = PrimitiveHeader { - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address, user_data: batch_params.prim_user_data, ..picture_prim_header }; @@ -1596,7 +1600,7 @@ impl BatchBuilder { transform_id, z: z_id, render_task_address: self.batcher.render_task_address, - specific_prim_address: GpuBufferAddress::INVALID.as_int(), // Will be overridden by most uses + specific_prim_address: GpuCacheAddress::INVALID, // Will be overridden by most uses user_data: [0; 4], // Will be overridden by most uses }; @@ -1620,11 +1624,11 @@ impl BatchBuilder { }; let (prim_cache_address, segments) = if segment_instance_index == SegmentInstanceIndex::UNUSED { - (common_data.gpu_buffer_address, None) + (gpu_cache.try_get_address(&common_data.gpu_cache_handle), None) } else { let segment_instance = &ctx.scratch.segment_instances[segment_instance_index]; let segments = Some(&ctx.scratch.segments[segment_instance.segments_range]); - (segment_instance.gpu_data, segments) + (Some(gpu_cache.get_address(&segment_instance.gpu_cache_handle)), segments) }; // The following primitives lower to the image brush shader in the same way. @@ -1649,7 +1653,7 @@ impl BatchBuilder { }; if let Some((src_color, visible_tiles_range, brush_segments)) = img_brush_data { - let src_color = render_tasks.resolve_location(src_color); + let src_color = render_tasks.resolve_location(src_color, gpu_cache); let (uv_rect_address, texture_source) = match src_color { Some(src) => src, @@ -1668,7 +1672,7 @@ impl BatchBuilder { }.encode(); let prim_header = PrimitiveHeader { - specific_prim_address: common_data.gpu_buffer_address.as_int(), + specific_prim_address: gpu_cache.get_address(&common_data.gpu_cache_handle), user_data: prim_user_data, ..base_prim_header }; @@ -1771,7 +1775,7 @@ impl BatchBuilder { // use of interning. let prim_header = PrimitiveHeader { - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address.unwrap(), user_data: [get_shader_opacity(1.0), 0, 0, 0], ..base_prim_header }; @@ -1805,7 +1809,7 @@ impl BatchBuilder { // task for each valid edge / corner of the border. for task_id in task_ids { - if let Some((uv_rect_address, texture)) = render_tasks.resolve_location(*task_id) { + if let Some((uv_rect_address, texture)) = render_tasks.resolve_location(*task_id, gpu_cache) { segment_data.push( SegmentInstanceData { textures: TextureSet::prim_textured(texture), @@ -1830,7 +1834,7 @@ impl BatchBuilder { ); let prim_header = PrimitiveHeader { - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address.unwrap(), user_data: batch_params.prim_user_data, ..base_prim_header }; @@ -1878,7 +1882,7 @@ impl BatchBuilder { min: prim_rect.min - run.reference_frame_relative_offset, max: run.snapped_reference_frame_relative_offset.to_point(), }, - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address.unwrap(), user_data: [ (run.raster_scale * 65535.0).round() as i32, 0, @@ -1906,6 +1910,7 @@ impl BatchBuilder { font, &glyph_keys, &mut self.glyph_fetch_buffer, + gpu_cache, |texture_id, glyph_format, glyphs| { debug_assert_ne!(texture_id, TextureSource::Invalid); @@ -2083,7 +2088,7 @@ impl BatchBuilder { let (batch_kind, textures, prim_user_data, specific_resource_address) = match render_task { Some(task_id) => { - let (uv_rect_address, texture) = render_tasks.resolve_location(*task_id).unwrap(); + let (uv_rect_address, texture) = render_tasks.resolve_location(*task_id, gpu_cache).unwrap(); let textures = BatchTextures::prim_textured( texture, clip_mask_texture_id, @@ -2111,7 +2116,7 @@ impl BatchBuilder { }; let prim_header = PrimitiveHeader { - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address.unwrap(), user_data: prim_user_data, ..base_prim_header }; @@ -2146,7 +2151,7 @@ impl BatchBuilder { ); let prim_header = PrimitiveHeader { - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address.unwrap(), user_data: batch_params.prim_user_data, ..base_prim_header }; @@ -2179,6 +2184,7 @@ impl BatchBuilder { z_id, bounding_rect, ctx, + gpu_cache, render_tasks, prim_headers, ); @@ -2195,7 +2201,7 @@ impl BatchBuilder { debug_assert!(channel_count <= 3); for channel in 0 .. channel_count { - let src_channel = render_tasks.resolve_location(yuv_image_data.src_yuv[channel]); + let src_channel = render_tasks.resolve_location(yuv_image_data.src_yuv[channel], gpu_cache); let (uv_rect_address, texture_source) = match src_channel { Some(src) => src, @@ -2240,7 +2246,7 @@ impl BatchBuilder { debug_assert_ne!(segment_instance_index, SegmentInstanceIndex::INVALID); let prim_header = PrimitiveHeader { - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address.unwrap(), user_data: batch_params.prim_user_data, ..base_prim_header }; @@ -2273,6 +2279,7 @@ impl BatchBuilder { z_id, bounding_rect, ctx, + gpu_cache, render_tasks, prim_headers, ); @@ -2306,7 +2313,7 @@ impl BatchBuilder { } } - let src_color = render_tasks.resolve_location(image_instance.src_color); + let src_color = render_tasks.resolve_location(image_instance.src_color, gpu_cache); let (uv_rect_address, texture_source) = match src_color { Some(src) => src, @@ -2324,11 +2331,11 @@ impl BatchBuilder { debug_assert_ne!(image_instance.segment_instance_index, SegmentInstanceIndex::INVALID); let (prim_cache_address, segments) = if image_instance.segment_instance_index == SegmentInstanceIndex::UNUSED { - (prim_cache_address, None) + (prim_cache_address.unwrap(), None) } else { let segment_instance = &ctx.scratch.segment_instances[image_instance.segment_instance_index]; let segments = Some(&ctx.scratch.segments[segment_instance.segments_range]); - (segment_instance.gpu_data, segments) + (gpu_cache.get_address(&segment_instance.gpu_cache_handle), segments) }; let local_rect = image_instance.adjustment.map_local_rect(&prim_rect); @@ -2338,7 +2345,7 @@ impl BatchBuilder { let prim_header = PrimitiveHeader { local_rect, local_clip_rect, - specific_prim_address: prim_cache_address.as_int(), + specific_prim_address: prim_cache_address, user_data: batch_params.prim_user_data, ..base_prim_header }; @@ -2376,7 +2383,7 @@ impl BatchBuilder { ).unwrap(); // use temporary block storage since we don't know the number of visible tiles beforehand - let mut gpu_blocks = Vec::<GpuBufferBlockF>::with_capacity(3 + max_tiles_per_header * 2); + let mut gpu_blocks = Vec::<GpuBlockData>::with_capacity(3 + max_tiles_per_header * 2); for chunk in image_instance.visible_tiles.chunks(max_tiles_per_header) { gpu_blocks.clear(); gpu_blocks.push(image_data.color.premultiplied().into()); //color @@ -2386,25 +2393,20 @@ impl BatchBuilder { for tile in chunk { let tile_rect = tile.local_rect.translate(-prim_rect.min.to_vector()); gpu_blocks.push(tile_rect.into()); - gpu_blocks.push([0.0; 4].into()); + gpu_blocks.push(GpuBlockData::EMPTY); } - let mut writer = gpu_buffer_builder.f32.write_blocks(gpu_blocks.len()); - for block in &gpu_blocks { - writer.push_one(*block); - } - let specific_prim_address = writer.finish(); - + let gpu_handle = gpu_cache.push_per_frame_blocks(&gpu_blocks); let prim_header = PrimitiveHeader { local_clip_rect: image_instance.tight_local_clip_rect, - specific_prim_address: specific_prim_address.as_int(), + specific_prim_address: gpu_cache.get_address(&gpu_handle), user_data: prim_user_data, ..base_prim_header }; let prim_header_index = prim_headers.push(&prim_header); for (i, tile) in chunk.iter().enumerate() { - let (uv_rect_address, texture) = match render_tasks.resolve_location(tile.src_color) { + let (uv_rect_address, texture) = match render_tasks.resolve_location(tile.src_color, gpu_cache) { Some(result) => result, None => { return; @@ -2453,7 +2455,7 @@ impl BatchBuilder { let prim_header = PrimitiveHeader { user_data: user_data, - specific_prim_address: prim_data.gpu_buffer_address.as_int(), + specific_prim_address: gpu_cache.get_address(&prim_data.gpu_cache_handle), ..base_prim_header }; let prim_header_index = prim_headers.push(&prim_header); @@ -2495,7 +2497,7 @@ impl BatchBuilder { for tile in visible_tiles { let tile_prim_header = PrimitiveHeader { - specific_prim_address: tile.address.as_int(), + specific_prim_address: gpu_cache.get_address(&tile.handle), local_rect: tile.local_rect, local_clip_rect: tile.local_clip_rect, user_data: user_data, @@ -2531,7 +2533,10 @@ impl BatchBuilder { let kind = BatchKind::Brush( BrushBatchKind::Image(ImageBufferKind::Texture2D) ); - let (_, texture) = render_tasks.resolve_location(pic_task_id).unwrap(); + let (_, texture) = render_tasks.resolve_location( + pic_task_id, + gpu_cache, + ).unwrap(); let textures = BatchTextures::prim_textured( texture, clip_mask_texture_id, @@ -2543,7 +2548,7 @@ impl BatchBuilder { ); let prim_header = PrimitiveHeader { - specific_prim_address: ctx.globals.default_image_data.as_int(), + specific_prim_address: gpu_cache.get_address(&ctx.globals.default_image_handle), user_data: ImageBrushData { color_mode: ShaderColorMode::Image, alpha_type: AlphaType::PremultipliedAlpha, @@ -2591,19 +2596,23 @@ impl BatchBuilder { calculate_screen_uv(points[3].unwrap() * pic_info.device_pixel_scale, backdrop_rect), ]; - let source = ImageSource { - p0: target_rect.min.to_f32(), - p1: target_rect.max.to_f32(), - user_data: [0.0; 4], - uv_rect_kind: UvRectKind::Quad { - top_left: uvs[0], - top_right: uvs[1], - bottom_left: uvs[2], - bottom_right: uvs[3], - }, - }; - - let uv_rect_address = source.write_gpu_blocks(&mut gpu_buffer_builder.f32); + // TODO (gw): This is a hack that provides the GPU cache blocks for an + // ImageSource. We should update the GPU cache interfaces to + // allow pushing per-frame blocks via a request interface. + let gpu_blocks = &[ + GpuBlockData::from([ + target_rect.min.x as f32, + target_rect.min.y as f32, + target_rect.max.x as f32, + target_rect.max.y as f32, + ]), + GpuBlockData::from([0.0; 4]), + GpuBlockData::from(uvs[0]), + GpuBlockData::from(uvs[1]), + GpuBlockData::from(uvs[2]), + GpuBlockData::from(uvs[3]), + ]; + let uv_rect_handle = gpu_cache.push_per_frame_blocks(gpu_blocks); self.add_brush_instance_to_batches( key, @@ -2615,7 +2624,7 @@ impl BatchBuilder { clip_task_address, brush_flags, prim_header_index, - uv_rect_address.as_int(), + uv_rect_handle.as_int(gpu_cache), ); } } @@ -2632,9 +2641,12 @@ impl BatchBuilder { z_id: ZBufferId, bounding_rect: &PictureRect, ctx: &RenderTargetContext, + gpu_cache: &mut GpuCache, render_tasks: &RenderTaskGraph, prim_headers: &mut PrimitiveHeaders, ) { + let prim_cache_address = gpu_cache.get_address(&ctx.globals.default_black_rect_handle); + let (clip_task_address, clip_mask_texture_id) = ctx.get_prim_clip_task_and_texture( clip_task_index, render_tasks, @@ -2643,7 +2655,7 @@ impl BatchBuilder { let prim_header = PrimitiveHeader { local_rect: prim_rect, local_clip_rect, - specific_prim_address: ctx.globals.default_black_rect_address.as_int(), + specific_prim_address: prim_cache_address, transform_id, z: z_id, render_task_address: self.batcher.render_task_address, @@ -3145,6 +3157,7 @@ impl ClipBatcher { clip_node_range: ClipNodeRange, root_spatial_node_index: SpatialNodeIndex, render_tasks: &RenderTaskGraph, + gpu_cache: &GpuCache, clip_store: &ClipStore, transforms: &mut TransformPalette, actual_rect: DeviceRect, @@ -3189,7 +3202,7 @@ impl ClipBatcher { let task_id = source .render_task .expect("bug: render task handle not allocated"); - let (uv_rect_address, texture) = render_tasks.resolve_location(task_id).unwrap(); + let (uv_rect_address, texture) = render_tasks.resolve_location(task_id, gpu_cache).unwrap(); self.get_batch_list(is_first_clip) .box_shadows @@ -3197,7 +3210,7 @@ impl ClipBatcher { .or_insert_with(|| ctx.frame_memory.new_vec()) .push(ClipMaskInstanceBoxShadow { common, - resource_address: uv_rect_address.as_int(), + resource_address: uv_rect_address, shadow_data: BoxShadowData { src_rect_size: source.original_alloc_size, clip_mode: source.clip_mode as i32, diff --git a/gfx/wr/webrender/src/clip.rs b/gfx/wr/webrender/src/clip.rs @@ -98,9 +98,9 @@ use api::units::*; use crate::image_tiling::{self, Repetition}; use crate::border::{ensure_no_corner_overlap, BorderRadiusAu}; use crate::box_shadow::{BLUR_SAMPLE_SCALE, BoxShadowClipSource, BoxShadowCacheKey}; -use crate::renderer::GpuBufferBuilderF; use crate::spatial_tree::{SpatialTree, SpatialNodeIndex}; use crate::ellipse::Ellipse; +use crate::gpu_cache::GpuCache; use crate::gpu_types::{BoxShadowStretchMode}; use crate::intern; use crate::internal_types::{FastHashMap, FastHashSet, LayoutPrimitiveInfo}; @@ -1092,7 +1092,7 @@ impl ClipNodeInfo { &self, node: &ClipNode, clipped_rect: &LayoutRect, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, resource_cache: &mut ResourceCache, mask_tiles: &mut Vec<VisibleMaskImageTile>, spatial_tree: &SpatialTree, @@ -1158,7 +1158,7 @@ impl ClipNodeInfo { if request_resources { resource_cache.request_image( req, - gpu_buffer, + gpu_cache, ); } @@ -1176,7 +1176,7 @@ impl ClipNodeInfo { visible_tiles = Some(tile_range_start..mask_tiles.len()); } else { if request_resources { - resource_cache.request_image(request, gpu_buffer); + resource_cache.request_image(request, gpu_cache); } let tile_range_start = mask_tiles.len(); @@ -1499,7 +1499,7 @@ impl ClipStore { prim_to_pic_mapper: &SpaceMapper<LayoutPixel, PicturePixel>, pic_to_vis_mapper: &SpaceMapper<PicturePixel, VisPixel>, spatial_tree: &SpatialTree, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, resource_cache: &mut ResourceCache, device_pixel_scale: DevicePixelScale, culling_rect: &VisRect, @@ -1567,7 +1567,7 @@ impl ClipStore { if let Some(instance) = node_info.create_instance( node, &local_bounding_rect, - gpu_buffer, + gpu_cache, resource_cache, &mut self.mask_tiles, spatial_tree, diff --git a/gfx/wr/webrender/src/command_buffer.rs b/gfx/wr/webrender/src/command_buffer.rs @@ -4,7 +4,7 @@ use api::units::PictureRect; use crate::pattern::{PatternKind, PatternShaderInput}; -use crate::{spatial_tree::SpatialNodeIndex, render_task_graph::RenderTaskId, surface::SurfaceTileDescriptor, picture::TileKey, renderer::GpuBufferAddress, FastHashMap, prim_store::PrimitiveInstanceIndex}; +use crate::{spatial_tree::SpatialNodeIndex, render_task_graph::RenderTaskId, surface::SurfaceTileDescriptor, picture::TileKey, renderer::GpuBufferAddress, FastHashMap, prim_store::PrimitiveInstanceIndex, gpu_cache::GpuCacheAddress}; use crate::gpu_types::{QuadSegment, TransformPaletteId}; use crate::segment::EdgeAaSegmentMask; @@ -112,7 +112,7 @@ pub enum PrimitiveCommand { }, Complex { prim_instance_index: PrimitiveInstanceIndex, - gpu_address: GpuBufferAddress, + gpu_address: GpuCacheAddress, }, Instance { prim_instance_index: PrimitiveInstanceIndex, @@ -142,7 +142,7 @@ impl PrimitiveCommand { pub fn complex( prim_instance_index: PrimitiveInstanceIndex, - gpu_address: GpuBufferAddress, + gpu_address: GpuCacheAddress, ) -> Self { PrimitiveCommand::Complex { prim_instance_index, @@ -239,11 +239,11 @@ impl CommandBuffer { } PrimitiveCommand::Complex { prim_instance_index, gpu_address } => { self.commands.push(Command::draw_complex_prim(prim_instance_index)); - self.commands.push(Command::data(gpu_address.as_u32())); + self.commands.push(Command::data((gpu_address.u as u32) << 16 | gpu_address.v as u32)); } PrimitiveCommand::Instance { prim_instance_index, gpu_buffer_address } => { self.commands.push(Command::draw_instance(prim_instance_index)); - self.commands.push(Command::data(gpu_buffer_address.as_u32())); + self.commands.push(Command::data((gpu_buffer_address.u as u32) << 16 | gpu_buffer_address.v as u32)); } PrimitiveCommand::Quad { pattern, pattern_input, prim_instance_index, gpu_buffer_address, transform_id, quad_flags, edge_flags, src_color_task_id } => { self.commands.push(Command::draw_quad(prim_instance_index)); @@ -251,7 +251,7 @@ impl CommandBuffer { self.commands.push(Command::data(pattern_input.0 as u32)); self.commands.push(Command::data(pattern_input.1 as u32)); self.commands.push(Command::data(src_color_task_id.index)); - self.commands.push(Command::data(gpu_buffer_address.as_u32())); + self.commands.push(Command::data((gpu_buffer_address.u as u32) << 16 | gpu_buffer_address.v as u32)); self.commands.push(Command::data(transform_id.0)); self.commands.push(Command::data((quad_flags.bits() as u32) << 16 | edge_flags.bits() as u32)); } @@ -284,7 +284,10 @@ impl CommandBuffer { Command::CMD_DRAW_COMPLEX_PRIM => { let prim_instance_index = PrimitiveInstanceIndex(param); let data = cmd_iter.next().unwrap(); - let gpu_address = GpuBufferAddress::from_u32(data.0); + let gpu_address = GpuCacheAddress { + u: (data.0 >> 16) as u16, + v: (data.0 & 0xffff) as u16, + }; let cmd = PrimitiveCommand::complex( prim_instance_index, gpu_address, @@ -304,7 +307,10 @@ impl CommandBuffer { let bits = cmd_iter.next().unwrap().0; let quad_flags = QuadFlags::from_bits((bits >> 16) as u8).unwrap(); let edge_flags = EdgeAaSegmentMask::from_bits((bits & 0xff) as u8).unwrap(); - let gpu_buffer_address = GpuBufferAddress::from_u32(data.0); + let gpu_buffer_address = GpuBufferAddress { + u: (data.0 >> 16) as u16, + v: (data.0 & 0xffff) as u16, + }; let cmd = PrimitiveCommand::quad( pattern, pattern_input, @@ -321,7 +327,10 @@ impl CommandBuffer { Command::CMD_DRAW_INSTANCE => { let prim_instance_index = PrimitiveInstanceIndex(param); let data = cmd_iter.next().unwrap(); - let gpu_buffer_address = GpuBufferAddress::from_u32(data.0); + let gpu_buffer_address = GpuBufferAddress { + u: (data.0 >> 16) as u16, + v: (data.0 & 0xffff) as u16, + }; let cmd = PrimitiveCommand::instance( prim_instance_index, gpu_buffer_address, diff --git a/gfx/wr/webrender/src/composite.rs b/gfx/wr/webrender/src/composite.rs @@ -6,8 +6,8 @@ use api::{BorderRadius, ColorF, ExternalImageId, ImageBufferKind, ImageKey, Imag use api::units::*; use api::ColorDepth; use crate::image_source::resolve_image; -use crate::renderer::GpuBufferBuilderF; use euclid::Box2D; +use crate::gpu_cache::GpuCache; use crate::gpu_types::{ZBufferId, ZBufferIdGenerator}; use crate::internal_types::{FrameAllocator, FrameMemory, FrameVec, TextureSource}; use crate::picture::{ImageDependency, ResolvedSurfaceTexture, TileCacheInstance, TileId, TileSurface}; @@ -854,7 +854,7 @@ impl CompositeState { is_opaque: bool, device_clip_rect: DeviceRect, resource_cache: &ResourceCache, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, deferred_resolves: &mut FrameVec<DeferredResolve>, clip_index: Option<CompositorClipIndex>, ) { @@ -905,7 +905,7 @@ impl CompositeState { &image_dependencies, required_plane_count, resource_cache, - gpu_buffer, + gpu_cache, deferred_resolves, ); if external_surface_index == ResolvedExternalSurfaceIndex::INVALID { @@ -967,7 +967,7 @@ impl CompositeState { tile_cache: &TileCacheInstance, device_clip_rect: DeviceRect, resource_cache: &ResourceCache, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, deferred_resolves: &mut FrameVec<DeferredResolve>, ) { let slice_transform = self.get_compositor_transform(tile_cache.transform_index); @@ -983,7 +983,7 @@ impl CompositeState { tile_cache.compositor_clip, backdrop_surface.device_rect, ); - + // Use the backdrop native surface we created and add that to the composite state. self.descriptor.surfaces.push( CompositeSurfaceDescriptor { @@ -1006,7 +1006,7 @@ impl CompositeState { true, device_clip_rect, resource_cache, - gpu_buffer, + gpu_cache, deferred_resolves, tile_cache.compositor_clip, ); @@ -1092,7 +1092,7 @@ impl CompositeState { compositor_surface.is_opaque, device_clip_rect, resource_cache, - gpu_buffer, + gpu_cache, deferred_resolves, tile_cache.compositor_clip, ); @@ -1136,7 +1136,7 @@ impl CompositeState { image_dependencies: &[ImageDependency; 3], required_plane_count: usize, resource_cache: &ResourceCache, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, deferred_resolves: &mut FrameVec<DeferredResolve>, ) -> ResolvedExternalSurfaceIndex { let mut planes = [ @@ -1156,7 +1156,7 @@ impl CompositeState { let cache_item = resolve_image( request, resource_cache, - gpu_buffer, + gpu_cache, deferred_resolves, true, ); @@ -1801,7 +1801,7 @@ impl Occluders { occluders: memory.new_vec(), scratch: OccludersScratchBuffers { events: memory.new_vec(), - active: memory.new_vec(), + active: memory.new_vec(), } } } diff --git a/gfx/wr/webrender/src/filterdata.rs b/gfx/wr/webrender/src/filterdata.rs @@ -3,8 +3,9 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ use std::hash; +use crate::gpu_cache::{GpuCache, GpuCacheHandle}; +use crate::gpu_cache::GpuDataRequest; use crate::intern; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF, GpuBufferWriterF}; use api::ComponentTransferFuncType; @@ -108,14 +109,14 @@ impl intern::InternDebug for SFilterDataKey {} #[derive(MallocSizeOf)] pub struct SFilterDataTemplate { pub data: SFilterData, - pub gpu_buffer_address: GpuBufferAddress, + pub gpu_cache_handle: GpuCacheHandle, } impl From<SFilterDataKey> for SFilterDataTemplate { fn from(item: SFilterDataKey) -> Self { SFilterDataTemplate { data: item.data, - gpu_buffer_address: GpuBufferAddress::INVALID, + gpu_cache_handle: GpuCacheHandle::new(), } } } @@ -128,14 +129,12 @@ impl SFilterData { && self.a_func == SFilterDataComponent::Identity } - pub fn write_gpu_blocks(&self, gpu_buffer: &mut GpuBufferBuilderF) -> GpuBufferAddress { + pub fn update(&self, mut request: GpuDataRequest) { + push_component_transfer_data(&self.r_func, &mut request); + push_component_transfer_data(&self.g_func, &mut request); + push_component_transfer_data(&self.b_func, &mut request); + push_component_transfer_data(&self.a_func, &mut request); assert!(!self.is_identity()); - let mut writer = gpu_buffer.write_blocks(1024); - push_component_transfer_data(&self.r_func, &mut writer); - push_component_transfer_data(&self.g_func, &mut writer); - push_component_transfer_data(&self.b_func, &mut writer); - push_component_transfer_data(&self.a_func, &mut writer); - writer.finish() } } @@ -144,11 +143,13 @@ impl SFilterDataTemplate { /// times per frame, by each primitive reference that refers to this interned /// template. The initial request call to the GPU cache ensures that work is only /// done if the cache entry is invalid (due to first use or eviction). - pub fn write_gpu_blocks( + pub fn update( &mut self, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, ) { - self.gpu_buffer_address = self.data.write_gpu_blocks(gpu_buffer); + if let Some(request) = gpu_cache.request(&mut self.gpu_cache_handle) { + self.data.update(request); + } } } @@ -165,7 +166,7 @@ impl intern::Internable for FilterDataIntern { fn push_component_transfer_data( func_comp: &SFilterDataComponent, - writer: &mut GpuBufferWriterF, + request: &mut GpuDataRequest, ) { match func_comp { SFilterDataComponent::Identity => {} @@ -204,14 +205,14 @@ fn push_component_transfer_data( } } - writer.push_one(arr); + request.push(arr); } } SFilterDataComponent::Linear(a, b) => { - writer.push_one([*a, *b, 0.0, 0.0]); + request.push([*a, *b, 0.0, 0.0]); } SFilterDataComponent::Gamma(a, b, c) => { - writer.push_one([*a, *b, *c, 0.0]); + request.push([*a, *b, *c, 0.0]); } } } diff --git a/gfx/wr/webrender/src/frame_builder.rs b/gfx/wr/webrender/src/frame_builder.rs @@ -13,9 +13,10 @@ use crate::spatial_node::SpatialNodeType; use crate::spatial_tree::{SpatialTree, SpatialNodeIndex}; use crate::composite::{CompositorKind, CompositeState, CompositeStatePreallocator}; use crate::debug_item::DebugItem; +use crate::gpu_cache::{GpuCache, GpuCacheHandle}; use crate::gpu_types::{PrimitiveHeaders, TransformPalette, ZBufferIdGenerator}; use crate::gpu_types::{QuadSegment, TransformData}; -use crate::internal_types::{FastHashMap, PlaneSplitter, FrameStamp}; +use crate::internal_types::{FastHashMap, PlaneSplitter, FrameId, FrameStamp}; use crate::picture::{DirtyRegion, SliceId, TileCacheInstance}; use crate::picture::{SurfaceInfo, SurfaceIndex, ResolvedSurfaceTexture}; use crate::picture::{SubpixelMode, RasterConfig, PictureCompositeMode}; @@ -24,7 +25,7 @@ use crate::prim_store::{PictureIndex, PrimitiveScratchBuffer}; use crate::prim_store::{DeferredResolve, PrimitiveInstance}; use crate::profiler::{self, TransactionProfile}; use crate::render_backend::{DataStores, ScratchBuffer}; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferBuilderF, GpuBufferBuilderI, GpuBufferF, GpuBufferI}; +use crate::renderer::{GpuBufferF, GpuBufferBuilderF, GpuBufferI, GpuBufferBuilderI, GpuBufferBuilder}; use crate::render_target::{PictureCacheTarget, PictureCacheTargetKind}; use crate::render_target::{RenderTargetContext, RenderTargetKind, RenderTarget}; use crate::render_task_graph::{Pass, RenderTaskGraph, RenderTaskId, SubPassSurface}; @@ -80,40 +81,40 @@ pub struct FrameBuilderConfig { pub struct FrameGlobalResources { /// The image shader block for the most common / default /// set of image parameters (color white, stretch == rect.size). - pub default_image_data: GpuBufferAddress, + pub default_image_handle: GpuCacheHandle, /// A GPU cache config for drawing cut-out rectangle primitives. /// This is used to 'cut out' overlay tiles where a compositor /// surface exists. - pub default_black_rect_address: GpuBufferAddress, + pub default_black_rect_handle: GpuCacheHandle, } impl FrameGlobalResources { pub fn empty() -> Self { FrameGlobalResources { - default_image_data: GpuBufferAddress::INVALID, - default_black_rect_address: GpuBufferAddress::INVALID, + default_image_handle: GpuCacheHandle::new(), + default_black_rect_handle: GpuCacheHandle::new(), } } pub fn update( &mut self, - gpu_buffers: &mut GpuBufferBuilder, + gpu_cache: &mut GpuCache, ) { - let mut writer = gpu_buffers.f32.write_blocks(3); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ - -1.0, // -ve means use prim rect for stretch size - 0.0, - 0.0, - 0.0, - ]); - self.default_image_data = writer.finish(); - - let mut writer = gpu_buffers.f32.write_blocks(1); - writer.push_one(PremultipliedColorF::BLACK); - self.default_black_rect_address = writer.finish(); + if let Some(mut request) = gpu_cache.request(&mut self.default_image_handle) { + request.push(PremultipliedColorF::WHITE); + request.push(PremultipliedColorF::WHITE); + request.push([ + -1.0, // -ve means use prim rect for stretch size + 0.0, + 0.0, + 0.0, + ]); + } + + if let Some(mut request) = gpu_cache.request(&mut self.default_black_rect_handle) { + request.push(PremultipliedColorF::BLACK); + } } } @@ -165,6 +166,7 @@ pub struct FrameBuildingState<'a> { pub rg_builder: &'a mut RenderTaskGraphBuilder, pub clip_store: &'a mut ClipStore, pub resource_cache: &'a mut ResourceCache, + pub gpu_cache: &'a mut GpuCache, pub transforms: &'a mut TransformPalette, pub segment_builder: SegmentBuilder, pub surfaces: &'a mut Vec<SurfaceInfo>, @@ -281,6 +283,7 @@ impl FrameBuilder { present: bool, global_screen_world_rect: WorldRect, resource_cache: &mut ResourceCache, + gpu_cache: &mut GpuCache, rg_builder: &mut RenderTaskGraphBuilder, global_device_pixel_scale: DevicePixelScale, scene_properties: &SceneProperties, @@ -392,7 +395,7 @@ impl FrameBuilder { let mut visibility_state = FrameVisibilityState { clip_store: &mut scene.clip_store, resource_cache, - frame_gpu_data, + gpu_cache, data_stores, clip_tree: &mut scene.clip_tree, composite_state, @@ -452,7 +455,7 @@ impl FrameBuilder { let mut visibility_state = FrameVisibilityState { clip_store: &mut scene.clip_store, resource_cache, - frame_gpu_data, + gpu_cache, data_stores, clip_tree: &mut scene.clip_tree, composite_state, @@ -524,6 +527,7 @@ impl FrameBuilder { rg_builder, clip_store: &mut scene.clip_store, resource_cache, + gpu_cache, transforms: transform_palette, segment_builder: SegmentBuilder::new(), surfaces: &mut scene.surfaces, @@ -627,7 +631,7 @@ impl FrameBuilder { profile_marker!("BlockOnResources"); resource_cache.block_until_all_resources_added( - frame_gpu_data, + gpu_cache, profile, ); } @@ -638,6 +642,7 @@ impl FrameBuilder { scene: &mut BuiltScene, present: bool, resource_cache: &mut ResourceCache, + gpu_cache: &mut GpuCache, rg_builder: &mut RenderTaskGraphBuilder, stamp: FrameStamp, device_origin: DeviceIntPoint, @@ -656,22 +661,18 @@ impl FrameBuilder { profile_marker!("BuildFrame"); let mut frame_memory = FrameMemory::new(chunk_pool, stamp.frame_id()); - // TODO(gw): Recycle backing vec buffers for gpu buffer builder between frames - let mut gpu_buffer_builder = GpuBufferBuilder { - f32: GpuBufferBuilderF::new(&frame_memory), - i32: GpuBufferBuilderI::new(&frame_memory), - }; profile.set(profiler::PRIMITIVES, scene.prim_instances.len()); profile.set(profiler::PICTURE_CACHE_SLICES, scene.tile_cache_config.picture_cache_slice_count); scratch.begin_frame(); - resource_cache.begin_frame(stamp, profile); + gpu_cache.begin_frame(stamp); + resource_cache.begin_frame(stamp, gpu_cache, profile); // TODO(gw): Follow up patches won't clear this, as they'll be assigned // statically during scene building. scene.surfaces.clear(); - self.globals.update(&mut gpu_buffer_builder); + self.globals.update(gpu_cache); spatial_tree.update_tree(scene_properties); let mut transform_palette = spatial_tree.build_transform_palette(&frame_memory); @@ -697,11 +698,18 @@ impl FrameBuilder { let mut cmd_buffers = CommandBufferList::new(); + // TODO(gw): Recycle backing vec buffers for gpu buffer builder between frames + let mut gpu_buffer_builder = GpuBufferBuilder { + f32: GpuBufferBuilderF::new(&frame_memory), + i32: GpuBufferBuilderI::new(&frame_memory), + }; + self.build_layer_screen_rects_and_cull_layers( scene, present, screen_world_rect, resource_cache, + gpu_cache, rg_builder, global_device_pixel_scale, scene_properties, @@ -727,7 +735,7 @@ impl FrameBuilder { // Finish creating the frame graph and build it. let render_tasks = rg_builder.end_frame( resource_cache, - &mut gpu_buffer_builder, + gpu_cache, &mut deferred_resolves, scene.config.max_shared_surface_size, &frame_memory, @@ -771,6 +779,7 @@ impl FrameBuilder { pass, output_size, &mut ctx, + gpu_cache, &mut gpu_buffer_builder, &render_tasks, &scene.clip_store, @@ -812,7 +821,7 @@ impl FrameBuilder { self.build_composite_pass( scene, &mut ctx, - &mut gpu_buffer_builder, + gpu_cache, &mut deferred_resolves, &mut composite_state, ); @@ -821,6 +830,8 @@ impl FrameBuilder { profile.end_time(profiler::FRAME_BATCHING_TIME); + let gpu_cache_frame_id = gpu_cache.end_frame(profile).frame_id(); + resource_cache.end_frame(profile); self.prim_headers_prealloc.record_vec(&prim_headers.headers_int); @@ -843,6 +854,7 @@ impl FrameBuilder { transform_palette: transform_palette.finish(), render_tasks, deferred_resolves, + gpu_cache_frame_id, has_been_rendered: false, has_texture_cache_tasks, prim_headers, @@ -994,7 +1006,7 @@ impl FrameBuilder { &self, scene: &BuiltScene, ctx: &RenderTargetContext, - gpu_buffers: &mut GpuBufferBuilder, + gpu_cache: &mut GpuCache, deferred_resolves: &mut FrameVec<DeferredResolve>, composite_state: &mut CompositeState, ) { @@ -1022,7 +1034,7 @@ impl FrameBuilder { tile_cache, device_clip_rect, ctx.resource_cache, - &mut gpu_buffers.f32, + gpu_cache, deferred_resolves, ); } @@ -1043,6 +1055,7 @@ pub fn build_render_pass( src_pass: &Pass, screen_size: DeviceIntSize, ctx: &mut RenderTargetContext, + gpu_cache: &mut GpuCache, gpu_buffer_builder: &mut GpuBufferBuilder, render_tasks: &RenderTaskGraph, clip_store: &ClipStore, @@ -1080,6 +1093,7 @@ pub fn build_render_pass( target.add_task( *task_id, ctx, + gpu_cache, gpu_buffer_builder, render_tasks, clip_store, @@ -1104,6 +1118,7 @@ pub fn build_render_pass( target.add_task( *task_id, ctx, + gpu_cache, gpu_buffer_builder, render_tasks, clip_store, @@ -1152,6 +1167,7 @@ pub fn build_render_pass( cmd, spatial_node_index, ctx, + gpu_cache, render_tasks, prim_headers, transforms, @@ -1240,6 +1256,7 @@ pub fn build_render_pass( texture.add_task( *task_id, ctx, + gpu_cache, gpu_buffer_builder, render_tasks, clip_store, @@ -1255,6 +1272,7 @@ pub fn build_render_pass( pass.color.build( ctx, + gpu_cache, render_tasks, prim_headers, transforms, @@ -1265,6 +1283,7 @@ pub fn build_render_pass( ); pass.alpha.build( ctx, + gpu_cache, render_tasks, prim_headers, transforms, @@ -1277,6 +1296,7 @@ pub fn build_render_pass( for target in &mut pass.texture_cache.values_mut() { target.build( ctx, + gpu_cache, render_tasks, prim_headers, transforms, @@ -1309,6 +1329,9 @@ pub struct Frame { pub render_tasks: RenderTaskGraph, pub prim_headers: PrimitiveHeaders, + /// The GPU cache frame that the contents of Self depend on + pub gpu_cache_frame_id: FrameId, + /// List of textures that we don't know about yet /// from the backend thread. The render thread /// will use a callback to resolve these and diff --git a/gfx/wr/webrender/src/gpu_cache.rs b/gfx/wr/webrender/src/gpu_cache.rs @@ -0,0 +1,945 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//! Overview of the GPU cache. +//! +//! The main goal of the GPU cache is to allow on-demand +//! allocation and construction of GPU resources for the +//! vertex shaders to consume. +//! +//! Every item that wants to be stored in the GPU cache +//! should create a GpuCacheHandle that is used to refer +//! to a cached GPU resource. Creating a handle is a +//! cheap operation, that does *not* allocate room in the +//! cache. +//! +//! On any frame when that data is required, the caller +//! must request that handle, via ```request```. If the +//! data is not in the cache, the user provided closure +//! will be invoked to build the data. +//! +//! After ```end_frame``` has occurred, callers can +//! use the ```get_address``` API to get the allocated +//! address in the GPU cache of a given resource slot +//! for this frame. + +use api::{DebugFlags, DocumentId, PremultipliedColorF}; +#[cfg(test)] +use api::IdNamespace; +use api::units::*; +use euclid::{HomogeneousVector, Box2D}; +use crate::internal_types::{FastHashMap, FastHashSet, FrameStamp, FrameId}; +use crate::profiler::{self, TransactionProfile}; +use crate::prim_store::VECS_PER_SEGMENT; +use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH; +use crate::util::VecHelper; +use std::{u16, u32}; +use std::num::NonZeroU32; +use std::ops::Add; +use std::time::{Duration, Instant}; + + +/// At the time of this writing, Firefox uses about 15 GPU cache rows on +/// startup, and then gradually works its way up to the mid-30s with normal +/// browsing. +pub const GPU_CACHE_INITIAL_HEIGHT: i32 = 20; +const NEW_ROWS_PER_RESIZE: i32 = 10; + +/// The number of frames an entry can go unused before being evicted. +const FRAMES_BEFORE_EVICTION: u64 = 10; + +/// The ratio of utilized blocks to total blocks for which we start the clock +/// on reclaiming memory. +const RECLAIM_THRESHOLD: f32 = 0.2; + +/// The amount of time utilization must be below the above threshold before we +/// blow away the cache and rebuild it. +const RECLAIM_DELAY_S: u64 = 5; + +#[derive(Debug, Copy, Clone, Eq, MallocSizeOf, PartialEq)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +struct Epoch(u32); + +impl Epoch { + fn next(&mut self) { + *self = Epoch(self.0.wrapping_add(1)); + } +} + +#[derive(Debug, Copy, Clone, MallocSizeOf)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +struct CacheLocation { + block_index: BlockIndex, + epoch: Epoch, +} + +/// A single texel in RGBAF32 texture - 16 bytes. +#[derive(Copy, Clone, Debug, MallocSizeOf)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +pub struct GpuBlockData { + data: [f32; 4], +} + +impl GpuBlockData { + pub const EMPTY: Self = GpuBlockData { data: [0.0; 4] }; +} + +/// Conversion helpers for GpuBlockData +impl From<PremultipliedColorF> for GpuBlockData { + fn from(c: PremultipliedColorF) -> Self { + GpuBlockData { + data: [c.r, c.g, c.b, c.a], + } + } +} + +impl From<[f32; 4]> for GpuBlockData { + fn from(data: [f32; 4]) -> Self { + GpuBlockData { data } + } +} + +impl<P> From<Box2D<f32, P>> for GpuBlockData { + fn from(r: Box2D<f32, P>) -> Self { + GpuBlockData { + data: [ + r.min.x, + r.min.y, + r.max.x, + r.max.y, + ], + } + } +} + +impl<P> From<HomogeneousVector<f32, P>> for GpuBlockData { + fn from(v: HomogeneousVector<f32, P>) -> Self { + GpuBlockData { + data: [ + v.x, + v.y, + v.z, + v.w, + ], + } + } +} + +impl From<TexelRect> for GpuBlockData { + fn from(tr: TexelRect) -> Self { + GpuBlockData { + data: [tr.uv0.x, tr.uv0.y, tr.uv1.x, tr.uv1.y], + } + } +} + + +// A handle to a GPU resource. +#[derive(Debug, Copy, Clone, MallocSizeOf)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +pub struct GpuCacheHandle { + location: Option<CacheLocation>, +} + +impl GpuCacheHandle { + pub fn new() -> Self { + GpuCacheHandle { location: None } + } + + pub fn as_int(self, gpu_cache: &GpuCache) -> i32 { + gpu_cache.get_address(&self).as_int() + } +} + +// A unique address in the GPU cache. These are uploaded +// as part of the primitive instances, to allow the vertex +// shader to fetch the specific data. +#[repr(C)] +#[derive(Copy, Debug, Clone, MallocSizeOf, Eq, PartialEq)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +pub struct GpuCacheAddress { + pub u: u16, + pub v: u16, +} + +impl GpuCacheAddress { + fn new(u: usize, v: usize) -> Self { + GpuCacheAddress { + u: u as u16, + v: v as u16, + } + } + + pub const INVALID: GpuCacheAddress = GpuCacheAddress { + u: u16::MAX, + v: u16::MAX, + }; + + pub fn as_int(self) -> i32 { + // TODO(gw): Temporarily encode GPU Cache addresses as a single int. + // In the future, we can change the PrimitiveInstanceData struct + // to use 2x u16 for the vertex attribute instead of an i32. + self.v as i32 * MAX_VERTEX_TEXTURE_WIDTH as i32 + self.u as i32 + } +} + +impl Add<usize> for GpuCacheAddress { + type Output = GpuCacheAddress; + + fn add(self, other: usize) -> GpuCacheAddress { + GpuCacheAddress { + u: self.u + other as u16, + v: self.v, + } + } +} + +// An entry in a free-list of blocks in the GPU cache. +#[derive(Debug, MallocSizeOf)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +struct Block { + // The location in the cache of this block. + address: GpuCacheAddress, + // The current epoch (generation) of this block. + epoch: Epoch, + // Index of the next free block in the list it + // belongs to (either a free-list or the + // occupied list). + next: Option<BlockIndex>, + // The last frame this block was referenced. + last_access_time: FrameId, +} + +impl Block { + fn new( + address: GpuCacheAddress, + next: Option<BlockIndex>, + frame_id: FrameId, + epoch: Epoch, + ) -> Self { + Block { + address, + next, + last_access_time: frame_id, + epoch, + } + } + + fn advance_epoch(&mut self, max_epoch: &mut Epoch) { + self.epoch.next(); + if max_epoch.0 < self.epoch.0 { + max_epoch.0 = self.epoch.0; + } + } + + /// Creates an invalid dummy block ID. + pub const INVALID: Block = Block { + address: GpuCacheAddress { u: 0, v: 0 }, + epoch: Epoch(0), + next: None, + last_access_time: FrameId::INVALID, + }; +} + +/// Represents the index of a Block in the block array. We only create such +/// structs for blocks that represent the start of a chunk. +/// +/// Because we use Option<BlockIndex> in a lot of places, we use a NonZeroU32 +/// here and avoid ever using the index zero. +#[derive(Debug, Copy, Clone, MallocSizeOf)] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +struct BlockIndex(NonZeroU32); + +impl BlockIndex { + fn new(idx: usize) -> Self { + debug_assert!(idx <= u32::MAX as usize); + BlockIndex(NonZeroU32::new(idx as u32).expect("Index zero forbidden")) + } + + fn get(&self) -> usize { + self.0.get() as usize + } +} + +// A row in the cache texture. +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +#[derive(MallocSizeOf)] +struct Row { + // The fixed size of blocks that this row supports. + // Each row becomes a slab allocator for a fixed block size. + // This means no dealing with fragmentation within a cache + // row as items are allocated and freed. + block_count_per_item: usize, +} + +impl Row { + fn new(block_count_per_item: usize) -> Self { + Row { + block_count_per_item, + } + } +} + +// A list of update operations that can be applied on the cache +// this frame. The list of updates is created by the render backend +// during frame construction. It's passed to the render thread +// where GL commands can be applied. +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +#[derive(MallocSizeOf)] +pub enum GpuCacheUpdate { + Copy { + block_index: usize, + block_count: usize, + address: GpuCacheAddress, + }, +} + +/// Command to inform the debug display in the renderer when chunks are allocated +/// or freed. +#[derive(MallocSizeOf)] +pub enum GpuCacheDebugCmd { + /// Describes an allocated chunk. + Alloc(GpuCacheDebugChunk), + /// Describes a freed chunk. + Free(GpuCacheAddress), +} + +#[derive(Clone, MallocSizeOf)] +pub struct GpuCacheDebugChunk { + pub address: GpuCacheAddress, + pub size: usize, +} + +#[must_use] +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +#[derive(MallocSizeOf)] +pub struct GpuCacheUpdateList { + /// The frame current update list was generated from. + pub frame_id: FrameId, + /// Whether the texture should be cleared before updates + /// are applied. + pub clear: bool, + /// The current height of the texture. The render thread + /// should resize the texture if required. + pub height: i32, + /// List of updates to apply. + pub updates: Vec<GpuCacheUpdate>, + /// A flat list of GPU blocks that are pending upload + /// to GPU memory. + pub blocks: Vec<GpuBlockData>, + /// Whole state GPU block metadata for debugging. + #[cfg_attr(feature = "serde", serde(skip))] + pub debug_commands: Vec<GpuCacheDebugCmd>, +} + +// Holds the free lists of fixed size blocks. Mostly +// just serves to work around the borrow checker. +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +#[derive(MallocSizeOf)] +struct FreeBlockLists { + free_list_1: Option<BlockIndex>, + free_list_2: Option<BlockIndex>, + free_list_4: Option<BlockIndex>, + free_list_8: Option<BlockIndex>, + free_list_16: Option<BlockIndex>, + free_list_32: Option<BlockIndex>, + free_list_64: Option<BlockIndex>, + free_list_128: Option<BlockIndex>, + free_list_256: Option<BlockIndex>, + free_list_341: Option<BlockIndex>, + free_list_512: Option<BlockIndex>, + free_list_1024: Option<BlockIndex>, +} + +impl FreeBlockLists { + fn new() -> Self { + FreeBlockLists { + free_list_1: None, + free_list_2: None, + free_list_4: None, + free_list_8: None, + free_list_16: None, + free_list_32: None, + free_list_64: None, + free_list_128: None, + free_list_256: None, + free_list_341: None, + free_list_512: None, + free_list_1024: None, + } + } + + fn get_actual_block_count_and_free_list( + &mut self, + block_count: usize, + ) -> (usize, &mut Option<BlockIndex>) { + // Find the appropriate free list to use based on the block size. + // + // Note that we cheat a bit with the 341 bucket, since it's not quite + // a divisor of 1024, because purecss-francine allocates many 260-block + // chunks, and there's no reason we shouldn't pack these three to a row. + // This means the allocation statistics will under-report by one block + // for each row using 341-block buckets, which is fine. + debug_assert_eq!(MAX_VERTEX_TEXTURE_WIDTH, 1024, "Need to update bucketing"); + match block_count { + 0 => panic!("Can't allocate zero sized blocks!"), + 1 => (1, &mut self.free_list_1), + 2 => (2, &mut self.free_list_2), + 3..=4 => (4, &mut self.free_list_4), + 5..=8 => (8, &mut self.free_list_8), + 9..=16 => (16, &mut self.free_list_16), + 17..=32 => (32, &mut self.free_list_32), + 33..=64 => (64, &mut self.free_list_64), + 65..=128 => (128, &mut self.free_list_128), + 129..=256 => (256, &mut self.free_list_256), + 257..=341 => (341, &mut self.free_list_341), + 342..=512 => (512, &mut self.free_list_512), + 513..=1024 => (1024, &mut self.free_list_1024), + _ => panic!("Can't allocate > MAX_VERTEX_TEXTURE_WIDTH per resource!"), + } + } +} + +// CPU-side representation of the GPU resource cache texture. +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +#[derive(MallocSizeOf)] +struct Texture { + // Current texture height + height: i32, + // All blocks that have been created for this texture + blocks: Vec<Block>, + // Metadata about each allocated row. + rows: Vec<Row>, + // The base Epoch for this texture. + base_epoch: Epoch, + // The maximum epoch reached. We track this along with the above so + // that we can rebuild the Texture and avoid collisions with handles + // allocated for the old texture. + max_epoch: Epoch, + // Free lists of available blocks for each supported + // block size in the texture. These are intrusive + // linked lists. + free_lists: FreeBlockLists, + // Linked list of currently occupied blocks. This + // makes it faster to iterate blocks looking for + // candidates to be evicted from the cache. + occupied_list_heads: FastHashMap<DocumentId, BlockIndex>, + // Pending blocks that have been written this frame + // and will need to be sent to the GPU. + pending_blocks: Vec<GpuBlockData>, + // Pending update commands. + updates: Vec<GpuCacheUpdate>, + // Profile stats + allocated_block_count: usize, + // The stamp at which we first reached our threshold for reclaiming `GpuCache` + // memory, or `None` if the threshold hasn't been reached. + #[cfg_attr(feature = "serde", serde(skip))] + reached_reclaim_threshold: Option<Instant>, + // List of debug commands to be sent to the renderer when the GPU cache + // debug display is enabled. + #[cfg_attr(feature = "serde", serde(skip))] + debug_commands: Vec<GpuCacheDebugCmd>, + // The current debug flags for the system. + debug_flags: DebugFlags, +} + +impl Texture { + fn new(base_epoch: Epoch, debug_flags: DebugFlags) -> Self { + // Pre-fill the block array with one invalid block so that we never use + // 0 for a BlockIndex. This lets us use NonZeroU32 for BlockIndex, which + // saves memory. + let blocks = vec![Block::INVALID]; + + Texture { + height: GPU_CACHE_INITIAL_HEIGHT, + blocks, + rows: Vec::new(), + base_epoch, + max_epoch: base_epoch, + free_lists: FreeBlockLists::new(), + pending_blocks: Vec::new(), + updates: Vec::new(), + occupied_list_heads: FastHashMap::default(), + allocated_block_count: 0, + reached_reclaim_threshold: None, + debug_commands: Vec::new(), + debug_flags, + } + } + + // Push new data into the cache. The ```pending_block_index``` field represents + // where the data was pushed into the texture ```pending_blocks``` array. + // Return the allocated address for this data. + fn push_data( + &mut self, + pending_block_index: Option<usize>, + block_count: usize, + frame_stamp: FrameStamp + ) -> CacheLocation { + debug_assert!(frame_stamp.is_valid()); + // Find the appropriate free list to use based on the block size. + let (alloc_size, free_list) = self.free_lists + .get_actual_block_count_and_free_list(block_count); + + // See if we need a new row (if free-list has nothing available) + if free_list.is_none() { + if self.rows.len() as i32 == self.height { + self.height += NEW_ROWS_PER_RESIZE; + } + + // Create a new row. + let items_per_row = MAX_VERTEX_TEXTURE_WIDTH / alloc_size; + let row_index = self.rows.len(); + self.rows.push(Row::new(alloc_size)); + + // Create a ```Block``` for each possible allocation address + // in this row, and link it in to the free-list for this + // block size. + let mut prev_block_index = None; + for i in 0 .. items_per_row { + let address = GpuCacheAddress::new(i * alloc_size, row_index); + let block_index = BlockIndex::new(self.blocks.len()); + let block = Block::new(address, prev_block_index, frame_stamp.frame_id(), self.base_epoch); + self.blocks.push(block); + prev_block_index = Some(block_index); + } + + *free_list = prev_block_index; + } + + // Given the code above, it's now guaranteed that there is a block + // available in the appropriate free-list. Pull a block from the + // head of the list. + let free_block_index = free_list.take().unwrap(); + let block = &mut self.blocks[free_block_index.get()]; + *free_list = block.next; + + // Add the block to the occupied linked list. + block.next = self.occupied_list_heads.get(&frame_stamp.document_id()).cloned(); + block.last_access_time = frame_stamp.frame_id(); + self.occupied_list_heads.insert(frame_stamp.document_id(), free_block_index); + self.allocated_block_count += alloc_size; + + if let Some(pending_block_index) = pending_block_index { + // Add this update to the pending list of blocks that need + // to be updated on the GPU. + self.updates.push(GpuCacheUpdate::Copy { + block_index: pending_block_index, + block_count, + address: block.address, + }); + } + + // If we're using the debug display, communicate the allocation to the + // renderer thread. Note that we do this regardless of whether or not + // pending_block_index is None (if it is, the renderer thread will fill + // in the data via a deferred resolve, but the block is still considered + // allocated). + if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) { + self.debug_commands.push(GpuCacheDebugCmd::Alloc(GpuCacheDebugChunk { + address: block.address, + size: block_count, + })); + } + + CacheLocation { + block_index: free_block_index, + epoch: block.epoch, + } + } + + // Run through the list of occupied cache blocks and evict + // any old blocks that haven't been referenced for a while. + fn evict_old_blocks(&mut self, frame_stamp: FrameStamp) { + debug_assert!(frame_stamp.is_valid()); + // Prune any old items from the list to make room. + // Traverse the occupied linked list and see + // which items have not been used for a long time. + let mut current_block = self.occupied_list_heads.get(&frame_stamp.document_id()).map(|x| *x); + let mut prev_block: Option<BlockIndex> = None; + + while let Some(index) = current_block { + let (next_block, should_unlink) = { + let block = &mut self.blocks[index.get()]; + + let next_block = block.next; + let mut should_unlink = false; + + // If this resource has not been used in the last + // few frames, free it from the texture and mark + // as empty. + if block.last_access_time + FRAMES_BEFORE_EVICTION < frame_stamp.frame_id() { + should_unlink = true; + + // Get the row metadata from the address. + let row = &mut self.rows[block.address.v as usize]; + + // Use the row metadata to determine which free-list + // this block belongs to. + let (_, free_list) = self.free_lists + .get_actual_block_count_and_free_list(row.block_count_per_item); + + block.advance_epoch(&mut self.max_epoch); + block.next = *free_list; + *free_list = Some(index); + + self.allocated_block_count -= row.block_count_per_item; + + if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) { + let cmd = GpuCacheDebugCmd::Free(block.address); + self.debug_commands.push(cmd); + } + }; + + (next_block, should_unlink) + }; + + // If the block was released, we will need to remove it + // from the occupied linked list. + if should_unlink { + match prev_block { + Some(prev_block) => { + self.blocks[prev_block.get()].next = next_block; + } + None => { + match next_block { + Some(next_block) => { + self.occupied_list_heads.insert(frame_stamp.document_id(), next_block); + } + None => { + self.occupied_list_heads.remove(&frame_stamp.document_id()); + } + } + } + } + } else { + prev_block = current_block; + } + + current_block = next_block; + } + } + + /// Returns the ratio of utilized blocks. + fn utilization(&self) -> f32 { + let total_blocks = self.rows.len() * MAX_VERTEX_TEXTURE_WIDTH; + debug_assert!(total_blocks > 0); + let ratio = self.allocated_block_count as f32 / total_blocks as f32; + debug_assert!(0.0 <= ratio && ratio <= 1.0, "Bad ratio: {}", ratio); + ratio + } +} + + +/// A wrapper object for GPU data requests, +/// works as a container that can only grow. +#[must_use] +pub struct GpuDataRequest<'a> { + //TODO: remove this, see + // https://bugzilla.mozilla.org/show_bug.cgi?id=1690546 + #[allow(dead_code)] + handle: &'a mut GpuCacheHandle, + frame_stamp: FrameStamp, + start_index: usize, + max_block_count: usize, + texture: &'a mut Texture, +} + +impl<'a> GpuDataRequest<'a> { + pub fn push<B>(&mut self, block: B) + where + B: Into<GpuBlockData>, + { + self.texture.pending_blocks.push(block.into()); + } + + // Write the GPU cache data for an individual segment. + pub fn write_segment( + &mut self, + local_rect: LayoutRect, + extra_data: [f32; 4], + ) { + let _ = VECS_PER_SEGMENT; + self.push(local_rect); + self.push(extra_data); + } + + pub fn current_used_block_num(&self) -> usize { + self.texture.pending_blocks.len() - self.start_index + } +} + +impl<'a> Drop for GpuDataRequest<'a> { + fn drop(&mut self) { + // Push the data to the texture pending updates list. + let block_count = self.current_used_block_num(); + debug_assert!(block_count <= self.max_block_count); + + let location = self.texture + .push_data(Some(self.start_index), block_count, self.frame_stamp); + self.handle.location = Some(location); + } +} + + +/// The main LRU cache interface. +#[cfg_attr(feature = "capture", derive(Serialize))] +#[cfg_attr(feature = "replay", derive(Deserialize))] +#[derive(MallocSizeOf)] +pub struct GpuCache { + /// Current FrameId. + now: FrameStamp, + /// CPU-side texture allocator. + texture: Texture, + /// Number of blocks requested this frame that don't + /// need to be re-uploaded. + saved_block_count: usize, + /// The current debug flags for the system. + debug_flags: DebugFlags, + /// Whether there is a pending clear to send with the + /// next update. + pending_clear: bool, + /// Indicates that prepare_for_frames has been called for this group of frames. + /// Used for sanity checks. + prepared_for_frames: bool, + /// This indicates that we performed a cleanup operation which requires all + /// documents to build a frame. + requires_frame_build: bool, + /// The set of documents which have had frames built in this update. Used for + /// sanity checks. + document_frames_to_build: FastHashSet<DocumentId>, +} + +impl GpuCache { + pub fn new() -> Self { + let debug_flags = DebugFlags::empty(); + GpuCache { + now: FrameStamp::INVALID, + texture: Texture::new(Epoch(0), debug_flags), + saved_block_count: 0, + debug_flags, + pending_clear: false, + prepared_for_frames: false, + requires_frame_build: false, + document_frames_to_build: FastHashSet::default(), + } + } + + /// Creates a GpuCache and sets it up with a valid `FrameStamp`, which + /// is useful for avoiding panics when instantiating the `GpuCache` + /// directly from unit test code. + #[cfg(test)] + pub fn new_for_testing() -> Self { + let mut cache = Self::new(); + let mut now = FrameStamp::first(DocumentId::new(IdNamespace(1), 1)); + now.advance(); + cache.prepared_for_frames = true; + cache.begin_frame(now); + cache + } + + /// Drops everything in the GPU cache. Must not be called once gpu cache entries + /// for the next frame have already been requested. + pub fn clear(&mut self) { + assert!(self.texture.updates.is_empty(), "Clearing with pending updates"); + let mut next_base_epoch = self.texture.max_epoch; + next_base_epoch.next(); + self.texture = Texture::new(next_base_epoch, self.debug_flags); + self.saved_block_count = 0; + self.pending_clear = true; + self.requires_frame_build = true; + } + + pub fn requires_frame_build(&self) -> bool { + self.requires_frame_build + } + + pub fn prepare_for_frames(&mut self) { + self.prepared_for_frames = true; + if self.should_reclaim_memory() { + self.clear(); + debug_assert!(self.document_frames_to_build.is_empty()); + for &document_id in self.texture.occupied_list_heads.keys() { + self.document_frames_to_build.insert(document_id); + } + } + } + + pub fn bookkeep_after_frames(&mut self) { + assert!(self.document_frames_to_build.is_empty()); + assert!(self.prepared_for_frames); + self.requires_frame_build = false; + self.prepared_for_frames = false; + } + + /// Begin a new frame. + pub fn begin_frame(&mut self, stamp: FrameStamp) { + debug_assert!(self.texture.pending_blocks.is_empty()); + assert!(self.prepared_for_frames); + profile_scope!("begin_frame"); + self.now = stamp; + self.texture.evict_old_blocks(self.now); + self.saved_block_count = 0; + } + + // Invalidate a (possibly) existing block in the cache. + // This means the next call to request() for this location + // will rebuild the data and upload it to the GPU. + pub fn invalidate(&mut self, handle: &GpuCacheHandle) { + if let Some(ref location) = handle.location { + // don't invalidate blocks that are already re-assigned + if let Some(block) = self.texture.blocks.get_mut(location.block_index.get()) { + if block.epoch == location.epoch { + block.advance_epoch(&mut self.texture.max_epoch); + } + } + } + } + + /// Request a resource be added to the cache. If the resource + /// is already in the cache, `None` will be returned. + pub fn request<'a>(&'a mut self, handle: &'a mut GpuCacheHandle) -> Option<GpuDataRequest<'a>> { + let mut max_block_count = MAX_VERTEX_TEXTURE_WIDTH; + // Check if the allocation for this handle is still valid. + if let Some(ref location) = handle.location { + if let Some(block) = self.texture.blocks.get_mut(location.block_index.get()) { + if block.epoch == location.epoch { + max_block_count = self.texture.rows[block.address.v as usize].block_count_per_item; + if block.last_access_time != self.now.frame_id() { + // Mark last access time to avoid evicting this block. + block.last_access_time = self.now.frame_id(); + self.saved_block_count += max_block_count; + } + return None; + } + } + } + + debug_assert!(self.now.is_valid()); + Some(GpuDataRequest { + handle, + frame_stamp: self.now, + start_index: self.texture.pending_blocks.len(), + texture: &mut self.texture, + max_block_count, + }) + } + + // Push an array of data blocks to be uploaded to the GPU + // unconditionally for this frame. The cache handle will + // assert if the caller tries to retrieve the address + // of this handle on a subsequent frame. This is typically + // used for uploading data that changes every frame, and + // therefore makes no sense to try and cache. + pub fn push_per_frame_blocks(&mut self, blocks: &[GpuBlockData]) -> GpuCacheHandle { + let start_index = self.texture.pending_blocks.len(); + self.texture.pending_blocks.extend_from_slice(blocks); + let location = self.texture + .push_data(Some(start_index), blocks.len(), self.now); + GpuCacheHandle { + location: Some(location), + } + } + + // Reserve space in the cache for per-frame blocks that + // will be resolved by the render thread via the + // external image callback. + pub fn push_deferred_per_frame_blocks(&mut self, block_count: usize) -> GpuCacheHandle { + let location = self.texture.push_data(None, block_count, self.now); + GpuCacheHandle { + location: Some(location), + } + } + + /// End the frame. Return the list of updates to apply to the + /// device specific cache texture. + pub fn end_frame( + &mut self, + profile: &mut TransactionProfile, + ) -> FrameStamp { + profile_scope!("end_frame"); + profile.set(profiler::GPU_CACHE_ROWS_TOTAL, self.texture.rows.len()); + profile.set(profiler::GPU_CACHE_BLOCKS_TOTAL, self.texture.allocated_block_count); + profile.set(profiler::GPU_CACHE_BLOCKS_SAVED, self.saved_block_count); + + let reached_threshold = + self.texture.rows.len() > (GPU_CACHE_INITIAL_HEIGHT as usize) && + self.texture.utilization() < RECLAIM_THRESHOLD; + if reached_threshold { + self.texture.reached_reclaim_threshold.get_or_insert_with(Instant::now); + } else { + self.texture.reached_reclaim_threshold = None; + } + + self.document_frames_to_build.remove(&self.now.document_id()); + self.now + } + + /// Returns true if utilization has been low enough for long enough that we + /// should blow the cache away and rebuild it. + pub fn should_reclaim_memory(&self) -> bool { + self.texture.reached_reclaim_threshold + .map_or(false, |t| t.elapsed() > Duration::from_secs(RECLAIM_DELAY_S)) + } + + /// Extract the pending updates from the cache. + pub fn extract_updates(&mut self) -> GpuCacheUpdateList { + let clear = self.pending_clear; + self.pending_clear = false; + GpuCacheUpdateList { + frame_id: self.now.frame_id(), + clear, + height: self.texture.height, + debug_commands: self.texture.debug_commands.take_and_preallocate(), + updates: self.texture.updates.take_and_preallocate(), + blocks: self.texture.pending_blocks.take_and_preallocate(), + } + } + + /// Sets the current debug flags for the system. + pub fn set_debug_flags(&mut self, flags: DebugFlags) { + self.debug_flags = flags; + self.texture.debug_flags = flags; + } + + /// Get the actual GPU address in the texture for a given slot ID. + /// It's assumed at this point that the given slot has been requested + /// and built for this frame. Attempting to get the address for a + /// freed or pending slot will panic! + pub fn get_address(&self, id: &GpuCacheHandle) -> GpuCacheAddress { + self.try_get_address(id).expect("handle not requested or allocated!") + } + + /// Get the actual GPU address in the texture for a given slot ID. + /// + /// Returns None if the slot has not been requested. + pub fn try_get_address(&self, id: &GpuCacheHandle) -> Option<GpuCacheAddress> { + let Some(location) = id.location else { return None; }; + let block = &self.texture.blocks[location.block_index.get()]; + debug_assert_eq!(block.epoch, location.epoch); + debug_assert_eq!(block.last_access_time, self.now.frame_id()); + Some(block.address) + } +} + +#[test] +#[cfg(target_pointer_width = "64")] +fn test_struct_sizes() { + use std::mem; + // We can end up with a lot of blocks stored in the global vec, and keeping + // them small helps reduce memory overhead. + assert_eq!(mem::size_of::<Block>(), 24, "Block size changed"); +} diff --git a/gfx/wr/webrender/src/gpu_types.rs b/gfx/wr/webrender/src/gpu_types.rs @@ -4,15 +4,15 @@ use api::{AlphaType, PremultipliedColorF, YuvFormat, YuvRangedColorSpace}; use api::units::*; -use euclid::HomogeneousVector; use crate::composite::{CompositeFeatures, CompositorClip}; use crate::segment::EdgeAaSegmentMask; use crate::spatial_tree::{SpatialTree, SpatialNodeIndex}; +use crate::gpu_cache::{GpuCacheAddress, GpuDataRequest}; use crate::internal_types::{FastHashMap, FrameVec, FrameMemory}; use crate::prim_store::ClipData; use crate::render_task::RenderTaskAddress; use crate::render_task_graph::RenderTaskId; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF, GpuBufferWriterF, ShaderColorMode}; +use crate::renderer::{ShaderColorMode, GpuBufferAddress}; use std::i32; use crate::util::{MatrixHelpers, TransformedRectKind}; use glyph_rasterizer::SubpixelDirection; @@ -172,7 +172,7 @@ pub struct SvgFilterInstance { pub input_count: u16, pub generic_int: u16, pub padding: u16, - pub extra_data_address: i32, + pub extra_data_address: GpuCacheAddress, } #[derive(Clone, Debug)] @@ -187,7 +187,7 @@ pub struct SVGFEFilterInstance { pub input_2_task_address: RenderTaskAddress, pub kind: u16, pub input_count: u16, - pub extra_data_address: i32, + pub extra_data_address: GpuCacheAddress, } #[derive(Copy, Clone, Debug, Hash, MallocSizeOf, PartialEq, Eq)] @@ -261,7 +261,7 @@ pub struct BoxShadowData { #[repr(C)] pub struct ClipMaskInstanceBoxShadow { pub common: ClipMaskInstanceCommon, - pub resource_address: i32, + pub resource_address: GpuCacheAddress, pub shadow_data: BoxShadowData, } @@ -505,7 +505,7 @@ impl PrimitiveHeaders { self.headers_int.push(PrimitiveHeaderI { z: prim_header.z, render_task_address: prim_header.render_task_address, - specific_prim_address: prim_header.specific_prim_address, + specific_prim_address: prim_header.specific_prim_address.as_int(), transform_id: prim_header.transform_id, user_data: prim_header.user_data, }); @@ -520,7 +520,7 @@ impl PrimitiveHeaders { pub struct PrimitiveHeader { pub local_rect: LayoutRect, pub local_clip_rect: LayoutRect, - pub specific_prim_address: i32, + pub specific_prim_address: GpuCacheAddress, pub transform_id: TransformPaletteId, pub z: ZBufferId, pub render_task_address: RenderTaskAddress, @@ -571,7 +571,7 @@ impl GlyphInstance { clip_task: RenderTaskAddress, subpx_dir: SubpixelDirection, glyph_index_in_text_run: i32, - glyph_uv_rect: GpuBufferAddress, + glyph_uv_rect: GpuCacheAddress, color_mode: ShaderColorMode, ) -> PrimitiveInstanceData { PrimitiveInstanceData { @@ -612,8 +612,8 @@ impl From<SplitCompositeInstance> for PrimitiveInstanceData { #[cfg_attr(feature = "replay", derive(Deserialize))] pub struct QuadInstance { pub dst_task_address: RenderTaskAddress, - pub prim_address_i: i32, - pub prim_address_f: i32, + pub prim_address_i: GpuBufferAddress, + pub prim_address_f: GpuBufferAddress, pub quad_flags: u8, pub edge_flags: u8, pub part_index: u8, @@ -631,8 +631,8 @@ impl From<QuadInstance> for PrimitiveInstanceData { PrimitiveInstanceData { data: [ - instance.prim_address_i, - instance.prim_address_f, + instance.prim_address_i.as_int(), + instance.prim_address_f.as_int(), ((instance.quad_flags as i32) << 24) | ((instance.edge_flags as i32) << 16) | @@ -1006,34 +1006,25 @@ pub struct ImageSource { } impl ImageSource { - pub fn write_gpu_blocks(&self, gpu_buffer: &mut GpuBufferBuilderF) -> GpuBufferAddress { - let mut writer = gpu_buffer.write_blocks(6); - self.push_gpu_blocks(&mut writer); - writer.finish() - } - - pub fn push_gpu_blocks(&self, writer: &mut GpuBufferWriterF) { + pub fn write_gpu_blocks(&self, request: &mut GpuDataRequest) { // see fetch_image_resource in GLSL // has to be VECS_PER_IMAGE_RESOURCE vectors - writer.push_one([ + request.push([ self.p0.x, self.p0.y, self.p1.x, self.p1.y, ]); - writer.push_one(self.user_data); + request.push(self.user_data); // If this is a polygon uv kind, then upload the four vertices. if let UvRectKind::Quad { top_left, top_right, bottom_left, bottom_right } = self.uv_rect_kind { // see fetch_image_resource_extra in GLSL //Note: we really need only 3 components per point here: X, Y, and W - fn to_array(v: HomogeneousVector<f32, DevicePixel>) -> [f32; 4] { - [v.x, v.y, v.z, v.w] - } - writer.push_one(to_array(top_left)); - writer.push_one(to_array(top_right)); - writer.push_one(to_array(bottom_left)); - writer.push_one(to_array(bottom_right)); + request.push(top_left); + request.push(top_right); + request.push(bottom_left); + request.push(bottom_right); } } } diff --git a/gfx/wr/webrender/src/image_source.rs b/gfx/wr/webrender/src/image_source.rs @@ -11,10 +11,10 @@ use crate::api::ExternalImageType; use crate::api::units::*; +use crate::gpu_cache::GpuCache; use crate::prim_store::DeferredResolve; use crate::renderer::BLOCKS_PER_UV_RECT; use crate::render_task_cache::RenderTaskCacheEntryHandle; -use crate::renderer::GpuBufferBuilderF; use crate::resource_cache::{ResourceCache, ImageRequest, CacheItem}; use crate::internal_types::{TextureSource, TextureSourceExternal, DeferredResolveIndex, FrameVec}; @@ -22,7 +22,7 @@ use crate::internal_types::{TextureSource, TextureSourceExternal, DeferredResolv pub fn resolve_image( request: ImageRequest, resource_cache: &ResourceCache, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, deferred_resolves: &mut FrameVec<DeferredResolve>, is_composited: bool, ) -> CacheItem { @@ -35,7 +35,7 @@ pub fn resolve_image( // This is an external texture - we will add it to // the deferred resolves list to be patched by // the render thread... - let uv_rect_address = gpu_buffer.reserve_renderer_deferred_blocks(BLOCKS_PER_UV_RECT); + let cache_handle = gpu_cache.push_deferred_per_frame_blocks(BLOCKS_PER_UV_RECT); let deferred_resolve_index = DeferredResolveIndex(deferred_resolves.len() as u32); @@ -56,7 +56,7 @@ pub fn resolve_image( kind: image_buffer_kind, normalized_uvs: external_image.normalized_uvs, }), - uv_rect_handle: uv_rect_address, + uv_rect_handle: cache_handle, uv_rect: DeviceIntRect::from_size( image_properties.descriptor.size, ), @@ -65,7 +65,7 @@ pub fn resolve_image( deferred_resolves.push(DeferredResolve { image_properties, - address: uv_rect_address, + address: gpu_cache.get_address(&cache_handle), rendering: request.rendering, is_composited, }); diff --git a/gfx/wr/webrender/src/internal_types.rs b/gfx/wr/webrender/src/internal_types.rs @@ -10,6 +10,7 @@ use crate::render_api::DebugCommand; use crate::composite::NativeSurfaceOperation; use crate::device::TextureFilter; use crate::renderer::{FullFrameStats, PipelineInfo}; +use crate::gpu_cache::GpuCacheUpdateList; use crate::gpu_types::BlurEdgeMode; use crate::frame_builder::Frame; use crate::profiler::TransactionProfile; @@ -1349,6 +1350,7 @@ pub enum ResultMsg { DebugCommand(DebugCommand), DebugOutput(DebugOutput), RefreshShader(PathBuf), + UpdateGpuCache(GpuCacheUpdateList), UpdateResources { resource_updates: ResourceUpdateList, memory_pressure: bool, diff --git a/gfx/wr/webrender/src/lib.rs b/gfx/wr/webrender/src/lib.rs @@ -102,6 +102,7 @@ mod filterdata; mod frame_builder; mod freelist; mod glyph_cache; +mod gpu_cache; mod gpu_types; mod hit_test; mod internal_types; diff --git a/gfx/wr/webrender/src/picture.rs b/gfx/wr/webrender/src/picture.rs @@ -116,6 +116,7 @@ use crate::intern::ItemUid; use crate::internal_types::{FastHashMap, FastHashSet, PlaneSplitter, FilterGraphOp, FilterGraphNode, Filter, FrameId}; use crate::internal_types::{PlaneSplitterIndex, PlaneSplitAnchor, TextureSource}; use crate::frame_builder::{FrameBuildingContext, FrameBuildingState, PictureState, PictureContext}; +use crate::gpu_cache::{GpuCache, GpuCacheAddress, GpuCacheHandle}; use crate::gpu_types::{UvRectKind, ZBufferId, BlurEdgeMode}; use peek_poke::{PeekPoke, poke_into_vec, peek_from_slice, ensure_red_zone}; use plane_split::{Clipper, Polygon}; @@ -127,7 +128,7 @@ use crate::render_task_graph::RenderTaskId; use crate::render_target::RenderTargetKind; use crate::render_task::{BlurTask, RenderTask, RenderTaskLocation, BlurTaskCache}; use crate::render_task::{StaticRenderTaskSurface, RenderTaskKind}; -use crate::renderer::{BlendMode, GpuBufferAddress}; +use crate::renderer::BlendMode; use crate::resource_cache::{ResourceCache, ImageGeneration, ImageRequest}; use crate::space::SpaceMapper; use crate::scene::SceneProperties; @@ -2193,7 +2194,7 @@ impl TileCacheInstance { &map_local_to_picture, &pic_to_vis_mapper, frame_context.spatial_tree, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, frame_state.resource_cache, frame_context.global_device_pixel_scale, &surface.culling_rect, @@ -2725,7 +2726,7 @@ impl TileCacheInstance { api_keys: &[ImageKey; 3], resource_cache: &mut ResourceCache, composite_state: &mut CompositeState, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, image_rendering: ImageRendering, color_depth: ColorDepth, color_space: YuvRangedColorSpace, @@ -2740,7 +2741,7 @@ impl TileCacheInstance { rendering: image_rendering, tile: None, }, - gpu_buffer, + gpu_cache, ); } } @@ -2781,7 +2782,7 @@ impl TileCacheInstance { api_key: ImageKey, resource_cache: &mut ResourceCache, composite_state: &mut CompositeState, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, image_rendering: ImageRendering, is_opaque: bool, surface_kind: CompositorSurfaceKind, @@ -2800,7 +2801,7 @@ impl TileCacheInstance { rendering: image_rendering, tile: None, }, - gpu_buffer, + gpu_cache, ); self.setup_compositor_surfaces_impl( @@ -3146,7 +3147,7 @@ impl TileCacheInstance { color_bindings: &ColorBindingStorage, surface_stack: &[(PictureIndex, SurfaceIndex)], composite_state: &mut CompositeState, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, scratch: &mut PrimitiveScratchBuffer, is_root_tile_cache: bool, surfaces: &mut [SurfaceInfo], @@ -3390,7 +3391,7 @@ impl TileCacheInstance { image_data.key, resource_cache, composite_state, - gpu_buffer, + gpu_cache, image_data.image_rendering, is_opaque, kind, @@ -3508,7 +3509,7 @@ impl TileCacheInstance { &prim_data.kind.yuv_key, resource_cache, composite_state, - gpu_buffer, + gpu_cache, prim_data.kind.image_rendering, prim_data.kind.color_depth, prim_data.kind.color_space.with_range(prim_data.kind.color_range), @@ -4956,7 +4957,7 @@ pub enum Picture3DContext<C> { #[cfg_attr(feature = "capture", derive(Serialize))] pub struct OrderedPictureChild { pub anchor: PlaneSplitAnchor, - pub gpu_address: GpuBufferAddress, + pub gpu_address: GpuCacheAddress, } bitflags! { @@ -5217,7 +5218,7 @@ pub struct PicturePrimitive { // Optional cache handles for storing extra data // in the GPU cache, depending on the type of // picture. - pub extra_gpu_data: SmallVec<[GpuBufferAddress; 1]>, + pub extra_gpu_data_handles: SmallVec<[GpuCacheHandle; 1]>, /// The spatial node index of this picture when it is /// composited into the parent picture. @@ -5331,7 +5332,7 @@ impl PicturePrimitive { composite_mode, raster_config: None, context_3d, - extra_gpu_data: SmallVec::new(), + extra_gpu_data_handles: SmallVec::new(), is_backface_visible: prim_flags.contains(PrimitiveFlags::IS_BACKFACE_VISIBLE), spatial_node_index, prev_local_rect: LayoutRect::zero(), @@ -5509,7 +5510,7 @@ impl PicturePrimitive { if let Some(TileSurface::Texture { descriptor, .. }) = tile.surface.as_ref() { if let SurfaceTextureDescriptor::TextureCache { handle: Some(handle), .. } = descriptor { frame_state.resource_cache - .picture_textures.request(handle, &mut frame_state.frame_gpu_data.f32); + .picture_textures.request(handle, frame_state.gpu_cache); } } @@ -5545,7 +5546,7 @@ impl PicturePrimitive { // TODO(gw): Consider switching to manual eviction policy? frame_state.resource_cache .picture_textures - .request(handle.as_ref().unwrap(), &mut frame_state.frame_gpu_data.f32); + .request(handle.as_ref().unwrap(), frame_state.gpu_cache); } else { // If the texture was evicted on a previous frame, we need to assume // that the entire tile rect is dirty. @@ -5602,7 +5603,7 @@ impl PicturePrimitive { frame_state.resource_cache.picture_textures.update( tile_cache.current_tile_size, handle, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, &mut frame_state.resource_cache.texture_cache.next_id, &mut frame_state.resource_cache.texture_cache.pending_updates, ); @@ -6027,6 +6028,14 @@ impl PicturePrimitive { // use of the conservative picture rect for segmenting (which should // be done during scene building). if local_rect != self.prev_local_rect { + match raster_config.composite_mode { + PictureCompositeMode::Filter(Filter::DropShadows(..)) => { + for handle in &self.extra_gpu_data_handles { + frame_state.gpu_cache.invalidate(handle); + } + } + _ => {} + } // Invalidate any segments built for this picture, since the local // rect has changed. self.segments_are_valid = false; @@ -6130,7 +6139,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, false, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { RenderTask::new_blur( blur_std_deviation, picture_task_id, @@ -6179,7 +6188,7 @@ impl PicturePrimitive { let mut blur_tasks = BlurTaskCache::default(); - self.extra_gpu_data.resize(shadows.len(), GpuBufferAddress::INVALID); + self.extra_gpu_data_handles.resize(shadows.len(), GpuCacheHandle::new()); let mut blur_render_task_id = picture_task_id; for shadow in shadows { @@ -6307,7 +6316,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, is_opaque, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { rg_builder.add().init( RenderTask::new_dynamic( task_size, @@ -6346,7 +6355,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, is_opaque, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { rg_builder.add().init( RenderTask::new_dynamic( surface_rects.task_size, @@ -6385,7 +6394,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, is_opaque, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { rg_builder.add().init( RenderTask::new_dynamic( surface_rects.task_size, @@ -6425,7 +6434,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, is_opaque, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { rg_builder.add().init( RenderTask::new_dynamic( surface_rects.task_size, @@ -6470,7 +6479,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, is_opaque, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { rg_builder.add().init( RenderTask::new_dynamic( surface_rects.task_size, @@ -6529,7 +6538,7 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, is_opaque, - &mut|rg_builder, _| { + &mut|rg_builder, _, _| { RenderTask::new_svg_filter( primitives, filter_datas, @@ -6604,11 +6613,11 @@ impl PicturePrimitive { &self.snapshot, &surface_rects, false, - &mut|rg_builder, gpu_buffer| { + &mut|rg_builder, _, gpu_cache| { RenderTask::new_svg_filter_graph( filters, rg_builder, - gpu_buffer, + gpu_cache, data_stores, surface_rects.uv_rect_kind, picture_task_id, @@ -6769,7 +6778,7 @@ impl PicturePrimitive { PicturePrimitive::resolve_split_planes( splitter, list, - &mut frame_state.frame_gpu_data.f32, + &mut frame_state.gpu_cache, &frame_context.spatial_tree, ); @@ -6878,7 +6887,7 @@ impl PicturePrimitive { fn resolve_split_planes( splitter: &mut PlaneSplitter, ordered: &mut Vec<OrderedPictureChild>, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, spatial_tree: &SpatialTree, ) { ordered.clear(); @@ -6914,11 +6923,12 @@ impl PicturePrimitive { let p1 = local_points[1].unwrap(); let p2 = local_points[2].unwrap(); let p3 = local_points[3].unwrap(); - - let mut writer = gpu_buffer.write_blocks(2); - writer.push_one([p0.x, p0.y, p1.x, p1.y]); - writer.push_one([p2.x, p2.y, p3.x, p3.y]); - let gpu_address = writer.finish(); + let gpu_blocks = [ + [p0.x, p0.y, p1.x, p1.y].into(), + [p2.x, p2.y, p3.x, p3.y].into(), + ]; + let gpu_handle = gpu_cache.push_per_frame_blocks(&gpu_blocks); + let gpu_address = gpu_cache.get_address(&gpu_handle); ordered.push(OrderedPictureChild { anchor: poly.anchor, @@ -7242,7 +7252,7 @@ impl PicturePrimitive { } }; - // TODO(gw): Almost all of the Picture types below use extra_gpu_data + // TODO(gw): Almost all of the Picture types below use extra_gpu_cache_data // to store the same type of data. The exception is the filter // with a ColorMatrix, which stores the color matrix here. It's // probably worth tidying this code up to be a bit more consistent. @@ -7253,68 +7263,67 @@ impl PicturePrimitive { PictureCompositeMode::TileCache { .. } => {} PictureCompositeMode::Filter(Filter::Blur { .. }) => {} PictureCompositeMode::Filter(Filter::DropShadows(ref shadows)) => { - self.extra_gpu_data.resize(shadows.len(), GpuBufferAddress::INVALID); - for (shadow, extra_handle) in shadows.iter().zip(self.extra_gpu_data.iter_mut()) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(5); - let surface = &frame_state.surfaces[raster_config.surface_index.0]; - let prim_rect = surface.clipped_local_rect.cast_unit(); - - // Basic brush primitive header is (see end of prepare_prim_for_render_inner in prim_store.rs) - // [brush specific data] - // [segment_rect, segment data] - let (blur_inflation_x, blur_inflation_y) = surface.clamp_blur_radius( - shadow.blur_radius, - shadow.blur_radius, - ); + self.extra_gpu_data_handles.resize(shadows.len(), GpuCacheHandle::new()); + for (shadow, extra_handle) in shadows.iter().zip(self.extra_gpu_data_handles.iter_mut()) { + if let Some(mut request) = frame_state.gpu_cache.request(extra_handle) { + let surface = &frame_state.surfaces[raster_config.surface_index.0]; + let prim_rect = surface.clipped_local_rect.cast_unit(); + + // Basic brush primitive header is (see end of prepare_prim_for_render_inner in prim_store.rs) + // [brush specific data] + // [segment_rect, segment data] + let (blur_inflation_x, blur_inflation_y) = surface.clamp_blur_radius( + shadow.blur_radius, + shadow.blur_radius, + ); - let shadow_rect = prim_rect.inflate( - blur_inflation_x * BLUR_SAMPLE_SCALE, - blur_inflation_y * BLUR_SAMPLE_SCALE, - ).translate(shadow.offset); - - // ImageBrush colors - writer.push_one(shadow.color.premultiplied()); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ - shadow_rect.width(), - shadow_rect.height(), - 0.0, - 0.0, - ]); - - // segment rect / extra data - writer.push_one(shadow_rect); - writer.push_one([0.0, 0.0, 0.0, 0.0]); - - *extra_handle = writer.finish(); + let shadow_rect = prim_rect.inflate( + blur_inflation_x * BLUR_SAMPLE_SCALE, + blur_inflation_y * BLUR_SAMPLE_SCALE, + ).translate(shadow.offset); + + // ImageBrush colors + request.push(shadow.color.premultiplied()); + request.push(PremultipliedColorF::WHITE); + request.push([ + shadow_rect.width(), + shadow_rect.height(), + 0.0, + 0.0, + ]); + + // segment rect / extra data + request.push(shadow_rect); + request.push([0.0, 0.0, 0.0, 0.0]); + } } } PictureCompositeMode::Filter(ref filter) => { match *filter { Filter::ColorMatrix(ref m) => { - if self.extra_gpu_data.is_empty() { - self.extra_gpu_data.push(GpuBufferAddress::INVALID); + if self.extra_gpu_data_handles.is_empty() { + self.extra_gpu_data_handles.push(GpuCacheHandle::new()); } - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(5); - for i in 0..5 { - writer.push_one([m[i*4], m[i*4+1], m[i*4+2], m[i*4+3]]); + if let Some(mut request) = frame_state.gpu_cache.request(&mut self.extra_gpu_data_handles[0]) { + for i in 0..5 { + request.push([m[i*4], m[i*4+1], m[i*4+2], m[i*4+3]]); + } } - self.extra_gpu_data[0] = writer.finish(); } Filter::Flood(ref color) => { - if self.extra_gpu_data.is_empty() { - self.extra_gpu_data.push(GpuBufferAddress::INVALID); + if self.extra_gpu_data_handles.is_empty() { + self.extra_gpu_data_handles.push(GpuCacheHandle::new()); + } + if let Some(mut request) = frame_state.gpu_cache.request(&mut self.extra_gpu_data_handles[0]) { + request.push(color.to_array()); } - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(1); - writer.push_one(color.to_array()); - self.extra_gpu_data[0] = writer.finish(); } _ => {} } } PictureCompositeMode::ComponentTransferFilter(handle) => { let filter_data = &mut data_stores.filter_data[handle]; - filter_data.write_gpu_blocks(&mut frame_state.frame_gpu_data.f32); + filter_data.update(&mut frame_state.gpu_cache); } PictureCompositeMode::MixBlend(..) | PictureCompositeMode::Blit(_) | @@ -7326,7 +7335,7 @@ impl PicturePrimitive { match op { FilterGraphOp::SVGFEComponentTransferInterned { handle, creates_pixels: _ } => { let filter_data = &mut data_stores.filter_data[*handle]; - filter_data.write_gpu_blocks(&mut frame_state.frame_gpu_data.f32); + filter_data.update(&mut frame_state.gpu_cache); } _ => {} } @@ -8613,7 +8622,7 @@ fn request_render_task( snapshot: &Option<SnapshotInfo>, surface_rects: &SurfaceAllocInfo, is_opaque: bool, - f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId, + f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId, ) -> RenderTaskId { let task_id = match snapshot { @@ -8627,6 +8636,7 @@ fn request_render_task( surface_rects.task_size, frame_state.rg_builder, &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, is_opaque, &adjustment, f @@ -8649,6 +8659,7 @@ fn request_render_task( f( frame_state.rg_builder, &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache ) } }; diff --git a/gfx/wr/webrender/src/picture_textures.rs b/gfx/wr/webrender/src/picture_textures.rs @@ -13,8 +13,8 @@ use crate::internal_types::{ }; use crate::profiler::{self, TransactionProfile}; use crate::gpu_types::{ImageSource, UvRectKind}; +use crate::gpu_cache::{GpuCache, GpuCacheHandle}; use crate::freelist::{FreeList, FreeListHandle, WeakFreeListHandle}; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF}; #[derive(Debug, PartialEq)] @@ -42,23 +42,24 @@ pub struct PictureCacheEntry { // in the glyph cache eviction code. We could probably remove it // entirely in future (or move to EntryDetails::Picture). pub last_access: FrameStamp, - /// Handle to the resource rect in the float GPU buffer. - pub uv_rect_handle: GpuBufferAddress, + /// Handle to the resource rect in the GPU cache. + pub uv_rect_handle: GpuCacheHandle, /// The actual device texture ID this is part of. pub texture_id: CacheTextureId, } impl PictureCacheEntry { - fn write_gpu_blocks(&mut self, gpu_buffer: &mut GpuBufferBuilderF) { - let origin = DeviceIntPoint::zero(); - let image_source = ImageSource { - p0: origin.to_f32(), - p1: (origin + self.size).to_f32(), - uv_rect_kind: UvRectKind::Rect, - user_data: [0.0; 4], - }; - - self.uv_rect_handle = image_source.write_gpu_blocks(gpu_buffer); + fn update_gpu_cache(&mut self, gpu_cache: &mut GpuCache) { + if let Some(mut request) = gpu_cache.request(&mut self.uv_rect_handle) { + let origin = DeviceIntPoint::zero(); + let image_source = ImageSource { + p0: origin.to_f32(), + p1: (origin + self.size).to_f32(), + uv_rect_kind: UvRectKind::Rect, + user_data: [0.0; 4], + }; + image_source.write_gpu_blocks(&mut request); + } } } @@ -129,7 +130,7 @@ impl PictureTextures { &mut self, tile_size: DeviceIntSize, handle: &mut Option<PictureCacheTextureHandle>, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, next_texture_id: &mut CacheTextureId, pending_updates: &mut TextureUpdateList, ) { @@ -159,7 +160,7 @@ impl PictureTextures { self.cache_entries .get_opt_mut(handle) .expect("BUG: handle must be valid now") - .write_gpu_blocks(gpu_buffer); + .update_gpu_cache(gpu_cache); } else { panic!("The handle should be valid picture cache handle now") } @@ -218,7 +219,7 @@ impl PictureTextures { let cache_entry = PictureCacheEntry { size: tile_size, last_access: self.now, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), texture_id, }; @@ -263,14 +264,14 @@ impl PictureTextures { } } - pub fn request(&mut self, handle: &PictureCacheTextureHandle, gpu_buffer: &mut GpuBufferBuilderF) -> bool { + pub fn request(&mut self, handle: &PictureCacheTextureHandle, gpu_cache: &mut GpuCache) -> bool { let entry = self.cache_entries.get_opt_mut(handle); let now = self.now; entry.map_or(true, |entry| { // If an image is requested that is already in the cache, // refresh the GPU cache data associated with this item. entry.last_access = now; - entry.write_gpu_blocks(gpu_buffer); + entry.update_gpu_cache(gpu_cache); false }) } diff --git a/gfx/wr/webrender/src/prepare.rs b/gfx/wr/webrender/src/prepare.rs @@ -6,7 +6,7 @@ //! //! TODO: document this! -use api::{ColorF, DebugFlags}; +use api::{ColorF, DebugFlags, PropertyBinding}; use api::{BoxShadowClipMode, BorderStyle, ClipMode}; use api::units::*; use euclid::Scale; @@ -17,10 +17,10 @@ use crate::image_tiling::{self, Repetition}; use crate::border::{get_max_scale_for_border, build_border_instances}; use crate::clip::{ClipStore, ClipNodeRange}; use crate::pattern::Pattern; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF, GpuBufferWriterF}; use crate::spatial_tree::{SpatialNodeIndex, SpatialTree}; use crate::clip::{ClipDataStore, ClipNodeFlags, ClipChainInstance, ClipItemKind}; use crate::frame_builder::{FrameBuildingContext, FrameBuildingState, PictureContext, PictureState}; +use crate::gpu_cache::{GpuCacheHandle, GpuDataRequest}; use crate::gpu_types::BrushFlags; use crate::internal_types::{FastHashMap, PlaneSplitAnchor, Filter}; use crate::picture::{ClusterFlags, PictureCompositeMode, PicturePrimitive, SliceId}; @@ -438,10 +438,11 @@ fn prepare_interned_prim_for_render( }), false, RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, _| { + &mut |rg_builder, _, _| { rg_builder.add().init(RenderTask::new_dynamic( task_size, RenderTaskKind::new_line_decoration( @@ -513,11 +514,13 @@ fn prepare_interned_prim_for_render( allow_subpixel, frame_context.fb_config.low_quality_pinch_zoom, frame_state.resource_cache, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, frame_context.spatial_tree, scratch, ); + // Update the template this instane references, which may refresh the GPU + // cache with any shared template data. prim_data.update(frame_state); } PrimitiveInstanceKind::Clear { data_handle, .. } => { @@ -589,10 +592,11 @@ fn prepare_interned_prim_for_render( Some(cache_key), false, // TODO(gw): We don't calculate opacity for borders yet! RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, _| { + &mut |rg_builder, _, _| { rg_builder.add().init(RenderTask::new_dynamic( cache_size, RenderTaskKind::new_border_segment( @@ -626,13 +630,35 @@ fn prepare_interned_prim_for_render( frame_state ); } - PrimitiveInstanceKind::Rectangle { data_handle, segment_instance_index, use_legacy_path, .. } => { + PrimitiveInstanceKind::Rectangle { data_handle, segment_instance_index, color_binding_index, use_legacy_path, .. } => { profile_scope!("Rectangle"); if *use_legacy_path { let prim_data = &mut data_stores.prim[*data_handle]; prim_data.common.may_need_repetition = false; + // TODO(gw): Legacy rect rendering path - remove once we support masks on quad prims + if *color_binding_index != ColorBindingIndex::INVALID { + match store.color_bindings[*color_binding_index] { + PropertyBinding::Binding(..) => { + // We explicitly invalidate the gpu cache + // if the color is animating. + let gpu_cache_handle = + if *segment_instance_index == SegmentInstanceIndex::INVALID { + None + } else if *segment_instance_index == SegmentInstanceIndex::UNUSED { + Some(&prim_data.common.gpu_cache_handle) + } else { + Some(&scratch.segment_instances[*segment_instance_index].gpu_cache_handle) + }; + if let Some(gpu_cache_handle) = gpu_cache_handle { + frame_state.gpu_cache.invalidate(gpu_cache_handle); + } + } + PropertyBinding::Value(..) => {}, + } + } + // Update the template this instane references, which may refresh the GPU // cache with any shared template data. prim_data.update( @@ -695,8 +721,8 @@ fn prepare_interned_prim_for_render( frame_state, &mut scratch.segments, &mut scratch.segment_instances, - |writer| { - yuv_image_data.write_prim_gpu_blocks(writer); + |request| { + yuv_image_data.write_prim_gpu_blocks(request); } ); } @@ -778,22 +804,19 @@ fn prepare_interned_prim_for_render( frame_state, &mut scratch.gradient_tiles, &frame_context.spatial_tree, - Some(&mut |_, gpu_buffer| { - let mut writer = gpu_buffer.write_blocks(2); - writer.push_one([ + Some(&mut |_, mut request| { + request.push([ prim_data.start_point.x, prim_data.start_point.y, prim_data.end_point.x, prim_data.end_point.y, ]); - writer.push_one([ + request.push([ pack_as_float(prim_data.extend_mode as u32), prim_data.stretch_size.width, prim_data.stretch_size.height, 0.0, ]); - - writer.finish() }), ); @@ -1210,22 +1233,23 @@ fn write_segment<F>( segments: &mut SegmentStorage, segment_instances: &mut SegmentInstanceStorage, f: F, -) where F: Fn(&mut GpuBufferWriterF) { +) where F: Fn(&mut GpuDataRequest) { debug_assert_ne!(segment_instance_index, SegmentInstanceIndex::INVALID); if segment_instance_index != SegmentInstanceIndex::UNUSED { let segment_instance = &mut segment_instances[segment_instance_index]; - let segments = &segments[segment_instance.segments_range]; - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + segments.len() * VECS_PER_SEGMENT); + if let Some(mut request) = frame_state.gpu_cache.request(&mut segment_instance.gpu_cache_handle) { + let segments = &segments[segment_instance.segments_range]; - f(&mut writer); + f(&mut request); - for segment in segments { - writer.push_one(segment.local_rect); - writer.push_one([0.0; 4]); + for segment in segments { + request.write_segment( + segment.local_rect, + [0.0; 4], + ); + } } - - segment_instance.gpu_data = writer.finish(); } } @@ -1238,7 +1262,7 @@ fn decompose_repeated_gradient( frame_state: &mut FrameBuildingState, gradient_tiles: &mut GradientTileStorage, spatial_tree: &SpatialTree, - mut callback: Option<&mut dyn FnMut(&LayoutRect, &mut GpuBufferBuilderF) -> GpuBufferAddress>, + mut callback: Option<&mut dyn FnMut(&LayoutRect, GpuDataRequest)>, ) -> GradientTileRange { let tile_range = gradient_tiles.open_range(); @@ -1262,21 +1286,22 @@ fn decompose_repeated_gradient( let repetitions = image_tiling::repetitions(prim_local_rect, &visible_rect, stride); gradient_tiles.reserve(repetitions.num_repetitions()); for Repetition { origin, .. } in repetitions { + let mut handle = GpuCacheHandle::new(); let rect = LayoutRect::from_origin_and_size( origin, *stretch_size, ); - let mut address = GpuBufferAddress::INVALID; - if let Some(callback) = &mut callback { - address = callback(&rect, &mut frame_state.frame_gpu_data.f32); + if let Some(request) = frame_state.gpu_cache.request(&mut handle) { + callback(&rect, request); + } } gradient_tiles.push(VisibleGradientTile { local_rect: rect, local_clip_rect: tight_clip_rect, - address, + handle }); } } @@ -1451,7 +1476,7 @@ fn update_clip_task_for_brush( &pic_state.map_local_to_pic, &pic_state.map_pic_to_vis, &frame_context.spatial_tree, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, frame_state.resource_cache, device_pixel_scale, &dirty_rect, @@ -1548,6 +1573,7 @@ pub fn update_clip_task( instance.vis.clip_chain.clips_range, root_spatial_node_index, frame_state.clip_store, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.resource_cache, frame_state.rg_builder, @@ -1613,6 +1639,7 @@ pub fn update_brush_segment_clip_task( clip_chain.clips_range, root_spatial_node_index, frame_state.clip_store, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.resource_cache, frame_state.rg_builder, @@ -1827,7 +1854,7 @@ fn build_segments_if_needed( let instance = SegmentedInstance { segments_range, - gpu_data: GpuBufferAddress::INVALID, + gpu_cache_handle: GpuCacheHandle::new(), }; *segment_instance_index = segment_instances_store.push(instance); diff --git a/gfx/wr/webrender/src/prim_store/borders.rs b/gfx/wr/webrender/src/prim_store/borders.rs @@ -6,13 +6,16 @@ use api::{NormalBorder, PremultipliedColorF, Shadow, RasterSpace}; use api::units::*; use crate::border::create_border_segments; use crate::border::NormalBorderAu; -use crate::renderer::GpuBufferWriterF; use crate::scene_building::{CreateShadow, IsVisible}; use crate::frame_builder::FrameBuildingState; +use crate::gpu_cache::GpuDataRequest; use crate::intern; use crate::internal_types::{LayoutPrimitiveInfo, FrameId}; use crate::prim_store::{ - BorderSegmentInfo, BrushSegment, InternablePrimitive, NinePatchDescriptor, PrimKey, PrimTemplate, PrimTemplateCommonData, PrimitiveInstanceKind, PrimitiveOpacity, PrimitiveStore, VECS_PER_SEGMENT + BorderSegmentInfo, BrushSegment, NinePatchDescriptor, PrimKey, + PrimTemplate, PrimTemplateCommonData, + PrimitiveInstanceKind, PrimitiveOpacity, + PrimitiveStore, InternablePrimitive, }; use crate::resource_cache::ImageRequest; use crate::render_task::RenderTask; @@ -64,24 +67,25 @@ impl NormalBorderData { common: &mut PrimTemplateCommonData, frame_state: &mut FrameBuildingState, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT); - self.write_prim_gpu_blocks(&mut writer, common.prim_rect.size()); - self.write_segment_gpu_blocks(&mut writer); - common.gpu_buffer_address = writer.finish(); + if let Some(ref mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) { + self.write_prim_gpu_blocks(request, common.prim_rect.size()); + self.write_segment_gpu_blocks(request); + } + common.opacity = PrimitiveOpacity::translucent(); } fn write_prim_gpu_blocks( &self, - writer: &mut GpuBufferWriterF, + request: &mut GpuDataRequest, prim_size: LayoutSize ) { // Border primitives currently used for // image borders, and run through the // normal brush_image shader. - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ + request.push(PremultipliedColorF::WHITE); + request.push(PremultipliedColorF::WHITE); + request.push([ prim_size.width, prim_size.height, 0.0, @@ -91,12 +95,14 @@ impl NormalBorderData { fn write_segment_gpu_blocks( &self, - writer: &mut GpuBufferWriterF, + request: &mut GpuDataRequest, ) { for segment in &self.brush_segments { // has to match VECS_PER_SEGMENT - writer.push_one(segment.local_rect); - writer.push_one(segment.extra_data); + request.write_segment( + segment.local_rect, + segment.extra_data, + ); } } } @@ -239,10 +245,10 @@ impl ImageBorderData { common: &mut PrimTemplateCommonData, frame_state: &mut FrameBuildingState, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT); - self.write_prim_gpu_blocks(&mut writer, &common.prim_rect.size()); - self.write_segment_gpu_blocks(&mut writer); - common.gpu_buffer_address = writer.finish(); + if let Some(ref mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) { + self.write_prim_gpu_blocks(request, &common.prim_rect.size()); + self.write_segment_gpu_blocks(request); + } let frame_id = frame_state.rg_builder.frame_id(); if self.frame_id != frame_id { @@ -250,7 +256,7 @@ impl ImageBorderData { let size = frame_state.resource_cache.request_image( self.request, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, ); let task_id = frame_state.rg_builder.add().init( @@ -273,15 +279,15 @@ impl ImageBorderData { fn write_prim_gpu_blocks( &self, - writer: &mut GpuBufferWriterF, + request: &mut GpuDataRequest, prim_size: &LayoutSize, ) { // Border primitives currently used for // image borders, and run through the // normal brush_image shader. - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ + request.push(PremultipliedColorF::WHITE); + request.push(PremultipliedColorF::WHITE); + request.push([ prim_size.width, prim_size.height, 0.0, @@ -291,12 +297,14 @@ impl ImageBorderData { fn write_segment_gpu_blocks( &self, - writer: &mut GpuBufferWriterF, + request: &mut GpuDataRequest, ) { for segment in &self.brush_segments { // has to match VECS_PER_SEGMENT - writer.push_one(segment.local_rect); - writer.push_one(segment.extra_data); + request.write_segment( + segment.local_rect, + segment.extra_data, + ); } } } @@ -369,9 +377,9 @@ fn test_struct_sizes() { // (b) You made a structure larger. This is not necessarily a problem, but should only // be done with care, and after checking if talos performance regresses badly. assert_eq!(mem::size_of::<NormalBorderPrim>(), 84, "NormalBorderPrim size changed"); - assert_eq!(mem::size_of::<NormalBorderTemplate>(), 208, "NormalBorderTemplate size changed"); + assert_eq!(mem::size_of::<NormalBorderTemplate>(), 216, "NormalBorderTemplate size changed"); assert_eq!(mem::size_of::<NormalBorderKey>(), 104, "NormalBorderKey size changed"); assert_eq!(mem::size_of::<ImageBorder>(), 68, "ImageBorder size changed"); - assert_eq!(mem::size_of::<ImageBorderTemplate>(), 96, "ImageBorderTemplate size changed"); + assert_eq!(mem::size_of::<ImageBorderTemplate>(), 104, "ImageBorderTemplate size changed"); assert_eq!(mem::size_of::<ImageBorderKey>(), 88, "ImageBorderKey size changed"); } diff --git a/gfx/wr/webrender/src/prim_store/gradient/conic.rs b/gfx/wr/webrender/src/prim_store/gradient/conic.rs @@ -17,7 +17,7 @@ use crate::scene_building::IsVisible; use crate::frame_builder::FrameBuildingState; use crate::intern::{Internable, InternDebug, Handle as InternHandle}; use crate::internal_types::LayoutPrimitiveInfo; -use crate::prim_store::{BrushSegment, GradientTileRange, VECS_PER_SEGMENT}; +use crate::prim_store::{BrushSegment, GradientTileRange}; use crate::prim_store::{PrimitiveInstanceKind, PrimitiveOpacity, FloatKey}; use crate::prim_store::{PrimKeyCommonData, PrimTemplateCommonData, PrimitiveStore}; use crate::prim_store::{NinePatchDescriptor, PointKey, SizeKey, InternablePrimitive}; @@ -261,23 +261,27 @@ impl ConicGradientTemplate { &mut self, frame_state: &mut FrameBuildingState, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT); - // write_prim_gpu_blocks - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ - self.stretch_size.width, - self.stretch_size.height, - 0.0, - 0.0, - ]); - // write_segment_gpu_blocks - for segment in &self.brush_segments { - // has to match VECS_PER_SEGMENT - writer.push_one(segment.local_rect); - writer.push_one(segment.extra_data); + if let Some(mut request) = + frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) { + // write_prim_gpu_blocks + request.push(PremultipliedColorF::WHITE); + request.push(PremultipliedColorF::WHITE); + request.push([ + self.stretch_size.width, + self.stretch_size.height, + 0.0, + 0.0, + ]); + + // write_segment_gpu_blocks + for segment in &self.brush_segments { + // has to match VECS_PER_SEGMENT + request.write_segment( + segment.local_rect, + segment.extra_data, + ); + } } - self.common.gpu_buffer_address = writer.finish(); let cache_key = ConicGradientCacheKey { size: self.task_size, @@ -297,10 +301,11 @@ impl ConicGradientTemplate { }), false, RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, gpu_buffer_builder| { + &mut |rg_builder, gpu_buffer_builder, _| { let stops = GradientGpuBlockBuilder::build( false, gpu_buffer_builder, diff --git a/gfx/wr/webrender/src/prim_store/gradient/linear.rs b/gfx/wr/webrender/src/prim_store/gradient/linear.rs @@ -19,7 +19,7 @@ use crate::frame_builder::FrameBuildingState; use crate::intern::{Internable, InternDebug, Handle as InternHandle}; use crate::internal_types::LayoutPrimitiveInfo; use crate::image_tiling::simplify_repeated_primitive; -use crate::prim_store::{BrushSegment, GradientTileRange, VECS_PER_SEGMENT}; +use crate::prim_store::{BrushSegment, GradientTileRange}; use crate::prim_store::{PrimitiveInstanceKind, PrimitiveOpacity}; use crate::prim_store::{PrimKeyCommonData, PrimTemplateCommonData, PrimitiveStore}; use crate::prim_store::{NinePatchDescriptor, PointKey, SizeKey, InternablePrimitive}; @@ -494,44 +494,47 @@ impl LinearGradientTemplate { &mut self, frame_state: &mut FrameBuildingState, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT); - - // Write_prim_gpu_blocks - if self.cached { - // We are using the image brush. - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ - self.stretch_size.width, - self.stretch_size.height, - 0.0, - 0.0, - ]); - } else { - // We are using the gradient brush. - writer.push_one([ - self.start_point.x, - self.start_point.y, - self.end_point.x, - self.end_point.y, - ]); - writer.push_one([ - pack_as_float(self.extend_mode as u32), - self.stretch_size.width, - self.stretch_size.height, - 0.0, - ]); - } + if let Some(mut request) = frame_state.gpu_cache.request( + &mut self.common.gpu_cache_handle + ) { + + // Write_prim_gpu_blocks + if self.cached { + // We are using the image brush. + request.push(PremultipliedColorF::WHITE); + request.push(PremultipliedColorF::WHITE); + request.push([ + self.stretch_size.width, + self.stretch_size.height, + 0.0, + 0.0, + ]); + } else { + // We are using the gradient brush. + request.push([ + self.start_point.x, + self.start_point.y, + self.end_point.x, + self.end_point.y, + ]); + request.push([ + pack_as_float(self.extend_mode as u32), + self.stretch_size.width, + self.stretch_size.height, + 0.0, + ]); + } - // write_segment_gpu_blocks - for segment in &self.brush_segments { - // has to match VECS_PER_SEGMENT - writer.push_one(segment.local_rect); - writer.push_one(segment.extra_data); + // write_segment_gpu_blocks + for segment in &self.brush_segments { + // has to match VECS_PER_SEGMENT + request.write_segment( + segment.local_rect, + segment.extra_data, + ); + } } - self.common.gpu_buffer_address = writer.finish(); - // Tile spacing is always handled by decomposing into separate draw calls so the // primitive opacity is equivalent to stops opacity. This might change to being // set to non-opaque in the presence of tile spacing if/when tile spacing is handled @@ -562,10 +565,11 @@ impl LinearGradientTemplate { }), false, RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, _| { + &mut |rg_builder, _, _| { rg_builder.add().init(RenderTask::new_dynamic( self.task_size, RenderTaskKind::FastLinearGradient(gradient), @@ -590,10 +594,11 @@ impl LinearGradientTemplate { }), false, RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, gpu_buffer_builder| { + &mut |rg_builder, gpu_buffer_builder, _| { let stops = Some(GradientGpuBlockBuilder::build( self.reverse_stops, gpu_buffer_builder, diff --git a/gfx/wr/webrender/src/prim_store/gradient/mod.rs b/gfx/wr/webrender/src/prim_store/gradient/mod.rs @@ -590,14 +590,14 @@ fn test_struct_sizes() { // (b) You made a structure larger. This is not necessarily a problem, but should only // be done with care, and after checking if talos performance regresses badly. assert_eq!(mem::size_of::<LinearGradient>(), 72, "LinearGradient size changed"); - assert_eq!(mem::size_of::<LinearGradientTemplate>(), 136, "LinearGradientTemplate size changed"); + assert_eq!(mem::size_of::<LinearGradientTemplate>(), 144, "LinearGradientTemplate size changed"); assert_eq!(mem::size_of::<LinearGradientKey>(), 96, "LinearGradientKey size changed"); assert_eq!(mem::size_of::<RadialGradient>(), 72, "RadialGradient size changed"); - assert_eq!(mem::size_of::<RadialGradientTemplate>(), 136, "RadialGradientTemplate size changed"); + assert_eq!(mem::size_of::<RadialGradientTemplate>(), 144, "RadialGradientTemplate size changed"); assert_eq!(mem::size_of::<RadialGradientKey>(), 96, "RadialGradientKey size changed"); assert_eq!(mem::size_of::<ConicGradient>(), 72, "ConicGradient size changed"); - assert_eq!(mem::size_of::<ConicGradientTemplate>(), 136, "ConicGradientTemplate size changed"); + assert_eq!(mem::size_of::<ConicGradientTemplate>(), 144, "ConicGradientTemplate size changed"); assert_eq!(mem::size_of::<ConicGradientKey>(), 96, "ConicGradientKey size changed"); } diff --git a/gfx/wr/webrender/src/prim_store/gradient/radial.rs b/gfx/wr/webrender/src/prim_store/gradient/radial.rs @@ -17,7 +17,7 @@ use crate::scene_building::IsVisible; use crate::frame_builder::FrameBuildingState; use crate::intern::{Internable, InternDebug, Handle as InternHandle}; use crate::internal_types::LayoutPrimitiveInfo; -use crate::prim_store::{BrushSegment, GradientTileRange, InternablePrimitive, VECS_PER_SEGMENT}; +use crate::prim_store::{BrushSegment, GradientTileRange, InternablePrimitive}; use crate::prim_store::{PrimitiveInstanceKind, PrimitiveOpacity}; use crate::prim_store::{PrimKeyCommonData, PrimTemplateCommonData, PrimitiveStore}; use crate::prim_store::{NinePatchDescriptor, PointKey, SizeKey, FloatKey}; @@ -228,24 +228,27 @@ impl RadialGradientTemplate { &mut self, frame_state: &mut FrameBuildingState, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT); - - // write_prim_gpu_blocks - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ - self.stretch_size.width, - self.stretch_size.height, - 0.0, - 0.0, - ]); - // write_segment_gpu_blocks - for segment in &self.brush_segments { - // has to match VECS_PER_SEGMENT - writer.push_one(segment.local_rect); - writer.push_one(segment.extra_data); + if let Some(mut request) = + frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) { + // write_prim_gpu_blocks + request.push(PremultipliedColorF::WHITE); + request.push(PremultipliedColorF::WHITE); + request.push([ + self.stretch_size.width, + self.stretch_size.height, + 0.0, + 0.0, + ]); + + // write_segment_gpu_blocks + for segment in &self.brush_segments { + // has to match VECS_PER_SEGMENT + request.write_segment( + segment.local_rect, + segment.extra_data, + ); + } } - self.common.gpu_buffer_address = writer.finish(); let task_size = self.task_size; let cache_key = RadialGradientCacheKey { @@ -266,10 +269,11 @@ impl RadialGradientTemplate { }), false, RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, gpu_buffer_builder| { + &mut |rg_builder, gpu_buffer_builder, _| { let stops = GradientGpuBlockBuilder::build( false, gpu_buffer_builder, diff --git a/gfx/wr/webrender/src/prim_store/image.rs b/gfx/wr/webrender/src/prim_store/image.rs @@ -10,9 +10,9 @@ use api::{ use api::units::*; use euclid::point2; use crate::composite::CompositorSurfaceKind; -use crate::renderer::{GpuBufferBuilderF, GpuBufferWriterF}; use crate::scene_building::{CreateShadow, IsVisible}; use crate::frame_builder::{FrameBuildingContext, FrameBuildingState}; +use crate::gpu_cache::{GpuCache, GpuDataRequest}; use crate::intern::{Internable, InternDebug, Handle as InternHandle}; use crate::internal_types::LayoutPrimitiveInfo; use crate::prim_store::{ @@ -192,7 +192,7 @@ impl ImageData { let mut size = frame_state.resource_cache.request_image( request, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, ); let mut task_id = frame_state.rg_builder.add().init( @@ -274,10 +274,11 @@ impl ImageData { }), descriptor.is_opaque(), RenderTaskParent::Surface, + frame_state.gpu_cache, &mut frame_state.frame_gpu_data.f32, frame_state.rg_builder, &mut frame_state.surface_builder, - &mut |rg_builder, _| { + &mut |rg_builder, _, _| { // Create a task to blit from the texture cache to // a normal transient render task surface. // TODO: figure out if/when we can do a blit instead. @@ -356,7 +357,7 @@ impl ImageData { let request = request.with_tile(tile.offset); let size = frame_state.resource_cache.request_image( request, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, ); let task_id = frame_state.rg_builder.add().init( @@ -389,19 +390,19 @@ impl ImageData { ); } - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3); - self.write_prim_gpu_blocks(&image_instance.adjustment, &mut writer); - common.gpu_buffer_address = writer.finish(); + if let Some(mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) { + self.write_prim_gpu_blocks(&image_instance.adjustment, &mut request); + } } - pub fn write_prim_gpu_blocks(&self, adjustment: &AdjustedImageSource, writer: &mut GpuBufferWriterF) { + pub fn write_prim_gpu_blocks(&self, adjustment: &AdjustedImageSource, request: &mut GpuDataRequest) { let stretch_size = adjustment.map_stretch_size(self.stretch_size); // Images are drawn as a white color, modulated by the total // opacity coming from any collapsed property bindings. // Size has to match `VECS_PER_SPECIFIC_BRUSH` from `brush_image.glsl` exactly. - writer.push_one(self.color.premultiplied()); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ + request.push(self.color.premultiplied()); + request.push(PremultipliedColorF::WHITE); + request.push([ stretch_size.width + self.tile_spacing.width, stretch_size.height + self.tile_spacing.height, 0.0, @@ -672,7 +673,7 @@ impl YuvImageData { let size = frame_state.resource_cache.request_image( request, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, ); let task_id = frame_state.rg_builder.add().init( @@ -686,18 +687,18 @@ impl YuvImageData { self.src_yuv[channel] = Some(task_id); } - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(1); - self.write_prim_gpu_blocks(&mut writer); - common.gpu_buffer_address = writer.finish(); + if let Some(mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) { + self.write_prim_gpu_blocks(&mut request); + }; - // YUV images never have transparency + // YUV images never have transparency common.opacity = PrimitiveOpacity::opaque(); } pub fn request_resources( &mut self, resource_cache: &mut ResourceCache, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, ) { let channel_num = self.format.get_plane_num(); debug_assert!(channel_num <= 3); @@ -708,14 +709,14 @@ impl YuvImageData { rendering: self.image_rendering, tile: None, }, - gpu_buffer, + gpu_cache, ); } } - pub fn write_prim_gpu_blocks(&self, writer: &mut GpuBufferWriterF) { + pub fn write_prim_gpu_blocks(&self, request: &mut GpuDataRequest) { let ranged_color_space = self.color_space.with_range(self.color_range); - writer.push_one([ + request.push([ pack_as_float(self.color_depth.bit_depth()), pack_as_float(ranged_color_space as u32), pack_as_float(self.format as u32), @@ -784,9 +785,9 @@ fn test_struct_sizes() { // (b) You made a structure larger. This is not necessarily a problem, but should only // be done with care, and after checking if talos performance regresses badly. assert_eq!(mem::size_of::<Image>(), 32, "Image size changed"); - assert_eq!(mem::size_of::<ImageTemplate>(), 68, "ImageTemplate size changed"); + assert_eq!(mem::size_of::<ImageTemplate>(), 72, "ImageTemplate size changed"); assert_eq!(mem::size_of::<ImageKey>(), 52, "ImageKey size changed"); assert_eq!(mem::size_of::<YuvImage>(), 32, "YuvImage size changed"); - assert_eq!(mem::size_of::<YuvImageTemplate>(), 80, "YuvImageTemplate size changed"); + assert_eq!(mem::size_of::<YuvImageTemplate>(), 84, "YuvImageTemplate size changed"); assert_eq!(mem::size_of::<YuvImageKey>(), 52, "YuvImageKey size changed"); } diff --git a/gfx/wr/webrender/src/prim_store/line_dec.rs b/gfx/wr/webrender/src/prim_store/line_dec.rs @@ -7,9 +7,9 @@ use api::{ LineOrientation, LineStyle, PremultipliedColorF, Shadow, }; use api::units::*; -use crate::renderer::GpuBufferWriterF; use crate::scene_building::{CreateShadow, IsVisible}; use crate::frame_builder::FrameBuildingState; +use crate::gpu_cache::GpuDataRequest; use crate::intern; use crate::internal_types::LayoutPrimitiveInfo; use crate::prim_store::{ @@ -78,20 +78,20 @@ impl LineDecorationData { common: &mut PrimTemplateCommonData, frame_state: &mut FrameBuildingState, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3); - self.write_prim_gpu_blocks(&mut writer); - common.gpu_buffer_address = writer.finish(); + if let Some(ref mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) { + self.write_prim_gpu_blocks(request); + } } fn write_prim_gpu_blocks( &self, - writer: &mut GpuBufferWriterF + request: &mut GpuDataRequest ) { match self.cache_key.as_ref() { Some(cache_key) => { - writer.push_one(self.color.premultiplied()); - writer.push_one(PremultipliedColorF::WHITE); - writer.push_one([ + request.push(self.color.premultiplied()); + request.push(PremultipliedColorF::WHITE); + request.push([ cache_key.size.width.to_f32_px(), cache_key.size.height.to_f32_px(), 0.0, @@ -99,7 +99,7 @@ impl LineDecorationData { ]); } None => { - writer.push_one(self.color.premultiplied()); + request.push(self.color.premultiplied()); } } } @@ -251,6 +251,6 @@ fn test_struct_sizes() { // (b) You made a structure larger. This is not necessarily a problem, but should only // be done with care, and after checking if talos performance regresses badly. assert_eq!(mem::size_of::<LineDecoration>(), 20, "LineDecoration size changed"); - assert_eq!(mem::size_of::<LineDecorationTemplate>(), 56, "LineDecorationTemplate size changed"); + assert_eq!(mem::size_of::<LineDecorationTemplate>(), 60, "LineDecorationTemplate size changed"); assert_eq!(mem::size_of::<LineDecorationKey>(), 40, "LineDecorationKey size changed"); } diff --git a/gfx/wr/webrender/src/prim_store/mod.rs b/gfx/wr/webrender/src/prim_store/mod.rs @@ -13,7 +13,6 @@ use crate::composite::CompositorSurfaceKind; use crate::clip::ClipLeafId; use crate::pattern::{Pattern, PatternBuilder, PatternBuilderContext, PatternBuilderState}; use crate::quad::QuadTileClassifier; -use crate::renderer::{GpuBufferAddress, GpuBufferWriterF}; use crate::segment::EdgeAaSegmentMask; use crate::border::BorderSegmentCacheKey; use crate::debug_item::{DebugItem, DebugMessage}; @@ -21,6 +20,7 @@ use crate::debug_colors; use crate::scene_building::{CreateShadow, IsVisible}; use crate::frame_builder::FrameBuildingState; use glyph_rasterizer::GlyphKey; +use crate::gpu_cache::{GpuCacheAddress, GpuCacheHandle, GpuDataRequest}; use crate::gpu_types::{BrushFlags, QuadSegment}; use crate::intern; use crate::picture::PicturePrimitive; @@ -90,7 +90,7 @@ impl PrimitiveOpacity { #[cfg_attr(feature = "capture", derive(Serialize))] #[cfg_attr(feature = "replay", derive(Deserialize))] pub struct DeferredResolve { - pub address: GpuBufferAddress, + pub address: GpuCacheAddress, pub image_properties: ImageProperties, pub rendering: ImageRendering, pub is_composited: bool, @@ -488,16 +488,16 @@ impl PrimitiveTemplateKind { /// Write any GPU blocks for the primitive template to the given request object. pub fn write_prim_gpu_blocks( &self, - writer: &mut GpuBufferWriterF, + request: &mut GpuDataRequest, scene_properties: &SceneProperties, ) { match *self { PrimitiveTemplateKind::Clear => { // Opaque black with operator dest out - writer.push_one(PremultipliedColorF::BLACK); + request.push(PremultipliedColorF::BLACK); } PrimitiveTemplateKind::Rectangle { ref color, .. } => { - writer.push_one(scene_properties.resolve_color(color).premultiplied()) + request.push(scene_properties.resolve_color(color).premultiplied()) } } } @@ -530,12 +530,11 @@ pub struct PrimTemplateCommonData { pub may_need_repetition: bool, pub prim_rect: LayoutRect, pub opacity: PrimitiveOpacity, - /// Address of the per-primitive data in the GPU cache. - /// - /// TODO: This is only valid during the current frame and must - /// be overwritten each frame. We should move this out of the - /// common data to avoid accidental reuse. - pub gpu_buffer_address: GpuBufferAddress, + /// The GPU cache handle for a primitive template. Since this structure + /// is retained across display lists by interning, this GPU cache handle + /// also remains valid, which reduces the number of updates to the GPU + /// cache when a new display list is processed. + pub gpu_cache_handle: GpuCacheHandle, /// Specifies the edges that are *allowed* to have anti-aliasing. /// In other words EdgeAaSegmentFlags::all() does not necessarily mean all edges will /// be anti-aliased, only that they could be. @@ -550,7 +549,7 @@ impl PrimTemplateCommonData { flags: common.flags, may_need_repetition: true, prim_rect: common.prim_rect.into(), - gpu_buffer_address: GpuBufferAddress::INVALID, + gpu_cache_handle: GpuCacheHandle::new(), opacity: PrimitiveOpacity::translucent(), edge_aa_mask: EdgeAaSegmentMask::all(), } @@ -640,9 +639,9 @@ impl PrimitiveTemplate { frame_state: &mut FrameBuildingState, scene_properties: &SceneProperties, ) { - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(1); - self.kind.write_prim_gpu_blocks(&mut writer, scene_properties); - self.common.gpu_buffer_address = writer.finish(); + if let Some(mut request) = frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) { + self.kind.write_prim_gpu_blocks(&mut request, scene_properties); + } self.opacity = match self.kind { PrimitiveTemplateKind::Clear => { @@ -713,7 +712,7 @@ pub struct VisibleMaskImageTile { #[derive(Debug)] #[cfg_attr(feature = "capture", derive(Serialize))] pub struct VisibleGradientTile { - pub address: GpuBufferAddress, + pub handle: GpuCacheHandle, pub local_rect: LayoutRect, pub local_clip_rect: LayoutRect, } @@ -1203,7 +1202,7 @@ impl PrimitiveInstance { #[cfg_attr(feature = "capture", derive(Serialize))] #[derive(Debug)] pub struct SegmentedInstance { - pub gpu_data: GpuBufferAddress, + pub gpu_cache_handle: GpuCacheHandle, pub segments_range: SegmentsRange, } @@ -1556,7 +1555,7 @@ fn test_struct_sizes() { // be done with care, and after checking if talos performance regresses badly. assert_eq!(mem::size_of::<PrimitiveInstance>(), 88, "PrimitiveInstance size changed"); assert_eq!(mem::size_of::<PrimitiveInstanceKind>(), 24, "PrimitiveInstanceKind size changed"); - assert_eq!(mem::size_of::<PrimitiveTemplate>(), 52, "PrimitiveTemplate size changed"); + assert_eq!(mem::size_of::<PrimitiveTemplate>(), 56, "PrimitiveTemplate size changed"); assert_eq!(mem::size_of::<PrimitiveTemplateKind>(), 28, "PrimitiveTemplateKind size changed"); assert_eq!(mem::size_of::<PrimitiveKey>(), 36, "PrimitiveKey size changed"); assert_eq!(mem::size_of::<PrimitiveKeyKind>(), 16, "PrimitiveKeyKind size changed"); diff --git a/gfx/wr/webrender/src/prim_store/text_run.rs b/gfx/wr/webrender/src/prim_store/text_run.rs @@ -8,12 +8,13 @@ use api::units::*; use crate::scene_building::{CreateShadow, IsVisible}; use crate::frame_builder::FrameBuildingState; use glyph_rasterizer::{FontInstance, FontTransform, GlyphKey, FONT_SIZE_LIMIT}; +use crate::gpu_cache::GpuCache; use crate::intern; use crate::internal_types::LayoutPrimitiveInfo; use crate::picture::SurfaceInfo; use crate::prim_store::{PrimitiveOpacity, PrimitiveScratchBuffer}; use crate::prim_store::{PrimitiveStore, PrimKeyCommonData, PrimTemplateCommonData}; -use crate::renderer::{GpuBufferBuilderF, MAX_VERTEX_TEXTURE_WIDTH}; +use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH; use crate::resource_cache::ResourceCache; use crate::util::MatrixHelpers; use crate::prim_store::{InternablePrimitive, PrimitiveInstanceKind, LayoutPointAu}; @@ -135,32 +136,32 @@ impl TextRunTemplate { &mut self, frame_state: &mut FrameBuildingState, ) { - // Corresponds to `fetch_glyph` in the shaders. - let num_blocks = (self.glyphs.len() + 1) / 2 + 1; - assert!(num_blocks <= MAX_VERTEX_TEXTURE_WIDTH); - let mut writer = frame_state.frame_gpu_data.f32.write_blocks(num_blocks); - writer.push_one(ColorF::from(self.font.color).premultiplied()); - - let mut gpu_block = [0.0; 4]; - for (i, src) in self.glyphs.iter().enumerate() { - // Two glyphs are packed per GPU block. - if (i & 1) == 0 { - gpu_block[0] = src.point.x; - gpu_block[1] = src.point.y; - } else { - gpu_block[2] = src.point.x; - gpu_block[3] = src.point.y; - writer.push_one(gpu_block); + // corresponds to `fetch_glyph` in the shaders + if let Some(mut request) = frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) { + request.push(ColorF::from(self.font.color).premultiplied()); + + let mut gpu_block = [0.0; 4]; + for (i, src) in self.glyphs.iter().enumerate() { + // Two glyphs are packed per GPU block. + + if (i & 1) == 0 { + gpu_block[0] = src.point.x; + gpu_block[1] = src.point.y; + } else { + gpu_block[2] = src.point.x; + gpu_block[3] = src.point.y; + request.push(gpu_block); + } } - } - // Ensure the last block is added in the case - // of an odd number of glyphs. - if (self.glyphs.len() & 1) != 0 { - writer.push_one(gpu_block); - } + // Ensure the last block is added in the case + // of an odd number of glyphs. + if (self.glyphs.len() & 1) != 0 { + request.push(gpu_block); + } - self.common.gpu_buffer_address = writer.finish(); + assert!(request.current_used_block_num() <= MAX_VERTEX_TEXTURE_WIDTH); + } } } @@ -465,7 +466,7 @@ impl TextRunPrimitive { allow_subpixel: bool, low_quality_pinch_zoom: bool, resource_cache: &mut ResourceCache, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, spatial_tree: &SpatialTree, scratch: &mut PrimitiveScratchBuffer, ) { @@ -506,7 +507,7 @@ impl TextRunPrimitive { resource_cache.request_glyphs( self.used_font.clone(), &scratch.glyph_keys[self.glyph_keys_range], - gpu_buffer, + gpu_cache, ); } } @@ -523,7 +524,7 @@ fn test_struct_sizes() { // (b) You made a structure larger. This is not necessarily a problem, but should only // be done with care, and after checking if talos performance regresses badly. assert_eq!(mem::size_of::<TextRun>(), 88, "TextRun size changed"); - assert_eq!(mem::size_of::<TextRunTemplate>(), 88, "TextRunTemplate size changed"); + assert_eq!(mem::size_of::<TextRunTemplate>(), 96, "TextRunTemplate size changed"); assert_eq!(mem::size_of::<TextRunKey>(), 104, "TextRunKey size changed"); assert_eq!(mem::size_of::<TextRunPrimitive>(), 80, "TextRunPrimitive size changed"); } diff --git a/gfx/wr/webrender/src/profiler.rs b/gfx/wr/webrender/src/profiler.rs @@ -27,7 +27,7 @@ use crate::renderer::DebugRenderer; use crate::device::query::GpuTimer; use euclid::{Point2D, Rect, Size2D, vec2, default}; use crate::internal_types::FastHashMap; -use crate::renderer::{FullFrameStats, init::wr_has_been_initialized}; +use crate::renderer::{FullFrameStats, MAX_VERTEX_TEXTURE_WIDTH, init::wr_has_been_initialized}; use api::units::DeviceIntSize; use std::collections::vec_deque::VecDeque; use std::fmt::{Write, Debug}; @@ -150,134 +150,144 @@ pub const UPLOAD_NUM_COPY_BATCHES: usize = 23; pub const TOTAL_UPLOAD_TIME: usize = 24; pub const CREATE_CACHE_TEXTURE_TIME: usize = 25; pub const DELETE_CACHE_TEXTURE_TIME: usize = 26; +pub const GPU_CACHE_UPLOAD_TIME: usize = 27; -pub const RASTERIZED_BLOBS: usize = 27; -pub const RASTERIZED_BLOB_TILES: usize = 28; -pub const RASTERIZED_BLOBS_PX: usize = 29; -pub const BLOB_RASTERIZATION_TIME: usize = 30; +pub const RASTERIZED_BLOBS: usize = 28; +pub const RASTERIZED_BLOB_TILES: usize = 29; +pub const RASTERIZED_BLOBS_PX: usize = 30; +pub const BLOB_RASTERIZATION_TIME: usize = 31; -pub const RASTERIZED_GLYPHS: usize = 31; -pub const GLYPH_RESOLVE_TIME: usize = 32; +pub const RASTERIZED_GLYPHS: usize = 32; +pub const GLYPH_RESOLVE_TIME: usize = 33; -pub const DRAW_CALLS: usize = 33; -pub const VERTICES: usize = 34; -pub const PRIMITIVES: usize = 35; -pub const VISIBLE_PRIMITIVES: usize = 36; +pub const DRAW_CALLS: usize = 34; +pub const VERTICES: usize = 35; +pub const PRIMITIVES: usize = 36; +pub const VISIBLE_PRIMITIVES: usize = 37; -pub const USED_TARGETS: usize = 37; -pub const CREATED_TARGETS: usize = 38; -pub const PICTURE_CACHE_SLICES: usize = 39; +pub const USED_TARGETS: usize = 38; +pub const CREATED_TARGETS: usize = 39; +pub const PICTURE_CACHE_SLICES: usize = 40; -pub const COLOR_PASSES: usize = 40; -pub const ALPHA_PASSES: usize = 41; -pub const PICTURE_TILES: usize = 42; -pub const RENDERED_PICTURE_TILES: usize = 43; +pub const COLOR_PASSES: usize = 41; +pub const ALPHA_PASSES: usize = 42; +pub const PICTURE_TILES: usize = 43; +pub const RENDERED_PICTURE_TILES: usize = 44; -pub const FONT_TEMPLATES: usize = 44; -pub const FONT_TEMPLATES_MEM: usize = 45; -pub const IMAGE_TEMPLATES: usize = 46; -pub const IMAGE_TEMPLATES_MEM: usize = 47; +pub const FONT_TEMPLATES: usize = 45; +pub const FONT_TEMPLATES_MEM: usize = 46; +pub const IMAGE_TEMPLATES: usize = 47; +pub const IMAGE_TEMPLATES_MEM: usize = 48; + +pub const GPU_CACHE_ROWS_TOTAL: usize = 49; +pub const GPU_CACHE_ROWS_UPDATED: usize = 50; +pub const GPU_CACHE_BLOCKS_TOTAL: usize = 51; +pub const GPU_CACHE_BLOCKS_UPDATED: usize = 52; +pub const GPU_CACHE_BLOCKS_SAVED: usize = 53; // Atlas items represents the area occupied by items in the cache textures. // The actual texture memory allocated is ATLAS_TEXTURES_MEM. -pub const ATLAS_ITEMS_MEM: usize = 48; -pub const ATLAS_A8_PIXELS: usize = 49; -pub const ATLAS_A8_TEXTURES: usize = 50; -pub const ATLAS_A16_PIXELS: usize = 51; -pub const ATLAS_A16_TEXTURES: usize = 52; -pub const ATLAS_RGBA8_LINEAR_PIXELS: usize = 53; -pub const ATLAS_RGBA8_LINEAR_TEXTURES: usize = 54; -pub const ATLAS_RGBA8_NEAREST_PIXELS: usize = 55; -pub const ATLAS_RGBA8_NEAREST_TEXTURES: usize = 56; -pub const ATLAS_RGBA8_GLYPHS_PIXELS: usize = 57; -pub const ATLAS_RGBA8_GLYPHS_TEXTURES: usize = 58; -pub const ATLAS_A8_GLYPHS_PIXELS: usize = 59; -pub const ATLAS_A8_GLYPHS_TEXTURES: usize = 60; -pub const ATLAS_COLOR8_LINEAR_PRESSURE: usize = 61; -pub const ATLAS_COLOR8_NEAREST_PRESSURE: usize = 62; -pub const ATLAS_COLOR8_GLYPHS_PRESSURE: usize = 63; -pub const ATLAS_ALPHA8_PRESSURE: usize = 64; -pub const ATLAS_ALPHA8_GLYPHS_PRESSURE: usize = 65; -pub const ATLAS_ALPHA16_PRESSURE: usize = 66; -pub const ATLAS_STANDALONE_PRESSURE: usize = 67; - -pub const TEXTURE_CACHE_EVICTION_COUNT: usize = 68; -pub const TEXTURE_CACHE_YOUNGEST_EVICTION: usize = 69; -pub const EXTERNAL_IMAGE_BYTES: usize = 70; -pub const ATLAS_TEXTURES_MEM: usize = 71; -pub const STANDALONE_TEXTURES_MEM: usize = 72; -pub const PICTURE_TILES_MEM: usize = 73; -pub const RENDER_TARGET_MEM: usize = 74; - -pub const ALPHA_TARGETS_SAMPLERS: usize = 75; -pub const TRANSPARENT_PASS_SAMPLERS: usize = 76; -pub const OPAQUE_PASS_SAMPLERS: usize = 77; -pub const TOTAL_SAMPLERS: usize = 78; - -pub const INTERNED_PRIMITIVES: usize = 79; -pub const INTERNED_CLIPS: usize = 80; -pub const INTERNED_TEXT_RUNS: usize = 81; -pub const INTERNED_NORMAL_BORDERS: usize = 82; -pub const INTERNED_IMAGE_BORDERS: usize = 83; -pub const INTERNED_IMAGES: usize = 84; -pub const INTERNED_YUV_IMAGES: usize = 85; -pub const INTERNED_LINE_DECORATIONS: usize = 86; -pub const INTERNED_LINEAR_GRADIENTS: usize = 87; -pub const INTERNED_RADIAL_GRADIENTS: usize = 88; -pub const INTERNED_CONIC_GRADIENTS: usize = 89; -pub const INTERNED_PICTURES: usize = 90; -pub const INTERNED_FILTER_DATA: usize = 91; -pub const INTERNED_BACKDROP_CAPTURES: usize = 92; -pub const INTERNED_BACKDROP_RENDERS: usize = 93; -pub const INTERNED_POLYGONS: usize = 94; -pub const INTERNED_BOX_SHADOWS: usize = 95; -pub const DEPTH_TARGETS_MEM: usize = 96; - -pub const SHADER_BUILD_TIME: usize = 97; - -pub const RENDER_REASON_FIRST: usize = 98; -pub const RENDER_REASON_SCENE: usize = 99; -pub const RENDER_REASON_ANIMATED_PROPERTY: usize = 100; -pub const RENDER_REASON_RESOURCE_UPDATE: usize = 101; -pub const RENDER_REASON_ASYNC_IMAGE: usize = 102; -pub const RENDER_REASON_CLEAR_RESOURCES: usize = 103; -pub const RENDER_REASON_APZ: usize = 104; -pub const RENDER_REASON_RESIZE: usize = 105; -pub const RENDER_REASON_WIDGET: usize = 106; -pub const RENDER_REASON_TEXTURE_CACHE_FLUSH: usize = 107; -pub const RENDER_REASON_SNAPSHOT: usize = 108; -pub const RENDER_REASON_POST_RESOURCE_UPDATE_HOOKS: usize = 109; -pub const RENDER_REASON_CONFIG_CHANGE: usize = 110; -pub const RENDER_REASON_CONTENT_SYNC: usize = 111; -pub const RENDER_REASON_FLUSH: usize = 112; -pub const RENDER_REASON_TESTING: usize = 113; -pub const RENDER_REASON_OTHER: usize = 114; -pub const RENDER_REASON_VSYNC: usize = 115; - -pub const TEXTURES_CREATED: usize = 116; -pub const TEXTURES_DELETED: usize = 117; - -pub const SLOW_FRAME_CPU_COUNT: usize = 118; -pub const SLOW_FRAME_GPU_COUNT: usize = 119; -pub const SLOW_FRAME_BUILD_COUNT: usize = 120; -pub const SLOW_UPLOAD_COUNT: usize = 121; -pub const SLOW_RENDER_COUNT: usize = 122; -pub const SLOW_DRAW_CALLS_COUNT: usize = 123; -pub const SLOW_TARGETS_COUNT: usize = 124; -pub const SLOW_BLOB_COUNT: usize = 125; -pub const SLOW_SCROLL_AFTER_SCENE_COUNT: usize = 126; - -pub const GPU_BUFFER_MEM: usize = 127; -pub const GPU_TOTAL_MEM: usize = 128; - -pub const FRAME_SEND_TIME: usize = 129; -pub const UPDATE_DOCUMENT_TIME: usize = 130; - -pub const COMPOSITOR_SURFACE_UNDERLAYS: usize = 131; -pub const COMPOSITOR_SURFACE_OVERLAYS: usize = 132; -pub const COMPOSITOR_SURFACE_BLITS: usize = 133; - -pub const NUM_PROFILER_EVENTS: usize = 134; +pub const ATLAS_ITEMS_MEM: usize = 54; +pub const ATLAS_A8_PIXELS: usize = 55; +pub const ATLAS_A8_TEXTURES: usize = 56; +pub const ATLAS_A16_PIXELS: usize = 57; +pub const ATLAS_A16_TEXTURES: usize = 58; +pub const ATLAS_RGBA8_LINEAR_PIXELS: usize = 59; +pub const ATLAS_RGBA8_LINEAR_TEXTURES: usize = 60; +pub const ATLAS_RGBA8_NEAREST_PIXELS: usize = 61; +pub const ATLAS_RGBA8_NEAREST_TEXTURES: usize = 62; +pub const ATLAS_RGBA8_GLYPHS_PIXELS: usize = 63; +pub const ATLAS_RGBA8_GLYPHS_TEXTURES: usize = 64; +pub const ATLAS_A8_GLYPHS_PIXELS: usize = 65; +pub const ATLAS_A8_GLYPHS_TEXTURES: usize = 66; +pub const ATLAS_COLOR8_LINEAR_PRESSURE: usize = 67; +pub const ATLAS_COLOR8_NEAREST_PRESSURE: usize = 68; +pub const ATLAS_COLOR8_GLYPHS_PRESSURE: usize = 69; +pub const ATLAS_ALPHA8_PRESSURE: usize = 70; +pub const ATLAS_ALPHA8_GLYPHS_PRESSURE: usize = 71; +pub const ATLAS_ALPHA16_PRESSURE: usize = 72; +pub const ATLAS_STANDALONE_PRESSURE: usize = 73; + +pub const TEXTURE_CACHE_EVICTION_COUNT: usize = 74; +pub const TEXTURE_CACHE_YOUNGEST_EVICTION: usize = 75; +pub const EXTERNAL_IMAGE_BYTES: usize = 76; +pub const ATLAS_TEXTURES_MEM: usize = 77; +pub const STANDALONE_TEXTURES_MEM: usize = 78; +pub const PICTURE_TILES_MEM: usize = 79; +pub const RENDER_TARGET_MEM: usize = 80; + +pub const ALPHA_TARGETS_SAMPLERS: usize = 81; +pub const TRANSPARENT_PASS_SAMPLERS: usize = 82; +pub const OPAQUE_PASS_SAMPLERS: usize = 83; +pub const TOTAL_SAMPLERS: usize = 84; + +pub const INTERNED_PRIMITIVES: usize = 85; +pub const INTERNED_CLIPS: usize = 86; +pub const INTERNED_TEXT_RUNS: usize = 87; +pub const INTERNED_NORMAL_BORDERS: usize = 88; +pub const INTERNED_IMAGE_BORDERS: usize = 89; +pub const INTERNED_IMAGES: usize = 90; +pub const INTERNED_YUV_IMAGES: usize = 91; +pub const INTERNED_LINE_DECORATIONS: usize = 92; +pub const INTERNED_LINEAR_GRADIENTS: usize = 93; +pub const INTERNED_RADIAL_GRADIENTS: usize = 94; +pub const INTERNED_CONIC_GRADIENTS: usize = 95; +pub const INTERNED_PICTURES: usize = 96; +pub const INTERNED_FILTER_DATA: usize = 97; +pub const INTERNED_BACKDROP_CAPTURES: usize = 98; +pub const INTERNED_BACKDROP_RENDERS: usize = 99; +pub const INTERNED_POLYGONS: usize = 100; +pub const INTERNED_BOX_SHADOWS: usize = 101; +pub const DEPTH_TARGETS_MEM: usize = 102; + +pub const SHADER_BUILD_TIME: usize = 103; + +pub const RENDER_REASON_FIRST: usize = 104; +pub const RENDER_REASON_SCENE: usize = 104; +pub const RENDER_REASON_ANIMATED_PROPERTY: usize = 105; +pub const RENDER_REASON_RESOURCE_UPDATE: usize = 106; +pub const RENDER_REASON_ASYNC_IMAGE: usize = 107; +pub const RENDER_REASON_CLEAR_RESOURCES: usize = 108; +pub const RENDER_REASON_APZ: usize = 109; +pub const RENDER_REASON_RESIZE: usize = 110; +pub const RENDER_REASON_WIDGET: usize = 111; +pub const RENDER_REASON_TEXTURE_CACHE_FLUSH: usize = 112; +pub const RENDER_REASON_SNAPSHOT: usize = 113; +pub const RENDER_REASON_POST_RESOURCE_UPDATE_HOOKS: usize = 114; +pub const RENDER_REASON_CONFIG_CHANGE: usize = 115; +pub const RENDER_REASON_CONTENT_SYNC: usize = 116; +pub const RENDER_REASON_FLUSH: usize = 117; +pub const RENDER_REASON_TESTING: usize = 118; +pub const RENDER_REASON_OTHER: usize = 119; +pub const RENDER_REASON_VSYNC: usize = 120; + +pub const TEXTURES_CREATED: usize = 121; +pub const TEXTURES_DELETED: usize = 122; + +pub const SLOW_FRAME_CPU_COUNT: usize = 123; +pub const SLOW_FRAME_GPU_COUNT: usize = 124; +pub const SLOW_FRAME_BUILD_COUNT: usize = 125; +pub const SLOW_UPLOAD_COUNT: usize = 126; +pub const SLOW_RENDER_COUNT: usize = 127; +pub const SLOW_DRAW_CALLS_COUNT: usize = 128; +pub const SLOW_TARGETS_COUNT: usize = 129; +pub const SLOW_BLOB_COUNT: usize = 130; +pub const SLOW_SCROLL_AFTER_SCENE_COUNT: usize = 131; + +pub const GPU_CACHE_MEM: usize = 132; +pub const GPU_BUFFER_MEM: usize = 133; +pub const GPU_TOTAL_MEM: usize = 134; + +pub const GPU_CACHE_PREPARE_TIME: usize = 135; + +pub const FRAME_SEND_TIME: usize = 136; +pub const UPDATE_DOCUMENT_TIME: usize = 137; + +pub const COMPOSITOR_SURFACE_UNDERLAYS: usize = 138; +pub const COMPOSITOR_SURFACE_OVERLAYS: usize = 139; +pub const COMPOSITOR_SURFACE_BLITS: usize = 140; + +pub const NUM_PROFILER_EVENTS: usize = 141; pub struct Profiler { counters: Vec<Counter>, @@ -366,6 +376,7 @@ impl Profiler { float("Texture cache upload", "ms", TOTAL_UPLOAD_TIME, expected(0.0..5.0)), float("Cache texture creation", "ms", CREATE_CACHE_TEXTURE_TIME, expected(0.0..2.0)), float("Cache texture deletion", "ms", DELETE_CACHE_TEXTURE_TIME, expected(0.0..1.0)), + float("GPU cache upload", "ms", GPU_CACHE_UPLOAD_TIME, expected(0.0..2.0)), int("Rasterized blobs", "", RASTERIZED_BLOBS, expected(0..15)), int("Rasterized blob tiles", "", RASTERIZED_BLOB_TILES, expected(0..15)), @@ -394,6 +405,12 @@ impl Profiler { int("Image templates", "", IMAGE_TEMPLATES, expected(0..100)), float("Image templates mem", "MB", IMAGE_TEMPLATES_MEM, expected(0.0..50.0)), + int("GPU cache rows total", "", GPU_CACHE_ROWS_TOTAL, expected(1..50)), + int("GPU cache rows updated", "", GPU_CACHE_ROWS_UPDATED, expected(0..25)), + int("GPU blocks total", "", GPU_CACHE_BLOCKS_TOTAL, expected(1..65_000)), + int("GPU blocks updated", "", GPU_CACHE_BLOCKS_UPDATED, expected(0..1000)), + int("GPU blocks saved", "", GPU_CACHE_BLOCKS_SAVED, expected(0..50_000)), + float("Atlas items mem", "MB", ATLAS_ITEMS_MEM, expected(0.0..100.0)), int("Atlas A8 pixels", "px", ATLAS_A8_PIXELS, expected(0..1_000_000)), int("Atlas A8 textures", "", ATLAS_A8_TEXTURES, expected(0..2)), @@ -449,7 +466,6 @@ impl Profiler { float("Depth targets mem", "MB", DEPTH_TARGETS_MEM, Expected::none()), float("Shader build time", "ms", SHADER_BUILD_TIME, Expected::none()), // We use the expected range to highlight render reasons that are happening. - float("Reason First", "", RENDER_REASON_FIRST, expected(0.0..0.01)), float("Reason scene", "", RENDER_REASON_SCENE, expected(0.0..0.01)), float("Reason animated property", "", RENDER_REASON_ANIMATED_PROPERTY, expected(0.0..0.01)), float("Reason resource update", "", RENDER_REASON_RESOURCE_UPDATE, expected(0.0..0.01)), @@ -481,9 +497,11 @@ impl Profiler { int("Slow: blobs", "%", SLOW_BLOB_COUNT, Expected::none()), int("Slow: after scene", "%", SLOW_SCROLL_AFTER_SCENE_COUNT, Expected::none()), + float("GPU cache mem", "MB", GPU_CACHE_MEM, Expected::none()), float("GPU buffer mem", "MB", GPU_BUFFER_MEM, Expected::none()), float("GPU total mem", "MB", GPU_TOTAL_MEM, Expected::none()), + float("GPU cache preapre", "ms", GPU_CACHE_PREPARE_TIME, Expected::none()), float("Frame send", "ms", FRAME_SEND_TIME, Expected::none()), float("Update document", "ms", UPDATE_DOCUMENT_TIME, Expected::none()), @@ -689,6 +707,7 @@ impl Profiler { RENDER_TARGET_MEM, DEPTH_TARGETS_MEM, ATLAS_ITEMS_MEM, + GPU_CACHE_MEM, GPU_BUFFER_MEM, ] { if let Some(val) = self.counters[counter].get() { @@ -786,6 +805,10 @@ impl Profiler { flush_counters(&mut counters, selection); selection.push(Item::GpuTimeQueries); } + "GPU cache bars" => { + flush_counters(&mut counters, selection); + selection.push(Item::GpuCacheBars); + } "Paint phase graph" => { flush_counters(&mut counters, selection); selection.push(Item::PaintPhaseGraph); @@ -834,6 +857,10 @@ impl Profiler { &self.counters } + pub fn get(&self, id: usize) -> Option<f64> { + self.counters[id].get() + } + fn draw_counters( counters: &[Counter], selected: &[usize], @@ -1072,6 +1099,102 @@ impl Profiler { } } + fn draw_bar( + label: &str, + label_color: ColorU, + counters: &[(ColorU, usize)], + x: f32, y: f32, + debug_renderer: &mut DebugRenderer, + ) -> default::Rect<f32> { + let x = x + 8.0; + let y = y + 24.0; + let text_rect = debug_renderer.add_text( + x, y, + label, + label_color, + None, + ); + + let x_base = text_rect.max_x() + 10.0; + let width = 300.0; + let total_value = counters.last().unwrap().1; + let scale = width / total_value as f32; + let mut x_current = x_base; + + for &(color, counter) in counters { + let x_stop = x_base + counter as f32 * scale; + debug_renderer.add_quad( + x_current, + text_rect.origin.y, + x_stop, + text_rect.max_y(), + color, + color, + ); + x_current = x_stop; + + } + + let mut total_rect = text_rect; + total_rect.size.width += width + 10.0; + + total_rect + } + + fn draw_gpu_cache_bars(&self, x: f32, mut y: f32, text_buffer: &mut String, debug_renderer: &mut DebugRenderer) -> default::Rect<f32> { + let color_updated = ColorU::new(0xFF, 0, 0, 0xFF); + let color_free = ColorU::new(0, 0, 0xFF, 0xFF); + let color_saved = ColorU::new(0, 0xFF, 0, 0xFF); + + let updated_blocks = self.get(GPU_CACHE_BLOCKS_UPDATED).unwrap_or(0.0) as usize; + let saved_blocks = self.get(GPU_CACHE_BLOCKS_SAVED).unwrap_or(0.0) as usize; + let allocated_blocks = self.get(GPU_CACHE_BLOCKS_TOTAL).unwrap_or(0.0) as usize; + let allocated_rows = self.get(GPU_CACHE_ROWS_TOTAL).unwrap_or(0.0) as usize; + let updated_rows = self.get(GPU_CACHE_ROWS_UPDATED).unwrap_or(0.0) as usize; + let requested_blocks = updated_blocks + saved_blocks; + let total_blocks = allocated_rows * MAX_VERTEX_TEXTURE_WIDTH; + + set_text!(text_buffer, "GPU cache rows ({}):", allocated_rows); + + let rect0 = Profiler::draw_bar( + text_buffer, + ColorU::new(0xFF, 0xFF, 0xFF, 0xFF), + &[ + (color_updated, updated_rows), + (color_free, allocated_rows), + ], + x, y, + debug_renderer, + ); + + y = rect0.max_y(); + + let rect1 = Profiler::draw_bar( + "GPU cache blocks", + ColorU::new(0xFF, 0xFF, 0, 0xFF), + &[ + (color_updated, updated_blocks), + (color_saved, requested_blocks), + (color_free, allocated_blocks), + (ColorU::new(0, 0, 0, 0xFF), total_blocks), + ], + x, y, + debug_renderer, + ); + + let total_rect = rect0.union(&rect1).inflate(10.0, 10.0); + debug_renderer.add_quad( + total_rect.origin.x, + total_rect.origin.y, + total_rect.origin.x + total_rect.size.width, + total_rect.origin.y + total_rect.size.height, + ColorF::new(0.1, 0.1, 0.1, 0.8).into(), + ColorF::new(0.2, 0.2, 0.2, 0.8).into(), + ); + + total_rect + } + // Draws a frame graph for a given frame collection. fn draw_frame_graph( frame_collection: &ProfilerFrameCollection, @@ -1237,6 +1360,9 @@ impl Profiler { Item::GpuTimeQueries => { Profiler::draw_frame_graph(&self.gpu_frames, x, y, debug_renderer) } + Item::GpuCacheBars => { + self.draw_gpu_cache_bars(x, y, &mut text_buffer, debug_renderer) + } Item::PaintPhaseGraph => { Profiler::draw_frame_graph(&self.frame_stats, x, y, debug_renderer) } @@ -1947,6 +2073,7 @@ pub struct CpuFrameTimings { pub frame_building_other: f64, pub frame_send: f64, pub uploads: f64, + pub gpu_cache: f64, pub draw_calls: f64, pub unknown: f64, } @@ -1962,9 +2089,10 @@ impl CpuFrameTimings { let frame_send = counters[FRAME_SEND_TIME].get().unwrap_or(0.0); let renderer = counters[RENDERER_TIME].get().unwrap_or(0.0); let uploads = counters[TEXTURE_CACHE_UPDATE_TIME].get().unwrap_or(0.0); + let gpu_cache = counters[GPU_CACHE_PREPARE_TIME].get().unwrap_or(0.0); let frame_build = visibility + prepare + glyph_resolve + batching; let update_document = counters[UPDATE_DOCUMENT_TIME].get().unwrap_or(0.0) - frame_build; - let draw_calls = renderer - uploads; + let draw_calls = renderer - uploads - gpu_cache; let unknown = (total - (api_send + update_document + frame_build + frame_send + renderer)).max(0.0); let frame_building_other = (counters[FRAME_BUILDING_TIME].get().unwrap_or(0.0) - frame_build).max(0.0); @@ -1979,6 +2107,7 @@ impl CpuFrameTimings { frame_building_other, frame_send, uploads, + gpu_cache, draw_calls, unknown, } @@ -2010,9 +2139,10 @@ impl CpuFrameTimings { sample(self.frame_send, "08. frame send", ColorF { r: 1.0, g: 0.8, b: 0.8, a: 1.0 }), // Renderer sample(self.uploads, "09. texture uploads", ColorF { r: 0.8, g: 0.0, b: 0.3, a: 1.0 }), - sample(self.draw_calls, "10. draw calls", ColorF { r: 1.0, g: 0.5, b: 0.0, a: 1.0 }), + sample(self.gpu_cache, "10. gpu cache update", ColorF { r: 0.5, g: 0.0, b: 0.4, a: 1.0 }), + sample(self.draw_calls, "11. draw calls", ColorF { r: 1.0, g: 0.5, b: 0.0, a: 1.0 }), // Unaccounted time - sample(self.unknown, "11. unknown", ColorF { r: 0.3, g: 0.3, b: 0.3, a: 1.0 }), + sample(self.unknown, "12. unknown", ColorF { r: 0.3, g: 0.3, b: 0.3, a: 1.0 }), ], } } @@ -2037,6 +2167,7 @@ enum Item { ChangeIndicator(usize), Fps, GpuTimeQueries, + GpuCacheBars, PaintPhaseGraph, SlowScrollFrames, Text(String), diff --git a/gfx/wr/webrender/src/quad.rs b/gfx/wr/webrender/src/quad.rs @@ -1219,8 +1219,8 @@ pub fn add_to_batch<F>( let mut instance = QuadInstance { dst_task_address, - prim_address_i: prim_address_i.as_int(), - prim_address_f: prim_address_f.as_int(), + prim_address_i, + prim_address_f, edge_flags: edge_flags_bits, quad_flags: quad_flags.bits(), part_index: PartIndex::All as u8, diff --git a/gfx/wr/webrender/src/render_api.rs b/gfx/wr/webrender/src/render_api.rs @@ -973,6 +973,8 @@ pub enum DebugCommand { EnableNativeCompositor(bool), /// Sets the maximum amount of existing batches to visit before creating a new one. SetBatchingLookback(u32), + /// Invalidate GPU cache, forcing the update from the CPU mirror. + InvalidateGpuCache, /// Causes the scene builder to pause for a given amount of milliseconds each time it /// processes a transaction. SimulateLongSceneBuild(u32), @@ -1488,6 +1490,8 @@ pub struct MemoryReport { // CPU Memory. // pub clip_stores: usize, + pub gpu_cache_metadata: usize, + pub gpu_cache_cpu_mirror: usize, pub hit_testers: usize, pub fonts: usize, pub weak_fonts: usize, @@ -1504,6 +1508,7 @@ pub struct MemoryReport { // // GPU memory. // + pub gpu_cache_textures: usize, pub vertex_data_textures: usize, pub render_target_textures: usize, pub picture_tile_textures: usize, diff --git a/gfx/wr/webrender/src/render_backend.rs b/gfx/wr/webrender/src/render_backend.rs @@ -30,6 +30,7 @@ use crate::capture::CaptureConfig; use crate::composite::{CompositorKind, CompositeDescriptor}; use crate::frame_builder::{FrameBuilder, FrameBuilderConfig, FrameScratchBuffer}; use glyph_rasterizer::FontInstance; +use crate::gpu_cache::GpuCache; use crate::hit_test::{HitTest, HitTester, SharedHitTester}; use crate::intern::DataStore; #[cfg(any(feature = "capture", feature = "replay"))] @@ -511,6 +512,7 @@ impl Document { fn build_frame( &mut self, resource_cache: &mut ResourceCache, + gpu_cache: &mut GpuCache, debug_flags: DebugFlags, tile_caches: &mut FastHashMap<SliceId, Box<TileCacheInstance>>, frame_stats: Option<FullFrameStats>, @@ -531,6 +533,7 @@ impl Document { &mut self.scene, present, resource_cache, + gpu_cache, &mut self.rg_builder, self.stamp, self.view.scene.device_rect.min, @@ -584,6 +587,7 @@ impl Document { &mut self, mut txn: OffscreenBuiltScene, resource_cache: &mut ResourceCache, + gpu_cache: &mut GpuCache, chunk_pool: Arc<ChunkPool>, debug_flags: DebugFlags, ) -> RenderedDocument { @@ -609,6 +613,7 @@ impl Document { &mut txn.scene, present, resource_cache, + gpu_cache, &mut self.rg_builder, self.stamp, // TODO(nical) self.view.scene.device_rect.min, @@ -773,6 +778,7 @@ pub struct RenderBackend { result_tx: Sender<ResultMsg>, scene_tx: Sender<SceneBuilderRequest>, + gpu_cache: GpuCache, resource_cache: ResourceCache, chunk_pool: Arc<ChunkPool>, @@ -824,6 +830,7 @@ impl RenderBackend { result_tx, scene_tx, resource_cache, + gpu_cache: GpuCache::new(), chunk_pool, frame_config, default_compositor_kind : frame_config.compositor_kind, @@ -922,6 +929,7 @@ impl RenderBackend { result_tx: Option<Sender<SceneSwapResult>>, frame_counter: &mut u32, ) -> bool { + self.prepare_for_frames(); self.maybe_force_nop_documents( frame_counter, |document_id| txns.iter().any(|txn| txn.document_id == document_id)); @@ -1010,10 +1018,14 @@ impl RenderBackend { let rendered_document = doc.process_offscreen_scene( offscreen_scene, &mut self.resource_cache, + &mut self.gpu_cache, self.chunk_pool.clone(), self.debug_flags, ); + let msg = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates()); + self.result_tx.send(msg).unwrap(); + let pending_update = self.resource_cache.pending_updates(); let msg = ResultMsg::PublishDocument( @@ -1100,6 +1112,8 @@ impl RenderBackend { // recently used resources. self.resource_cache.clear(ClearCache::all()); + self.gpu_cache.clear(); + for (_, doc) in &mut self.documents { doc.scratch.memory_pressure(); for tile_cache in self.tile_caches.values_mut() { @@ -1135,6 +1149,8 @@ impl RenderBackend { return RenderBackendStatus::Continue; } DebugCommand::GenerateFrame => { + self.prepare_for_frames(); + let documents: Vec<DocumentId> = self.documents.keys() .cloned() .collect(); @@ -1166,6 +1182,7 @@ impl RenderBackend { doc.scene.config.force_invalidation = invalidation_config; } } + self.bookkeep_after_frames(); return RenderBackendStatus::Continue; } @@ -1265,6 +1282,7 @@ impl RenderBackend { } DebugCommand::SetFlags(flags) => { self.resource_cache.set_debug_flags(flags); + self.gpu_cache.set_debug_flags(flags); let force_invalidation = flags.contains(DebugFlags::FORCE_PICTURE_INVALIDATION); if self.frame_config.force_invalidation != force_invalidation { @@ -1275,6 +1293,19 @@ impl RenderBackend { self.update_frame_builder_config(); } + // If we're toggling on the GPU cache debug display, we + // need to blow away the cache. This is because we only + // send allocation/free notifications to the renderer + // thread when the debug display is enabled, and thus + // enabling it when the cache is partially populated will + // give the renderer an incomplete view of the world. + // And since we might as well drop all the debugging state + // from the renderer when we disable the debug display, + // we just clear the cache on toggle. + let changed = self.debug_flags ^ flags; + if changed.contains(DebugFlags::GPU_CACHE_DBG) { + self.gpu_cache.clear(); + } self.debug_flags = flags; ResultMsg::DebugCommand(option) @@ -1318,6 +1349,7 @@ impl RenderBackend { result_tx, frame_counter, ); + self.bookkeep_after_frames(); }, #[cfg(feature = "capture")] SceneBuilderResult::CapturedTransactions(txns, capture_config, result_tx) => { @@ -1340,6 +1372,8 @@ impl RenderBackend { if built_frame { self.save_capture_sequence(); } + + self.bookkeep_after_frames(); }, #[cfg(feature = "capture")] SceneBuilderResult::StopCaptureSequence => { @@ -1405,8 +1439,16 @@ impl RenderBackend { ); } + fn prepare_for_frames(&mut self) { + self.gpu_cache.prepare_for_frames(); + } + + fn bookkeep_after_frames(&mut self) { + self.gpu_cache.bookkeep_after_frames(); + } + fn requires_frame_build(&mut self) -> bool { - false // TODO(nical) + self.gpu_cache.requires_frame_build() } fn prepare_transactions( @@ -1414,6 +1456,7 @@ impl RenderBackend { txns: Vec<Box<TransactionMsg>>, frame_counter: &mut u32, ) { + self.prepare_for_frames(); self.maybe_force_nop_documents( frame_counter, |document_id| txns.iter().any(|txn| txn.document_id == document_id)); @@ -1446,6 +1489,7 @@ impl RenderBackend { #[cfg(feature = "capture")] self.save_capture_sequence(); } + self.bookkeep_after_frames(); } /// In certain cases, resources shared by multiple documents have to run @@ -1599,6 +1643,7 @@ impl RenderBackend { let rendered_document = doc.build_frame( &mut self.resource_cache, + &mut self.gpu_cache, self.debug_flags, &mut self.tile_caches, frame_stats, @@ -1610,6 +1655,9 @@ impl RenderBackend { debug!("generated frame for document {:?} with {} passes", document_id, rendered_document.frame.passes.len()); + let msg = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates()); + self.result_tx.send(msg).unwrap(); + Telemetry::stop_and_accumulate_framebuild_time(timer_id); let pending_update = self.resource_cache.pending_updates(); @@ -1730,6 +1778,7 @@ impl RenderBackend { let mut report = Box::new(MemoryReport::default()); let ops = self.size_of_ops.as_mut().unwrap(); let op = ops.size_of_op; + report.gpu_cache_metadata = self.gpu_cache.size_of(ops); for doc in self.documents.values() { report.clip_stores += doc.scene.clip_store.size_of(ops); report.hit_testers += match &doc.hit_tester { @@ -1795,6 +1844,10 @@ impl RenderBackend { } let config = CaptureConfig::new(root, bits); + if config.bits.contains(CaptureBits::FRAME) { + self.prepare_for_frames(); + } + for (&id, doc) in &mut self.documents { debug!("\tdocument {:?}", id); if config.bits.contains(CaptureBits::FRAME) { @@ -1802,6 +1855,7 @@ impl RenderBackend { let force_invalidation = std::mem::replace(&mut doc.scene.config.force_invalidation, true); let rendered_document = doc.build_frame( &mut self.resource_cache, + &mut self.gpu_cache, self.debug_flags, &mut self.tile_caches, None, @@ -1812,6 +1866,11 @@ impl RenderBackend { doc.scene.config.force_invalidation = force_invalidation; + // After we rendered the frames, there are pending updates to both + // GPU cache and resources. Instead of serializing them, we are going to make sure + // they are applied on the `Renderer` side. + let msg_update_gpu_cache = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates()); + self.result_tx.send(msg_update_gpu_cache).unwrap(); //TODO: write down doc's pipeline info? // it has `pipeline_epoch_map`, // which may capture necessary details for some cases. @@ -1869,6 +1928,7 @@ impl RenderBackend { // report it here if we do. If we don't, it will simply crash in // Renderer::render_impl and give us less information about the source. assert!(!self.requires_frame_build(), "Caches were cleared during a capture."); + self.bookkeep_after_frames(); } debug!("\tscene builder"); @@ -1902,6 +1962,8 @@ impl RenderBackend { info!("\tresource cache"); let caches = self.resource_cache.save_caches(&config.root); config.serialize_for_resource(&caches, "resource_cache"); + info!("\tgpu cache"); + config.serialize_for_resource(&self.gpu_cache, "gpu_cache"); } DebugOutput::SaveCapture(config, deferred) @@ -1975,6 +2037,11 @@ impl RenderBackend { DebugOutput::LoadCapture(config.clone(), plain_externals) ); self.result_tx.send(msg_load).unwrap(); + + self.gpu_cache = match config.deserialize_for_resource::<GpuCache, _>("gpu_cache") { + Some(gpu_cache) => gpu_cache, + None => GpuCache::new(), + }; } self.frame_config = backend.frame_config; @@ -2059,6 +2126,9 @@ impl RenderBackend { Some(frame) => { info!("\tloaded a built frame with {} passes", frame.passes.len()); + let msg_update = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates()); + self.result_tx.send(msg_update).unwrap(); + self.frame_publish_id.advance(); let msg_publish = ResultMsg::PublishDocument( self.frame_publish_id, diff --git a/gfx/wr/webrender/src/render_target.rs b/gfx/wr/webrender/src/render_target.rs @@ -13,6 +13,7 @@ use crate::segment::EdgeAaSegmentMask; use crate::spatial_tree::SpatialTree; use crate::clip::{ClipStore, ClipItemKind}; use crate::frame_builder::FrameGlobalResources; +use crate::gpu_cache::{GpuCache, GpuCacheAddress}; use crate::gpu_types::{BorderInstance, SvgFilterInstance, SVGFEFilterInstance, BlurDirection, BlurInstance, PrimitiveHeaders, ScalingInstance}; use crate::gpu_types::{TransformPalette, ZBufferIdGenerator, MaskInstance, ClipSpace, BlurEdgeMode}; use crate::gpu_types::{ZBufferId, QuadSegment, PrimitiveInstanceData, TransformPaletteId}; @@ -107,6 +108,7 @@ impl RenderTargetList { pub fn build( &mut self, ctx: &mut RenderTargetContext, + gpu_cache: &mut GpuCache, render_tasks: &RenderTaskGraph, prim_headers: &mut PrimitiveHeaders, transforms: &mut TransformPalette, @@ -122,6 +124,7 @@ impl RenderTargetList { for target in &mut self.targets { target.build( ctx, + gpu_cache, render_tasks, prim_headers, transforms, @@ -253,6 +256,7 @@ impl RenderTarget { pub fn build( &mut self, ctx: &mut RenderTargetContext, + gpu_cache: &mut GpuCache, render_tasks: &RenderTaskGraph, prim_headers: &mut PrimitiveHeaders, transforms: &mut TransformPalette, @@ -309,6 +313,7 @@ impl RenderTarget { cmd, spatial_node_index, ctx, + gpu_cache, render_tasks, prim_headers, transforms, @@ -349,6 +354,7 @@ impl RenderTarget { &mut self, task_id: RenderTaskId, ctx: &RenderTargetContext, + gpu_cache: &mut GpuCache, gpu_buffer_builder: &mut GpuBufferBuilder, render_tasks: &RenderTaskGraph, clip_store: &ClipStore, @@ -438,7 +444,7 @@ impl RenderTarget { task_id, task.children.get(0).cloned(), task.children.get(1).cloned(), - task_info.extra_gpu_data, + task_info.extra_gpu_cache_handle.map(|handle| gpu_cache.get_address(&handle)), &ctx.frame_memory, ) } @@ -450,7 +456,7 @@ impl RenderTarget { task, task.children.get(0).cloned(), task.children.get(1).cloned(), - task_info.extra_gpu_data, + task_info.extra_gpu_cache_handle.map(|handle| gpu_cache.get_address(&handle)), &ctx.frame_memory, ) } @@ -465,6 +471,7 @@ impl RenderTarget { task_info.clip_node_range, task_info.root_spatial_node_index, render_tasks, + gpu_cache, clip_store, transforms, task_info.actual_rect, @@ -675,7 +682,7 @@ fn add_svg_filter_instances( task_id: RenderTaskId, input_1_task: Option<RenderTaskId>, input_2_task: Option<RenderTaskId>, - extra_data_address: Option<GpuBufferAddress>, + extra_data_address: Option<GpuCacheAddress>, memory: &FrameMemory, ) { let mut textures = BatchTextures::empty(); @@ -746,7 +753,7 @@ fn add_svg_filter_instances( input_count, generic_int, padding: 0, - extra_data_address: extra_data_address.unwrap_or(GpuBufferAddress::INVALID).as_int(), + extra_data_address: extra_data_address.unwrap_or(GpuCacheAddress::INVALID), }; for (ref mut batch_textures, ref mut batch) in instances.iter_mut() { @@ -779,7 +786,7 @@ fn add_svg_filter_node_instances( target_task: &RenderTask, input_1_task: Option<RenderTaskId>, input_2_task: Option<RenderTaskId>, - extra_data_address: Option<GpuBufferAddress>, + extra_data_address: Option<GpuCacheAddress>, memory: &FrameMemory, ) { let node = &task_info.node; @@ -801,7 +808,7 @@ fn add_svg_filter_node_instances( input_2_task_address: RenderTaskId::INVALID.into(), kind: 0, input_count: node.inputs.len() as u16, - extra_data_address: extra_data_address.unwrap_or(GpuBufferAddress::INVALID).as_int(), + extra_data_address: extra_data_address.unwrap_or(GpuCacheAddress::INVALID), }; // Must match FILTER_* in cs_svg_filter_node.glsl diff --git a/gfx/wr/webrender/src/render_task.rs b/gfx/wr/webrender/src/render_task.rs @@ -15,6 +15,7 @@ use crate::profiler::{add_text_marker}; use crate::spatial_tree::SpatialNodeIndex; use crate::filterdata::SFilterData; use crate::frame_builder::FrameBuilderConfig; +use crate::gpu_cache::{GpuCache, GpuCacheAddress, GpuCacheHandle}; use crate::gpu_types::{BorderInstance, ImageSource, UvRectKind, TransformPaletteId, BlurEdgeMode}; use crate::internal_types::{CacheTextureId, FastHashMap, FilterGraphNode, FilterGraphOp, FilterGraphPictureReference, SVGFE_CONVOLVE_VALUES_LIMIT, TextureSource, Swizzle}; use crate::picture::{ResolvedSurfaceTexture, MAX_SURFACE_SIZE}; @@ -25,7 +26,7 @@ use crate::prim_store::gradient::{ }; use crate::resource_cache::{ResourceCache, ImageRequest}; use std::{usize, f32, i32, u32}; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferBuilderF}; +use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF}; use crate::render_backend::DataStores; use crate::render_target::{ResolveOp, RenderTargetKind}; use crate::render_task_graph::{PassId, RenderTaskId, RenderTaskGraphBuilder}; @@ -345,7 +346,7 @@ pub enum SvgFilterInfo { #[cfg_attr(feature = "replay", derive(Deserialize))] pub struct SvgFilterTask { pub info: SvgFilterInfo, - pub extra_gpu_data: Option<GpuBufferAddress>, + pub extra_gpu_cache_handle: Option<GpuCacheHandle>, } #[derive(Debug)] @@ -355,7 +356,7 @@ pub struct SVGFEFilterTask { pub node: FilterGraphNode, pub op: FilterGraphOp, pub content_origin: DevicePoint, - pub extra_gpu_data: Option<GpuBufferAddress>, + pub extra_gpu_cache_handle: Option<GpuCacheHandle>, } #[cfg_attr(feature = "capture", derive(Serialize))] @@ -627,6 +628,7 @@ impl RenderTaskKind { clip_node_range: ClipNodeRange, root_spatial_node_index: SpatialNodeIndex, clip_store: &mut ClipStore, + gpu_cache: &mut GpuCache, gpu_buffer_builder: &mut GpuBufferBuilderF, resource_cache: &mut ResourceCache, rg_builder: &mut RenderTaskGraphBuilder, @@ -684,10 +686,11 @@ impl RenderTaskKind { }), false, RenderTaskParent::RenderTask(clip_task_id), + gpu_cache, gpu_buffer_builder, rg_builder, surface_builder, - &mut |rg_builder, _| { + &mut |rg_builder, _, _| { let clip_data = ClipData::rounded_rect( source.minimal_shadow_rect.size(), &source.shadow_radius, @@ -847,32 +850,38 @@ impl RenderTaskKind { pub fn write_gpu_blocks( &mut self, - gpu_buffer: &mut GpuBufferBuilder, + gpu_cache: &mut GpuCache, ) { match self { RenderTaskKind::SvgFilter(ref mut filter_task) => { match filter_task.info { SvgFilterInfo::ColorMatrix(ref matrix) => { - let mut writer = gpu_buffer.f32.write_blocks(5); - for i in 0..5 { - writer.push_one([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + for i in 0..5 { + request.push([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]); + } } - filter_task.extra_gpu_data = Some(writer.finish()); } SvgFilterInfo::DropShadow(color) | SvgFilterInfo::Flood(color) => { - let mut writer = gpu_buffer.f32.write_blocks(1); - writer.push_one(color.to_array()); - filter_task.extra_gpu_data = Some(writer.finish()); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push(color.to_array()); + } } SvgFilterInfo::ComponentTransfer(ref data) => { - filter_task.extra_gpu_data = Some(data.write_gpu_blocks(&mut gpu_buffer.f32)); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(request) = gpu_cache.request(handle) { + data.update(request); + } } SvgFilterInfo::Composite(ref operator) => { if let CompositeOperator::Arithmetic(k_vals) = operator { - let mut writer = gpu_buffer.f32.write_blocks(1); - writer.push_one(*k_vals); - filter_task.extra_gpu_data = Some(writer.finish()); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push(*k_vals); + } } } _ => {}, @@ -896,19 +905,21 @@ impl RenderTaskKind { FilterGraphOp::SVGFEBlendSaturation => {} FilterGraphOp::SVGFEBlendColor => {} FilterGraphOp::SVGFEBlendLuminosity => {} - FilterGraphOp::SVGFEColorMatrix { values: matrix } => { - let mut writer = gpu_buffer.f32.write_blocks(5); - for i in 0..5 { - writer.push_one([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]); + FilterGraphOp::SVGFEColorMatrix{values: matrix} => { + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + for i in 0..5 { + request.push([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]); + } } - filter_task.extra_gpu_data = Some(writer.finish()); } FilterGraphOp::SVGFEComponentTransfer => unreachable!(), FilterGraphOp::SVGFEComponentTransferInterned{..} => {} FilterGraphOp::SVGFECompositeArithmetic{k1, k2, k3, k4} => { - let mut writer = gpu_buffer.f32.write_blocks(1); - writer.push_one([k1, k2, k3, k4]); - filter_task.extra_gpu_data = Some(writer.finish()); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push([k1, k2, k3, k4]); + } } FilterGraphOp::SVGFECompositeATop => {} FilterGraphOp::SVGFECompositeIn => {} @@ -919,40 +930,44 @@ impl RenderTaskKind { FilterGraphOp::SVGFEConvolveMatrixEdgeModeDuplicate{order_x, order_y, kernel, divisor, bias, target_x, target_y, kernel_unit_length_x, kernel_unit_length_y, preserve_alpha} | FilterGraphOp::SVGFEConvolveMatrixEdgeModeNone{order_x, order_y, kernel, divisor, bias, target_x, target_y, kernel_unit_length_x, kernel_unit_length_y, preserve_alpha} | FilterGraphOp::SVGFEConvolveMatrixEdgeModeWrap{order_x, order_y, kernel, divisor, bias, target_x, target_y, kernel_unit_length_x, kernel_unit_length_y, preserve_alpha} => { - let mut writer = gpu_buffer.f32.write_blocks(8); - assert!(SVGFE_CONVOLVE_VALUES_LIMIT == 25); - writer.push_one([-target_x as f32, -target_y as f32, order_x as f32, order_y as f32]); - writer.push_one([kernel_unit_length_x as f32, kernel_unit_length_y as f32, 1.0 / divisor, bias]); - writer.push_one([kernel[0], kernel[1], kernel[2], kernel[3]]); - writer.push_one([kernel[4], kernel[5], kernel[6], kernel[7]]); - writer.push_one([kernel[8], kernel[9], kernel[10], kernel[11]]); - writer.push_one([kernel[12], kernel[13], kernel[14], kernel[15]]); - writer.push_one([kernel[16], kernel[17], kernel[18], kernel[19]]); - writer.push_one([kernel[20], 0.0, 0.0, preserve_alpha as f32]); - filter_task.extra_gpu_data = Some(writer.finish()); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push([-target_x as f32, -target_y as f32, order_x as f32, order_y as f32]); + request.push([kernel_unit_length_x as f32, kernel_unit_length_y as f32, 1.0 / divisor, bias]); + assert!(SVGFE_CONVOLVE_VALUES_LIMIT == 25); + request.push([kernel[0], kernel[1], kernel[2], kernel[3]]); + request.push([kernel[4], kernel[5], kernel[6], kernel[7]]); + request.push([kernel[8], kernel[9], kernel[10], kernel[11]]); + request.push([kernel[12], kernel[13], kernel[14], kernel[15]]); + request.push([kernel[16], kernel[17], kernel[18], kernel[19]]); + request.push([kernel[20], 0.0, 0.0, preserve_alpha as f32]); + } } FilterGraphOp::SVGFEDiffuseLightingDistant{..} => {} FilterGraphOp::SVGFEDiffuseLightingPoint{..} => {} FilterGraphOp::SVGFEDiffuseLightingSpot{..} => {} FilterGraphOp::SVGFEDisplacementMap{scale, x_channel_selector, y_channel_selector} => { - let mut writer = gpu_buffer.f32.write_blocks(1); - writer.push_one([x_channel_selector as f32, y_channel_selector as f32, scale, 0.0]); - filter_task.extra_gpu_data = Some(writer.finish()); + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push([x_channel_selector as f32, y_channel_selector as f32, scale, 0.0]); + } + } + FilterGraphOp::SVGFEDropShadow{color, ..} | + FilterGraphOp::SVGFEFlood{color} => { + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push(color.to_array()); + } } - FilterGraphOp::SVGFEDropShadow { color, .. } | - FilterGraphOp::SVGFEFlood { color } => { - let mut writer = gpu_buffer.f32.write_blocks(1); - writer.push_one(color.to_array()); - filter_task.extra_gpu_data = Some(writer.finish()); - } FilterGraphOp::SVGFEGaussianBlur{..} => {} FilterGraphOp::SVGFEIdentity => {} - FilterGraphOp::SVGFEImage {..} => {} - FilterGraphOp::SVGFEMorphologyDilate { radius_x, radius_y } | - FilterGraphOp::SVGFEMorphologyErode { radius_x, radius_y } => { - let mut writer = gpu_buffer.f32.write_blocks(1); - writer.push_one([radius_x, radius_y, 0.0, 0.0]); - filter_task.extra_gpu_data = Some(writer.finish()); + FilterGraphOp::SVGFEImage{..} => {} + FilterGraphOp::SVGFEMorphologyDilate{radius_x, radius_y} | + FilterGraphOp::SVGFEMorphologyErode{radius_x, radius_y} => { + let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new); + if let Some(mut request) = gpu_cache.request(handle) { + request.push([radius_x, radius_y, 0.0, 0.0]); + } } FilterGraphOp::SVGFEOpacity{..} => {} FilterGraphOp::SVGFESourceAlpha => {} @@ -1038,7 +1053,7 @@ pub struct RenderTask { /// /// Will be set to None if the render task is cached, in which case the texture cache /// manages the handle. - pub uv_rect_handle: GpuBufferAddress, + pub uv_rect_handle: GpuCacheHandle, pub cache_handle: Option<RenderTaskCacheEntryHandle>, uv_rect_kind: UvRectKind, } @@ -1056,7 +1071,7 @@ impl RenderTask { kind, free_after: PassId::MAX, render_on: PassId::MIN, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), uv_rect_kind: UvRectKind::Rect, cache_handle: None, sub_pass: None, @@ -1101,7 +1116,7 @@ impl RenderTask { }), free_after: PassId::MAX, render_on: PassId::MIN, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), uv_rect_kind: UvRectKind::Rect, cache_handle: None, sub_pass: None, @@ -1120,7 +1135,7 @@ impl RenderTask { kind: RenderTaskKind::Test(target), free_after: PassId::MAX, render_on: PassId::MIN, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), uv_rect_kind: UvRectKind::Rect, cache_handle: None, sub_pass: None, @@ -1620,7 +1635,7 @@ impl RenderTask { let task_id = rg_builder.add().init(RenderTask::new_dynamic( target_size, RenderTaskKind::SvgFilter(SvgFilterTask { - extra_gpu_data: None, + extra_gpu_cache_handle: None, info, }), ).with_uv_rect_kind(uv_rect_kind)); @@ -1650,7 +1665,7 @@ impl RenderTask { pub fn new_svg_filter_graph( filter_nodes: &[(FilterGraphNode, FilterGraphOp)], rg_builder: &mut RenderTaskGraphBuilder, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, data_stores: &mut DataStores, _uv_rect_kind: UvRectKind, original_task_id: RenderTaskId, @@ -2352,7 +2367,7 @@ impl RenderTask { }, op: FilterGraphOp::SVGFEIdentity, content_origin: DevicePoint::zero(), - extra_gpu_data: None, + extra_gpu_cache_handle: None, } ), ).with_uv_rect_kind(UvRectKind::Rect)); @@ -2396,7 +2411,7 @@ impl RenderTask { }, op: FilterGraphOp::SVGFEIdentity, content_origin: node_task_rect.min, - extra_gpu_data: None, + extra_gpu_cache_handle: None, } ), ).with_uv_rect_kind(node_uv_rect_kind)); @@ -2482,7 +2497,7 @@ impl RenderTask { }, op: FilterGraphOp::SVGFEIdentity, content_origin: node_task_rect.min, - extra_gpu_data: None, + extra_gpu_cache_handle: None, } ), ).with_uv_rect_kind(UvRectKind::Rect)); @@ -2537,7 +2552,7 @@ impl RenderTask { std_deviation_x: 0.0, std_deviation_y: 0.0, }, content_origin: node_task_rect.min, - extra_gpu_data: None, + extra_gpu_cache_handle: None, } ), ).with_uv_rect_kind(node_uv_rect_kind)); @@ -2575,7 +2590,7 @@ impl RenderTask { }, op: op.clone(), content_origin: source_subregion.min.cast_unit(), - extra_gpu_data: None, + extra_gpu_cache_handle: None, } ), ).with_uv_rect_kind(node_uv_rect_kind)); @@ -2586,13 +2601,13 @@ impl RenderTask { // FIXME: Doing this in prepare_interned_prim_for_render // doesn't seem to be enough, where should it be done? let filter_data = &mut data_stores.filter_data[handle]; - filter_data.write_gpu_blocks(gpu_buffer); - // ComponentTransfer has a gpu buffer address that we need to + filter_data.update(gpu_cache); + // ComponentTransfer has a gpu_cache_handle that we need to // pass along task_id = rg_builder.add().init(RenderTask::new_dynamic( node_task_size, RenderTaskKind::SVGFENode( - SVGFEFilterTask { + SVGFEFilterTask{ node: FilterGraphNode{ kept_by_optimizer: true, linear: node.linear, @@ -2602,7 +2617,7 @@ impl RenderTask { }, op: op.clone(), content_origin: node_task_rect.min, - extra_gpu_data: Some(filter_data.gpu_buffer_address), + extra_gpu_cache_handle: Some(filter_data.gpu_cache_handle), } ), ).with_uv_rect_kind(node_uv_rect_kind)); @@ -2634,7 +2649,7 @@ impl RenderTask { }, op: op.clone(), content_origin: node_task_rect.min, - extra_gpu_data: None, + extra_gpu_cache_handle: None, } ), ).with_uv_rect_kind(node_uv_rect_kind)); @@ -2675,8 +2690,8 @@ impl RenderTask { self.uv_rect_kind } - pub fn get_texture_address(&self) -> GpuBufferAddress { - self.uv_rect_handle + pub fn get_texture_address(&self, gpu_cache: &GpuCache) -> GpuCacheAddress { + gpu_cache.get_address(&self.uv_rect_handle) } pub fn get_target_texture(&self) -> CacheTextureId { @@ -2758,11 +2773,11 @@ impl RenderTask { pub fn write_gpu_blocks( &mut self, target_rect: DeviceIntRect, - gpu_buffer: &mut GpuBufferBuilder, + gpu_cache: &mut GpuCache, ) { profile_scope!("write_gpu_blocks"); - self.kind.write_gpu_blocks(gpu_buffer); + self.kind.write_gpu_blocks(gpu_cache); if self.cache_handle.is_some() { // The uv rect handle of cached render tasks is requested and set by the @@ -2770,16 +2785,17 @@ impl RenderTask { return; } - let p0 = target_rect.min.to_f32(); - let p1 = target_rect.max.to_f32(); - let image_source = ImageSource { - p0, - p1, - user_data: [0.0; 4], - uv_rect_kind: self.uv_rect_kind, - }; - - self.uv_rect_handle = image_source.write_gpu_blocks(&mut gpu_buffer.f32); + if let Some(mut request) = gpu_cache.request(&mut self.uv_rect_handle) { + let p0 = target_rect.min.to_f32(); + let p1 = target_rect.max.to_f32(); + let image_source = ImageSource { + p0, + p1, + user_data: [0.0; 4], + uv_rect_kind: self.uv_rect_kind, + }; + image_source.write_gpu_blocks(&mut request); + } } /// Called by the render task cache. diff --git a/gfx/wr/webrender/src/render_task_cache.rs b/gfx/wr/webrender/src/render_task_cache.rs @@ -9,6 +9,7 @@ use crate::border::BorderSegmentCacheKey; use crate::box_shadow::BoxShadowCacheKey; use crate::device::TextureFilter; use crate::freelist::{FreeList, FreeListHandle, WeakFreeListHandle}; +use crate::gpu_cache::GpuCache; use crate::internal_types::FastHashMap; use crate::prim_store::image::ImageCacheKey; use crate::prim_store::gradient::{ @@ -164,7 +165,7 @@ impl RenderTaskCache { size: DeviceIntSize, render_task: &mut RenderTask, entry: &mut RenderTaskCacheEntry, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, texture_cache: &mut TextureCache, ) { // Find out what size to alloc in the texture cache. @@ -198,7 +199,7 @@ impl RenderTaskCache { None, entry.user_data.unwrap_or([0.0; 4]), DirtyRect::All, - gpu_buffer, + gpu_cache, None, render_task.uv_rect_kind(), Eviction::Auto, @@ -229,20 +230,22 @@ impl RenderTaskCache { texture_cache: &mut TextureCache, is_opaque: bool, parent: RenderTaskParent, + gpu_cache: &mut GpuCache, gpu_buffer_builder: &mut GpuBufferBuilderF, rg_builder: &mut RenderTaskGraphBuilder, surface_builder: &mut SurfaceBuilder, - f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId, + f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId, ) -> RenderTaskId { // If this render task cache is being drawn this frame, ensure we hook up the // render task for it as a dependency of any render task that uses this as // an input source. let (task_id, rendered_this_frame) = match key { - None => (f(rg_builder, gpu_buffer_builder), true), + None => (f(rg_builder, gpu_buffer_builder, gpu_cache), true), Some(key) => self.request_render_task_impl( key, is_opaque, texture_cache, + gpu_cache, gpu_buffer_builder, rg_builder, f @@ -281,9 +284,10 @@ impl RenderTaskCache { key: RenderTaskCacheKey, is_opaque: bool, texture_cache: &mut TextureCache, + gpu_cache: &mut GpuCache, gpu_buffer_builder: &mut GpuBufferBuilderF, rg_builder: &mut RenderTaskGraphBuilder, - f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId, + f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId, ) -> (RenderTaskId, bool) { let frame_id = self.frame_id; let size = key.size; @@ -305,10 +309,10 @@ impl RenderTaskCache { cache_entry.frame_id = self.frame_id; // Check if this texture cache handle is valid. - if texture_cache.request(&cache_entry.handle, gpu_buffer_builder) { + if texture_cache.request(&cache_entry.handle, gpu_cache) { // Invoke user closure to get render task chain // to draw this into the texture cache. - let render_task_id = f(rg_builder, gpu_buffer_builder); + let render_task_id = f(rg_builder, gpu_buffer_builder, gpu_cache); cache_entry.user_data = None; cache_entry.is_opaque = is_opaque; @@ -324,7 +328,7 @@ impl RenderTaskCache { task_size, render_task, cache_entry, - gpu_buffer_builder, + gpu_cache, texture_cache, ); } diff --git a/gfx/wr/webrender/src/render_task_graph.rs b/gfx/wr/webrender/src/render_task_graph.rs @@ -9,13 +9,12 @@ use api::units::*; use api::ImageFormat; +use crate::gpu_cache::{GpuCache, GpuCacheAddress}; use crate::internal_types::{TextureSource, CacheTextureId, FastHashMap, FastHashSet, FrameId}; use crate::internal_types::size_of_frame_vec; use crate::render_task::{StaticRenderTaskSurface, RenderTaskLocation, RenderTask}; use crate::render_target::RenderTargetKind; use crate::render_task::{RenderTaskData, RenderTaskKind}; -use crate::renderer::GpuBufferAddress; -use crate::renderer::GpuBufferBuilder; use crate::resource_cache::ResourceCache; use crate::texture_pack::GuillotineAllocator; use crate::prim_store::DeferredResolve; @@ -281,7 +280,7 @@ impl RenderTaskGraphBuilder { pub fn end_frame( &mut self, resource_cache: &mut ResourceCache, - gpu_buffers: &mut GpuBufferBuilder, + gpu_cache: &mut GpuCache, deferred_resolves: &mut FrameVec<DeferredResolve>, max_shared_surface_size: i32, memory: &FrameMemory, @@ -631,7 +630,7 @@ impl RenderTaskGraphBuilder { Some(resolve_image( info.request, resource_cache, - &mut gpu_buffers.f32, + gpu_cache, deferred_resolves, info.is_composited, )) @@ -661,7 +660,7 @@ impl RenderTaskGraphBuilder { task.write_gpu_blocks( target_rect, - gpu_buffers, + gpu_cache, ); graph.task_data.push( @@ -724,14 +723,16 @@ impl RenderTaskGraph { pub fn resolve_location( &self, task_id: impl Into<Option<RenderTaskId>>, - ) -> Option<(GpuBufferAddress, TextureSource)> { - self.resolve_impl(task_id.into()?) + gpu_cache: &GpuCache, + ) -> Option<(GpuCacheAddress, TextureSource)> { + self.resolve_impl(task_id.into()?, gpu_cache) } fn resolve_impl( &self, task_id: RenderTaskId, - ) -> Option<(GpuBufferAddress, TextureSource)> { + gpu_cache: &GpuCache, + ) -> Option<(GpuCacheAddress, TextureSource)> { let task = &self[task_id]; let texture_source = task.get_texture_source(); @@ -739,7 +740,7 @@ impl RenderTaskGraph { return None; } - let uv_address = task.get_texture_address(); + let uv_address = task.get_texture_address(gpu_cache); Some((uv_address, texture_source)) } @@ -1094,20 +1095,19 @@ impl RenderTaskGraphBuilder { total_surface_count: usize, unique_surfaces: &[(i32, i32, ImageFormat)], ) { - use crate::{internal_types::FrameStamp, renderer::{GpuBufferBuilderF, GpuBufferBuilderI}}; + use crate::internal_types::FrameStamp; use api::{DocumentId, IdNamespace}; let mut rc = ResourceCache::new_for_testing(); + let mut gc = GpuCache::new(); let mut frame_stamp = FrameStamp::first(DocumentId::new(IdNamespace(1), 1)); frame_stamp.advance(); + gc.prepare_for_frames(); + gc.begin_frame(frame_stamp); let frame_memory = FrameMemory::fallback(); - let mut gpu_buffers = GpuBufferBuilder { - f32: GpuBufferBuilderF::new(&frame_memory), - i32: GpuBufferBuilderI::new(&frame_memory), - }; - let g = self.end_frame(&mut rc, &mut gpu_buffers, &mut frame_memory.new_vec(), 2048, &frame_memory); + let g = self.end_frame(&mut rc, &mut gc, &mut frame_memory.new_vec(), 2048, &frame_memory); g.print(); assert_eq!(g.passes.len(), pass_count); diff --git a/gfx/wr/webrender/src/renderer/gpu_buffer.rs b/gfx/wr/webrender/src/renderer/gpu_buffer.rs @@ -11,8 +11,6 @@ */ -use std::i32; - use crate::gpu_types::UvRectKind; use crate::internal_types::{FrameMemory, FrameVec}; use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH; @@ -72,44 +70,24 @@ pub struct GpuBufferBlockI { data: [i32; 4], } -// TODO(gw): Temporarily encode GPU Cache addresses as a single int. -// In the future, we can change the PrimitiveInstanceData struct -// to use 2x u16 for the vertex attribute instead of an i32. -#[repr(transparent)] #[derive(Copy, Debug, Clone, MallocSizeOf, Eq, PartialEq)] #[cfg_attr(feature = "capture", derive(Serialize))] #[cfg_attr(feature = "replay", derive(Deserialize))] -pub struct GpuBufferAddress(u32); +pub struct GpuBufferAddress { + pub u: u16, + pub v: u16, +} impl GpuBufferAddress { - pub fn new(u: u16, v: u16) -> Self { - GpuBufferAddress( - v as u32 * MAX_VERTEX_TEXTURE_WIDTH as u32 + u as u32 - ) - } - - pub fn as_u32(self) -> u32 { - self.0 - } - - pub fn from_u32(val: u32) -> Self { - GpuBufferAddress(val) - } - #[allow(dead_code)] pub fn as_int(self) -> i32 { - self.0 as i32 + // TODO(gw): Temporarily encode GPU Cache addresses as a single int. + // In the future, we can change the PrimitiveInstanceData struct + // to use 2x u16 for the vertex attribute instead of an i32. + self.v as i32 * MAX_VERTEX_TEXTURE_WIDTH as i32 + self.u as i32 } - #[allow(dead_code)] - pub fn uv(self) -> (u16, u16) { - ( - (self.0 as usize % MAX_VERTEX_TEXTURE_WIDTH) as u16, - (self.0 as usize / MAX_VERTEX_TEXTURE_WIDTH) as u16, - ) - } - - pub const INVALID: GpuBufferAddress = GpuBufferAddress(u32::MAX - 1); + pub const INVALID: GpuBufferAddress = GpuBufferAddress { u: !0, v: !0 }; } impl GpuBufferBlockF { @@ -238,7 +216,7 @@ pub struct GpuBufferWriter<'a, T> { buffer: &'a mut FrameVec<T>, deferred: &'a mut Vec<DeferredBlock>, index: usize, - max_block_count: usize, + block_count: usize, } impl<'a, T> GpuBufferWriter<'a, T> where T: Texel { @@ -246,13 +224,13 @@ impl<'a, T> GpuBufferWriter<'a, T> where T: Texel { buffer: &'a mut FrameVec<T>, deferred: &'a mut Vec<DeferredBlock>, index: usize, - max_block_count: usize, + block_count: usize, ) -> Self { GpuBufferWriter { buffer, deferred, index, - max_block_count, + block_count, } } @@ -280,15 +258,18 @@ impl<'a, T> GpuBufferWriter<'a, T> where T: Texel { /// Close this writer, returning the GPU address of this set of block(s). pub fn finish(self) -> GpuBufferAddress { - assert!(self.buffer.len() <= self.index + self.max_block_count); + assert_eq!(self.buffer.len(), self.index + self.block_count); - GpuBufferAddress(self.index as u32) + GpuBufferAddress { + u: (self.index % MAX_VERTEX_TEXTURE_WIDTH) as u16, + v: (self.index / MAX_VERTEX_TEXTURE_WIDTH) as u16, + } } } impl<'a, T> Drop for GpuBufferWriter<'a, T> { fn drop(&mut self) { - assert!(self.buffer.len() <= self.index + self.max_block_count, "Attempt to write too many GpuBuffer blocks"); + assert_eq!(self.buffer.len(), self.index + self.block_count, "Claimed block_count was not written"); } } @@ -326,17 +307,20 @@ impl<T> GpuBufferBuilderImpl<T> where T: Texel + std::convert::From<DeviceIntRec self.data.extend_from_slice(blocks); - GpuBufferAddress(index as u32) + GpuBufferAddress { + u: (index % MAX_VERTEX_TEXTURE_WIDTH) as u16, + v: (index / MAX_VERTEX_TEXTURE_WIDTH) as u16, + } } /// Begin writing a specific number of blocks pub fn write_blocks( &mut self, - max_block_count: usize, + block_count: usize, ) -> GpuBufferWriter<T> { - assert!(max_block_count <= MAX_VERTEX_TEXTURE_WIDTH); + assert!(block_count <= MAX_VERTEX_TEXTURE_WIDTH); - if (self.data.len() % MAX_VERTEX_TEXTURE_WIDTH) + max_block_count > MAX_VERTEX_TEXTURE_WIDTH { + if (self.data.len() % MAX_VERTEX_TEXTURE_WIDTH) + block_count > MAX_VERTEX_TEXTURE_WIDTH { while self.data.len() % MAX_VERTEX_TEXTURE_WIDTH != 0 { self.data.push(T::default()); } @@ -348,23 +332,10 @@ impl<T> GpuBufferBuilderImpl<T> where T: Texel + std::convert::From<DeviceIntRec &mut self.data, &mut self.deferred, index, - max_block_count, + block_count, ) } - // Reserve space in the gpu buffer for data that will be written by the - // renderer. - pub fn reserve_renderer_deferred_blocks(&mut self, block_count: usize) -> GpuBufferAddress { - let index = self.data.len(); - - self.data.reserve(block_count); - for _ in 0 ..block_count { - self.data.push(Default::default()); - } - - GpuBufferAddress(index as u32) - } - pub fn finalize( mut self, render_tasks: &RenderTaskGraph, diff --git a/gfx/wr/webrender/src/renderer/gpu_cache.rs b/gfx/wr/webrender/src/renderer/gpu_cache.rs @@ -0,0 +1,541 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use std::{cmp, mem}; +use api::units::*; +use malloc_size_of::MallocSizeOfOps; +use crate::{ + device::{CustomVAO, Device, DrawTarget, Program, ReadTarget, Texture, TextureFilter, UploadPBOPool, VBO}, + gpu_cache::{GpuBlockData, GpuCacheUpdate, GpuCacheUpdateList}, + internal_types::{FrameId, RenderTargetInfo, Swizzle}, + prim_store::DeferredResolve, + profiler, + render_api::MemoryReport, +}; + +/// Enabling this toggle would force the GPU cache scattered texture to +/// be resized every frame, which enables GPU debuggers to see if this +/// is performed correctly. +const GPU_CACHE_RESIZE_TEST: bool = false; + +/// Tracks the state of each row in the GPU cache texture. +struct CacheRow { + /// Mirrored block data on CPU for this row. We store a copy of + /// the data on the CPU side to improve upload batching. + cpu_blocks: Box<[GpuBlockData; super::MAX_VERTEX_TEXTURE_WIDTH]>, + /// The first offset in this row that is dirty. + min_dirty: u16, + /// The last offset in this row that is dirty. + max_dirty: u16, +} + +impl CacheRow { + fn new() -> Self { + CacheRow { + cpu_blocks: Box::new([GpuBlockData::EMPTY; super::MAX_VERTEX_TEXTURE_WIDTH]), + min_dirty: super::MAX_VERTEX_TEXTURE_WIDTH as _, + max_dirty: 0, + } + } + + fn is_dirty(&self) -> bool { + return self.min_dirty < self.max_dirty; + } + + fn clear_dirty(&mut self) { + self.min_dirty = super::MAX_VERTEX_TEXTURE_WIDTH as _; + self.max_dirty = 0; + } + + fn add_dirty(&mut self, block_offset: usize, block_count: usize) { + self.min_dirty = self.min_dirty.min(block_offset as _); + self.max_dirty = self.max_dirty.max((block_offset + block_count) as _); + } + + fn dirty_blocks(&self) -> &[GpuBlockData] { + return &self.cpu_blocks[self.min_dirty as usize .. self.max_dirty as usize]; + } +} + +/// The bus over which CPU and GPU versions of the GPU cache +/// get synchronized. +enum GpuCacheBus { + /// PBO-based updates, currently operate on a row granularity. + /// Therefore, are subject to fragmentation issues. + PixelBuffer { + /// Per-row data. + rows: Vec<CacheRow>, + }, + /// Shader-based scattering updates. Currently rendered by a set + /// of points into the GPU texture, each carrying a `GpuBlockData`. + Scatter { + /// Special program to run the scattered update. + program: Program, + /// VAO containing the source vertex buffers. + vao: CustomVAO, + /// VBO for positional data, supplied as normalized `u16`. + buf_position: VBO<[u16; 2]>, + /// VBO for gpu block data. + buf_value: VBO<GpuBlockData>, + /// Currently stored block count. + count: usize, + }, +} + +/// The device-specific representation of the cache texture in gpu_cache.rs +pub struct GpuCacheTexture { + texture: Option<Texture>, + bus: GpuCacheBus, +} + +impl GpuCacheTexture { + /// Ensures that we have an appropriately-sized texture. + fn ensure_texture(&mut self, device: &mut Device, height: i32) { + // If we already have a texture that works, we're done. + if self.texture.as_ref().map_or(false, |t| t.get_dimensions().height >= height) { + if GPU_CACHE_RESIZE_TEST { + // Special debug mode - resize the texture even though it's fine. + } else { + return; + } + } + + // Take the old texture, if any. + let blit_source = self.texture.take(); + + // Create the new texture. + assert!(height >= 2, "Height is too small for ANGLE"); + let new_size = DeviceIntSize::new(super::MAX_VERTEX_TEXTURE_WIDTH as _, height); + // GpuCacheBus::Scatter always requires the texture to be a render target. For + // GpuCacheBus::PixelBuffer, we only create the texture with a render target if + // RGBAF32 render targets are actually supported, and only if glCopyImageSubData + // is not. glCopyImageSubData does not require a render target to copy the texture + // data, and if neither RGBAF32 render targets nor glCopyImageSubData is supported, + // we simply re-upload the entire contents rather than copying upon resize. + let supports_copy_image_sub_data = device.get_capabilities().supports_copy_image_sub_data; + let supports_color_buffer_float = device.get_capabilities().supports_color_buffer_float; + let rt_info = if matches!(self.bus, GpuCacheBus::PixelBuffer { .. }) + && (supports_copy_image_sub_data || !supports_color_buffer_float) + { + None + } else { + Some(RenderTargetInfo { has_depth: false }) + }; + let mut texture = device.create_texture( + api::ImageBufferKind::Texture2D, + api::ImageFormat::RGBAF32, + new_size.width, + new_size.height, + TextureFilter::Nearest, + rt_info, + ); + + // Copy the contents of the previous texture, if applicable. + if let Some(blit_source) = blit_source { + if !supports_copy_image_sub_data && !supports_color_buffer_float { + // Cannot copy texture, so must re-upload everything. + match self.bus { + GpuCacheBus::PixelBuffer { ref mut rows } => { + for row in rows { + row.add_dirty(0, super::MAX_VERTEX_TEXTURE_WIDTH); + } + } + GpuCacheBus::Scatter { .. } => { + panic!("Texture must be copyable to use scatter GPU cache bus method"); + } + } + } else { + device.copy_entire_texture(&mut texture, &blit_source); + } + device.delete_texture(blit_source); + } + + self.texture = Some(texture); + } + + pub fn new(device: &mut Device, use_scatter: bool) -> Result<Self, super::RendererError> { + use super::desc::GPU_CACHE_UPDATE; + + let bus = if use_scatter { + assert!( + device.get_capabilities().supports_color_buffer_float, + "GpuCache scatter method requires EXT_color_buffer_float", + ); + let program = device.create_program_linked( + "gpu_cache_update", + &[], + &GPU_CACHE_UPDATE, + )?; + let buf_position = device.create_vbo(); + let buf_value = device.create_vbo(); + //Note: the vertex attributes have to be supplied in the same order + // as for program creation, but each assigned to a different stream. + let vao = device.create_custom_vao(&[ + buf_position.stream_with(&GPU_CACHE_UPDATE.vertex_attributes[0..1]), + buf_value .stream_with(&GPU_CACHE_UPDATE.vertex_attributes[1..2]), + ]); + GpuCacheBus::Scatter { + program, + vao, + buf_position, + buf_value, + count: 0, + } + } else { + GpuCacheBus::PixelBuffer { + rows: Vec::new(), + } + }; + + Ok(GpuCacheTexture { + texture: None, + bus, + }) + } + + pub fn deinit(mut self, device: &mut Device) { + if let Some(t) = self.texture.take() { + device.delete_texture(t); + } + if let GpuCacheBus::Scatter { program, vao, buf_position, buf_value, .. } = self.bus { + device.delete_program(program); + device.delete_custom_vao(vao); + device.delete_vbo(buf_position); + device.delete_vbo(buf_value); + } + } + + pub fn get_height(&self) -> i32 { + self.texture.as_ref().map_or(0, |t| t.get_dimensions().height) + } + + #[cfg(feature = "capture")] + pub fn get_texture(&self) -> &Texture { + self.texture.as_ref().unwrap() + } + + fn prepare_for_updates( + &mut self, + device: &mut Device, + total_block_count: usize, + max_height: i32, + ) { + self.ensure_texture(device, max_height); + match self.bus { + GpuCacheBus::PixelBuffer { .. } => {}, + GpuCacheBus::Scatter { + ref mut buf_position, + ref mut buf_value, + ref mut count, + .. + } => { + *count = 0; + if total_block_count > buf_value.allocated_count() { + device.allocate_vbo(buf_position, total_block_count, super::ONE_TIME_USAGE_HINT); + device.allocate_vbo(buf_value, total_block_count, super::ONE_TIME_USAGE_HINT); + } + } + } + } + + pub fn invalidate(&mut self) { + match self.bus { + GpuCacheBus::PixelBuffer { ref mut rows, .. } => { + info!("Invalidating GPU caches"); + for row in rows { + row.add_dirty(0, super::MAX_VERTEX_TEXTURE_WIDTH); + } + } + GpuCacheBus::Scatter { .. } => { + warn!("Unable to invalidate scattered GPU cache"); + } + } + } + + fn update(&mut self, device: &mut Device, updates: &GpuCacheUpdateList) { + match self.bus { + GpuCacheBus::PixelBuffer { ref mut rows, .. } => { + for update in &updates.updates { + match *update { + GpuCacheUpdate::Copy { + block_index, + block_count, + address, + } => { + let row = address.v as usize; + + // Ensure that the CPU-side shadow copy of the GPU cache data has enough + // rows to apply this patch. + while rows.len() <= row { + // Add a new row. + rows.push(CacheRow::new()); + } + + // Copy the blocks from the patch array in the shadow CPU copy. + let block_offset = address.u as usize; + let data = &mut rows[row].cpu_blocks; + for i in 0 .. block_count { + data[block_offset + i] = updates.blocks[block_index + i]; + } + + // This row is dirty (needs to be updated in GPU texture). + rows[row].add_dirty(block_offset, block_count); + } + } + } + } + GpuCacheBus::Scatter { + ref buf_position, + ref buf_value, + ref mut count, + .. + } => { + //TODO: re-use this heap allocation + // Unused positions will be left as 0xFFFF, which translates to + // (1.0, 1.0) in the vertex output position and gets culled out + let mut position_data = vec![[!0u16; 2]; updates.blocks.len()]; + let size = self.texture.as_ref().unwrap().get_dimensions().to_usize(); + + for update in &updates.updates { + match *update { + GpuCacheUpdate::Copy { + block_index, + block_count, + address, + } => { + // Convert the absolute texel position into normalized + let y = ((2*address.v as usize + 1) << 15) / size.height; + for i in 0 .. block_count { + let x = ((2*address.u as usize + 2*i + 1) << 15) / size.width; + position_data[block_index + i] = [x as _, y as _]; + } + } + } + } + + device.fill_vbo(buf_value, &updates.blocks, *count); + device.fill_vbo(buf_position, &position_data, *count); + *count += position_data.len(); + } + } + } + + fn flush(&mut self, device: &mut Device, pbo_pool: &mut UploadPBOPool) -> usize { + let texture = self.texture.as_ref().unwrap(); + match self.bus { + GpuCacheBus::PixelBuffer { ref mut rows } => { + let rows_dirty = rows + .iter() + .filter(|row| row.is_dirty()) + .count(); + if rows_dirty == 0 { + return 0 + } + + let mut uploader = device.upload_texture(pbo_pool); + + for (row_index, row) in rows.iter_mut().enumerate() { + if !row.is_dirty() { + continue; + } + + let blocks = row.dirty_blocks(); + let rect = DeviceIntRect::from_origin_and_size( + DeviceIntPoint::new(row.min_dirty as i32, row_index as i32), + DeviceIntSize::new(blocks.len() as i32, 1), + ); + + uploader.upload(device, texture, rect, None, None, blocks.as_ptr(), blocks.len()); + + row.clear_dirty(); + } + + uploader.flush(device); + + rows_dirty + } + GpuCacheBus::Scatter { ref program, ref vao, count, .. } => { + device.disable_depth(); + device.set_blend(false); + device.bind_program(program); + device.bind_custom_vao(vao); + device.bind_draw_target( + DrawTarget::from_texture( + texture, + false, + ), + ); + device.draw_nonindexed_points(0, count as _); + 0 + } + } + } + + #[cfg(feature = "replay")] + pub fn remove_texture(&mut self, device: &mut Device) { + if let Some(t) = self.texture.take() { + device.delete_texture(t); + } + } + + #[cfg(feature = "replay")] + pub fn load_from_data(&mut self, texture: Texture, data: Vec<u8>) { + assert!(self.texture.is_none()); + match self.bus { + GpuCacheBus::PixelBuffer { ref mut rows, .. } => { + let dim = texture.get_dimensions(); + let blocks = unsafe { + std::slice::from_raw_parts( + data.as_ptr() as *const GpuBlockData, + data.len() / mem::size_of::<GpuBlockData>(), + ) + }; + // fill up the CPU cache from the contents we just loaded + rows.clear(); + rows.extend((0 .. dim.height).map(|_| CacheRow::new())); + let chunks = blocks.chunks(super::MAX_VERTEX_TEXTURE_WIDTH); + debug_assert_eq!(chunks.len(), rows.len()); + for (row, chunk) in rows.iter_mut().zip(chunks) { + row.cpu_blocks.copy_from_slice(chunk); + } + } + GpuCacheBus::Scatter { .. } => {} + } + self.texture = Some(texture); + } + + pub fn report_memory_to(&self, report: &mut MemoryReport, size_op_funs: &MallocSizeOfOps) { + if let GpuCacheBus::PixelBuffer{ref rows, ..} = self.bus { + for row in rows.iter() { + report.gpu_cache_cpu_mirror += unsafe { (size_op_funs.size_of_op)(row.cpu_blocks.as_ptr() as *const _) }; + } + } + + // GPU cache GPU memory. + report.gpu_cache_textures += + self.texture.as_ref().map_or(0, |t| t.size_in_bytes()); + } + + pub fn gpu_size_in_bytes(&self) -> usize { + match &self.texture { + Some(tex) => tex.size_in_bytes(), + None => 0, + } + } +} + +impl super::Renderer { + pub fn update_gpu_cache(&mut self) { + let _gm = self.gpu_profiler.start_marker("gpu cache update"); + + // For an artificial stress test of GPU cache resizing, + // always pass an extra update list with at least one block in it. + let gpu_cache_height = self.gpu_cache_texture.get_height(); + if gpu_cache_height != 0 && GPU_CACHE_RESIZE_TEST { + self.pending_gpu_cache_updates.push(GpuCacheUpdateList { + frame_id: FrameId::INVALID, + clear: false, + height: gpu_cache_height, + blocks: vec![[1f32; 4].into()], + updates: Vec::new(), + debug_commands: Vec::new(), + }); + } + + let (updated_blocks, max_requested_height) = self + .pending_gpu_cache_updates + .iter() + .fold((0, gpu_cache_height), |(count, height), list| { + (count + list.blocks.len(), cmp::max(height, list.height)) + }); + + if max_requested_height > self.get_max_texture_size() && !self.gpu_cache_overflow { + self.gpu_cache_overflow = true; + self.renderer_errors.push(super::RendererError::MaxTextureSize); + } + + // Note: if we decide to switch to scatter-style GPU cache update + // permanently, we can have this code nicer with `BufferUploader` kind + // of helper, similarly to how `TextureUploader` API is used. + self.gpu_cache_texture.prepare_for_updates( + &mut self.device, + updated_blocks, + max_requested_height, + ); + + for update_list in self.pending_gpu_cache_updates.drain(..) { + assert!(update_list.height <= max_requested_height); + if update_list.frame_id > self.gpu_cache_frame_id { + self.gpu_cache_frame_id = update_list.frame_id + } + self.gpu_cache_texture + .update(&mut self.device, &update_list); + } + + self.profile.start_time(profiler::GPU_CACHE_UPLOAD_TIME); + let updated_rows = self.gpu_cache_texture.flush( + &mut self.device, + &mut self.texture_upload_pbo_pool + ); + self.gpu_cache_upload_time += self.profile.end_time(profiler::GPU_CACHE_UPLOAD_TIME); + + self.profile.set(profiler::GPU_CACHE_ROWS_UPDATED, updated_rows); + self.profile.set(profiler::GPU_CACHE_BLOCKS_UPDATED, updated_blocks); + } + + pub fn prepare_gpu_cache( + &mut self, + deferred_resolves: &[DeferredResolve], + ) -> Result<(), super::RendererError> { + self.profile.start_time(profiler::GPU_CACHE_PREPARE_TIME); + + if self.pending_gpu_cache_clear { + let use_scatter = + matches!(self.gpu_cache_texture.bus, GpuCacheBus::Scatter { .. }); + let new_cache = match GpuCacheTexture::new(&mut self.device, use_scatter) { + Ok(cache) => cache, + Err(err) => { + self.profile.end_time(profiler::GPU_CACHE_PREPARE_TIME); + return Err(err); + } + }; + let old_cache = mem::replace(&mut self.gpu_cache_texture, new_cache); + old_cache.deinit(&mut self.device); + self.pending_gpu_cache_clear = false; + } + + let deferred_update_list = self.update_deferred_resolves(deferred_resolves); + self.pending_gpu_cache_updates.extend(deferred_update_list); + + self.update_gpu_cache(); + + // Note: the texture might have changed during the `update`, + // so we need to bind it here. + self.device.bind_texture( + super::TextureSampler::GpuCache, + self.gpu_cache_texture.texture.as_ref().unwrap(), + Swizzle::default(), + ); + + self.profile.end_time(profiler::GPU_CACHE_PREPARE_TIME); + + Ok(()) + } + + pub fn read_gpu_cache(&mut self) -> (DeviceIntSize, Vec<u8>) { + let texture = self.gpu_cache_texture.texture.as_ref().unwrap(); + let size = device_size_as_framebuffer_size(texture.get_dimensions()); + let mut texels = vec![0; (size.width * size.height * 16) as usize]; + self.device.begin_frame(); + self.device.bind_read_target(ReadTarget::from_texture(texture)); + self.device.read_pixels_into( + size.into(), + api::ImageFormat::RGBAF32, + &mut texels, + ); + self.device.reset_read_target(); + self.device.end_frame(); + (texture.get_dimensions(), texels) + } +} diff --git a/gfx/wr/webrender/src/renderer/init.rs b/gfx/wr/webrender/src/renderer/init.rs @@ -19,7 +19,7 @@ use crate::frame_builder::FrameBuilderConfig; use crate::glyph_cache::GlyphCache; use glyph_rasterizer::{GlyphRasterThread, GlyphRasterizer, SharedFontResources}; use crate::gpu_types::PrimitiveInstanceData; -use crate::internal_types::{FastHashMap, FastHashSet}; +use crate::internal_types::{FastHashMap, FastHashSet, FrameId}; use crate::picture; use crate::profiler::{self, Profiler, TransactionProfile}; use crate::device::query::{GpuProfiler, GpuDebugMethod}; @@ -29,7 +29,7 @@ use crate::scene_builder_thread::{SceneBuilderThread, SceneBuilderThreadChannels use crate::texture_cache::{TextureCache, TextureCacheConfig}; use crate::picture_textures::PictureTextures; use crate::renderer::{ - debug, vertex, gl, + debug, gpu_cache, vertex, gl, Renderer, DebugOverlayState, BufferDamageTracker, PipelineInfo, TextureResolver, RendererError, ShaderPrecacheFlags, VERTEX_DATA_TEXTURE_COUNT, upload::UploadTexturePool, @@ -514,8 +514,25 @@ pub fn create_webrender_instance( vertex_data_textures.push(vertex::VertexDataTextures::new()); } + // On some (mostly older, integrated) GPUs, the normal GPU texture cache update path + // doesn't work well when running on ANGLE, causing CPU stalls inside D3D and/or the + // GPU driver. See https://bugzilla.mozilla.org/show_bug.cgi?id=1576637 for much + // more detail. To reduce the number of code paths we have active that require testing, + // we will enable the GPU cache scatter update path on all devices running with ANGLE. + // We want a better solution long-term, but for now this is a significant performance + // improvement on HD4600 era GPUs, and shouldn't hurt performance in a noticeable + // way on other systems running under ANGLE. let is_software = device.get_capabilities().renderer_name.starts_with("Software"); + // On other GL platforms, like macOS or Android, creating many PBOs is very inefficient. + // This is what happens in GPU cache updates in PBO path. Instead, we switch everything + // except software GL to use the GPU scattered updates. + let supports_scatter = device.get_capabilities().supports_color_buffer_float; + let gpu_cache_texture = gpu_cache::GpuCacheTexture::new( + &mut device, + supports_scatter && !is_software, + )?; + device.end_frame(); let backend_notifier = notifier.clone(); @@ -763,6 +780,8 @@ pub fn create_webrender_instance( pending_texture_updates: Vec::new(), pending_texture_cache_updates: false, pending_native_surface_updates: Vec::new(), + pending_gpu_cache_updates: Vec::new(), + pending_gpu_cache_clear: false, pending_shader_updates: Vec::new(), shaders, debug: debug::LazyInitializedDebugRenderer::new(), @@ -770,6 +789,7 @@ pub fn create_webrender_instance( profile: TransactionProfile::new(), frame_counter: 0, resource_upload_time: 0.0, + gpu_cache_upload_time: 0.0, profiler: Profiler::new(), max_recorded_profiles: options.max_recorded_profiles, clear_color: options.clear_color, @@ -788,6 +808,10 @@ pub fn create_webrender_instance( size_of_ops: make_size_of_ops(), cpu_profiles: VecDeque::new(), gpu_profiles: VecDeque::new(), + gpu_cache_texture, + gpu_cache_debug_chunks: Vec::new(), + gpu_cache_frame_id: FrameId::INVALID, + gpu_cache_overflow: false, texture_upload_pbo_pool, staging_texture_pool, texture_resolver, diff --git a/gfx/wr/webrender/src/renderer/mod.rs b/gfx/wr/webrender/src/renderer/mod.rs @@ -70,9 +70,11 @@ use crate::device::FBOId; use crate::debug_item::DebugItem; use crate::frame_builder::Frame; use glyph_rasterizer::GlyphFormat; +use crate::gpu_cache::{GpuCacheUpdate, GpuCacheUpdateList}; +use crate::gpu_cache::{GpuCacheDebugChunk, GpuCacheDebugCmd}; use crate::gpu_types::{ScalingInstance, SvgFilterInstance, SVGFEFilterInstance, CopyInstance, PrimitiveInstanceData}; use crate::gpu_types::{BlurInstance, ClearInstance, CompositeInstance, ZBufferId}; -use crate::internal_types::{TextureSource, TextureSourceExternal, FrameVec}; +use crate::internal_types::{TextureSource, TextureSourceExternal, FrameId, FrameVec}; #[cfg(any(feature = "capture", feature = "replay"))] use crate::internal_types::DebugOutput; use crate::internal_types::{CacheTextureId, FastHashMap, FastHashSet, RenderedDocument, ResultMsg}; @@ -120,6 +122,7 @@ use std::collections::hash_map::Entry; mod debug; mod gpu_buffer; +mod gpu_cache; mod shade; mod vertex; mod upload; @@ -129,7 +132,7 @@ pub use debug::DebugRenderer; pub use shade::{PendingShadersToPrecache, Shaders, SharedShaders}; pub use vertex::{desc, VertexArrayKind, MAX_VERTEX_TEXTURE_WIDTH}; pub use gpu_buffer::{GpuBuffer, GpuBufferF, GpuBufferBuilderF, GpuBufferI, GpuBufferBuilderI}; -pub use gpu_buffer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferWriterF, GpuBufferBlockF}; +pub use gpu_buffer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferWriterF}; /// The size of the array of each type of vertex data texture that /// is round-robin-ed each frame during bind_frame_data. Doing this @@ -387,6 +390,7 @@ pub(crate) enum TextureSampler { Color0, Color1, Color2, + GpuCache, TransformPalette, RenderTasks, Dither, @@ -416,14 +420,15 @@ impl Into<TextureSlot> for TextureSampler { TextureSampler::Color0 => TextureSlot(0), TextureSampler::Color1 => TextureSlot(1), TextureSampler::Color2 => TextureSlot(2), - TextureSampler::TransformPalette => TextureSlot(3), - TextureSampler::RenderTasks => TextureSlot(4), - TextureSampler::Dither => TextureSlot(5), - TextureSampler::PrimitiveHeadersF => TextureSlot(6), - TextureSampler::PrimitiveHeadersI => TextureSlot(7), - TextureSampler::ClipMask => TextureSlot(8), - TextureSampler::GpuBufferF => TextureSlot(9), - TextureSampler::GpuBufferI => TextureSlot(10), + TextureSampler::GpuCache => TextureSlot(3), + TextureSampler::TransformPalette => TextureSlot(4), + TextureSampler::RenderTasks => TextureSlot(5), + TextureSampler::Dither => TextureSlot(6), + TextureSampler::PrimitiveHeadersF => TextureSlot(7), + TextureSampler::PrimitiveHeadersI => TextureSlot(8), + TextureSampler::ClipMask => TextureSlot(9), + TextureSampler::GpuBufferF => TextureSlot(10), + TextureSampler::GpuBufferI => TextureSlot(11), } } } @@ -820,6 +825,8 @@ pub struct Renderer { /// True if there are any TextureCacheUpdate pending. pending_texture_cache_updates: bool, pending_native_surface_updates: Vec<NativeSurfaceOperation>, + pending_gpu_cache_updates: Vec<GpuCacheUpdateList>, + pending_gpu_cache_clear: bool, pending_shader_updates: Vec<PathBuf>, active_documents: FastHashMap<DocumentId, RenderedDocument>, @@ -838,6 +845,7 @@ pub struct Renderer { profile: TransactionProfile, frame_counter: u64, resource_upload_time: f64, + gpu_cache_upload_time: f64, profiler: Profiler, #[cfg(feature = "debugger")] debugger: Debugger, @@ -847,9 +855,18 @@ pub struct Renderer { pub gpu_profiler: GpuProfiler, vaos: vertex::RendererVAOs, + gpu_cache_texture: gpu_cache::GpuCacheTexture, vertex_data_textures: Vec<vertex::VertexDataTextures>, current_vertex_data_textures: usize, + /// When the GPU cache debugger is enabled, we keep track of the live blocks + /// in the GPU cache so that we can use them for the debug display. This + /// member stores those live blocks, indexed by row. + gpu_cache_debug_chunks: Vec<Vec<GpuCacheDebugChunk>>, + + gpu_cache_frame_id: FrameId, + gpu_cache_overflow: bool, + pipeline_info: PipelineInfo, // Manages and resolves source textures IDs to real texture IDs. @@ -1102,6 +1119,32 @@ impl Renderer { self.pending_native_surface_updates.extend(resource_update_list.native_surface_updates); self.documents_seen.insert(document_id); } + ResultMsg::UpdateGpuCache(mut list) => { + if list.clear { + self.pending_gpu_cache_clear = true; + } + if list.clear { + self.gpu_cache_debug_chunks = Vec::new(); + } + for cmd in mem::replace(&mut list.debug_commands, Vec::new()) { + match cmd { + GpuCacheDebugCmd::Alloc(chunk) => { + let row = chunk.address.v as usize; + if row >= self.gpu_cache_debug_chunks.len() { + self.gpu_cache_debug_chunks.resize(row + 1, Vec::new()); + } + self.gpu_cache_debug_chunks[row].push(chunk); + }, + GpuCacheDebugCmd::Free(address) => { + let chunks = &mut self.gpu_cache_debug_chunks[address.v as usize]; + let pos = chunks.iter() + .position(|x| x.address == address).unwrap(); + chunks.remove(pos); + }, + } + } + self.pending_gpu_cache_updates.push(list); + } ResultMsg::UpdateResources { resource_updates, memory_pressure, @@ -1326,6 +1369,9 @@ impl Renderer { | DebugCommand::SimulateLongSceneBuild(_) | DebugCommand::EnableNativeCompositor(_) | DebugCommand::SetBatchingLookback(_) => {} + DebugCommand::InvalidateGpuCache => { + self.gpu_cache_texture.invalidate(); + } DebugCommand::SetFlags(flags) => { self.set_debug_flags(flags); } @@ -1461,6 +1507,7 @@ impl Renderer { DebugFlags::RENDER_TARGET_DBG | DebugFlags::TEXTURE_CACHE_DBG | DebugFlags::EPOCHS | + DebugFlags::GPU_CACHE_DBG | DebugFlags::PICTURE_CACHING_DBG | DebugFlags::PICTURE_BORDERS | DebugFlags::ZOOM_DBG | @@ -1701,30 +1748,39 @@ impl Renderer { "Cleared texture cache without sending new document frame."); } - self.update_deferred_resolves(&frame.deferred_resolves, &mut frame.gpu_buffer_f); + match self.prepare_gpu_cache(&frame.deferred_resolves) { + Ok(..) => { + assert!(frame.gpu_cache_frame_id <= self.gpu_cache_frame_id, + "Received frame depends on a later GPU cache epoch ({:?}) than one we received last via `UpdateGpuCache` ({:?})", + frame.gpu_cache_frame_id, self.gpu_cache_frame_id); - self.draw_frame( - frame, - device_size, - buffer_age, - &mut results, - ); + self.draw_frame( + frame, + device_size, + buffer_age, + &mut results, + ); - // TODO(nical): do this automatically by selecting counters in the wr profiler - // Profile marker for the number of invalidated picture cache - if thread_is_being_profiled() { - let duration = Duration::new(0,0); - if let Some(n) = self.profile.get(profiler::RENDERED_PICTURE_TILES) { - let message = (n as usize).to_string(); - add_text_marker("NumPictureCacheInvalidated", &message, duration); - } - } + // TODO(nical): do this automatically by selecting counters in the wr profiler + // Profile marker for the number of invalidated picture cache + if thread_is_being_profiled() { + let duration = Duration::new(0,0); + if let Some(n) = self.profile.get(profiler::RENDERED_PICTURE_TILES) { + let message = (n as usize).to_string(); + add_text_marker("NumPictureCacheInvalidated", &message, duration); + } + } - if device_size.is_some() { - self.draw_frame_debug_items(&frame.debug_items); - } + if device_size.is_some() { + self.draw_frame_debug_items(&frame.debug_items); + } - self.profile.merge(profile); + self.profile.merge(profile); + } + Err(e) => { + self.renderer_errors.push(e); + } + } self.unlock_external_images(&frame.deferred_resolves); @@ -1745,6 +1801,7 @@ impl Renderer { self.bind_debug_overlay(device_size).map(|draw_target| { self.draw_render_target_debug(&draw_target); self.draw_texture_cache_debug(&draw_target); + self.draw_gpu_cache_debug(device_size); self.draw_zoom_debug(device_size); self.draw_epoch_debug(); self.draw_window_visibility_debug(); @@ -1792,6 +1849,8 @@ impl Renderer { self.frame_counter += 1; results.stats.resource_upload_time = self.resource_upload_time; self.resource_upload_time = 0.0; + results.stats.gpu_cache_upload_time = self.gpu_cache_upload_time; + self.gpu_cache_upload_time = 0.0; if let Some(stats) = active_doc.frame_stats.take() { // Copy the full frame stats to RendererStats @@ -4886,23 +4945,28 @@ impl Renderer { } } - fn update_deferred_resolves( - &mut self, - deferred_resolves: &[DeferredResolve], - gpu_buffer: &mut GpuBufferF, - ) { + fn update_deferred_resolves(&mut self, deferred_resolves: &[DeferredResolve]) -> Option<GpuCacheUpdateList> { // The first thing we do is run through any pending deferred // resolves, and use a callback to get the UV rect for this // custom item. Then we patch the resource_rects structure // here before it's uploaded to the GPU. if deferred_resolves.is_empty() { - return; + return None; } let handler = self.external_image_handler .as_mut() .expect("Found external image, but no handler set!"); + let mut list = GpuCacheUpdateList { + frame_id: FrameId::INVALID, + clear: false, + height: self.gpu_cache_texture.get_height(), + blocks: Vec::new(), + updates: Vec::new(), + debug_commands: Vec::new(), + }; + for (i, deferred_resolve) in deferred_resolves.iter().enumerate() { self.gpu_profiler.place_marker("deferred resolve"); let props = &deferred_resolve.image_properties; @@ -4955,11 +5019,16 @@ impl Renderer { .external_images .insert(DeferredResolveIndex(i as u32), texture); - let addr = deferred_resolve.address; - let index = addr.as_u32() as usize; - gpu_buffer.data[index] = image.uv.to_array().into(); - gpu_buffer.data[index + 1] = [0f32; 4].into(); + list.updates.push(GpuCacheUpdate::Copy { + block_index: list.blocks.len(), + block_count: BLOCKS_PER_UV_RECT, + address: deferred_resolve.address, + }); + list.blocks.push(image.uv.into()); + list.blocks.push([0f32; 4].into()); } + + Some(list) } fn unlock_external_images( @@ -5252,6 +5321,10 @@ impl Renderer { let gpu_buffer_mb = (gpu_buffer_bytes_f + gpu_buffer_bytes_i) as f32 * bytes_to_mb; self.profile.set(profiler::GPU_BUFFER_MEM, gpu_buffer_mb); + let gpu_cache_bytes = self.gpu_cache_texture.gpu_size_in_bytes(); + let gpu_cache_mb = gpu_cache_bytes as f32 * bytes_to_mb; + self.profile.set(profiler::GPU_CACHE_MEM, gpu_cache_mb); + // Determine the present mode and dirty rects, if device_size // is Some(..). If it's None, no composite will occur and only // picture cache and texture cache targets will be updated. @@ -5959,6 +6032,42 @@ impl Renderer { } } + fn draw_gpu_cache_debug(&mut self, device_size: DeviceIntSize) { + if !self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) { + return; + } + + let debug_renderer = match self.debug.get_mut(&mut self.device) { + Some(render) => render, + None => return, + }; + + let (x_off, y_off) = (30f32, 30f32); + let height = self.gpu_cache_texture.get_height() + .min(device_size.height - (y_off as i32) * 2) as usize; + debug_renderer.add_quad( + x_off, + y_off, + x_off + MAX_VERTEX_TEXTURE_WIDTH as f32, + y_off + height as f32, + ColorU::new(80, 80, 80, 80), + ColorU::new(80, 80, 80, 80), + ); + + let upper = self.gpu_cache_debug_chunks.len().min(height); + for chunk in self.gpu_cache_debug_chunks[0..upper].iter().flatten() { + let color = ColorU::new(250, 0, 0, 200); + debug_renderer.add_quad( + x_off + chunk.address.u as f32, + y_off + chunk.address.v as f32, + x_off + chunk.address.u as f32 + chunk.size as f32, + y_off + chunk.address.v as f32 + 1.0, + color, + color, + ); + } + } + /// Pass-through to `Device::read_pixels_into`, used by Gecko's WR bindings. pub fn read_pixels_into(&mut self, rect: FramebufferIntRect, format: ImageFormat, output: &mut [u8]) { self.device.read_pixels_into(rect, format, output); @@ -5986,6 +6095,7 @@ impl Renderer { } compositor.deinit(&mut self.device); } + self.gpu_cache_texture.deinit(&mut self.device); if let Some(dither_matrix_texture) = self.dither_matrix_texture { self.device.delete_texture(dither_matrix_texture); } @@ -6026,6 +6136,9 @@ impl Renderer { pub fn report_memory(&self, swgl: *mut c_void) -> MemoryReport { let mut report = MemoryReport::default(); + // GPU cache CPU memory. + self.gpu_cache_texture.report_memory_to(&mut report, self.size_of_ops.as_ref().unwrap()); + self.staging_texture_pool.report_memory_to(&mut report, self.size_of_ops.as_ref().unwrap()); // Render task CPU memory. @@ -6142,6 +6255,7 @@ pub struct RendererStats { pub color_target_count: usize, pub texture_upload_mb: f64, pub resource_upload_time: f64, + pub gpu_cache_upload_time: f64, pub gecko_display_list_time: f64, pub wr_display_list_time: f64, pub scene_build_time: f64, @@ -6201,6 +6315,8 @@ struct PlainTexture { #[cfg_attr(feature = "replay", derive(Deserialize))] struct PlainRenderer { device_size: Option<DeviceIntSize>, + gpu_cache: PlainTexture, + gpu_cache_frame_id: FrameId, textures: FastHashMap<CacheTextureId, PlainTexture>, } @@ -6433,8 +6549,15 @@ impl Renderer { fs::create_dir(&path_textures).unwrap(); } + info!("saving GPU cache"); + self.update_gpu_cache(); // flush pending updates let mut plain_self = PlainRenderer { device_size: self.device_size, + gpu_cache: Self::save_texture( + self.gpu_cache_texture.get_texture(), + None, "gpu", &root, &mut self.device, + ), + gpu_cache_frame_id: self.gpu_cache_frame_id, textures: FastHashMap::default(), }; @@ -6543,6 +6666,7 @@ impl Renderer { } self.device.begin_frame(); + self.gpu_cache_texture.remove_texture(&mut self.device); if let Some(renderer) = config.deserialize_for_resource::<PlainRenderer, _>("renderer") { info!("loading cached textures"); @@ -6566,6 +6690,17 @@ impl Renderer { category: texture.category.unwrap_or(TextureCacheCategory::Standalone), }); } + + info!("loading gpu cache"); + let (t, gpu_cache_data) = Self::load_texture( + ImageBufferKind::Texture2D, + &renderer.gpu_cache, + Some(RenderTargetInfo { has_depth: false }), + &root, + &mut self.device, + ); + self.gpu_cache_texture.load_from_data(t, gpu_cache_data); + self.gpu_cache_frame_id = renderer.gpu_cache_frame_id; } else { info!("loading cached textures"); self.device.begin_frame(); diff --git a/gfx/wr/webrender/src/renderer/shade.rs b/gfx/wr/webrender/src/renderer/shade.rs @@ -287,6 +287,7 @@ impl LazilyCompiledShader { ("sColor0", TextureSampler::Color0), ("sTransformPalette", TextureSampler::TransformPalette), ("sRenderTasks", TextureSampler::RenderTasks), + ("sGpuCache", TextureSampler::GpuCache), ("sPrimitiveHeadersF", TextureSampler::PrimitiveHeadersF), ("sPrimitiveHeadersI", TextureSampler::PrimitiveHeadersI), ("sGpuBufferF", TextureSampler::GpuBufferF), @@ -304,6 +305,7 @@ impl LazilyCompiledShader { ("sDither", TextureSampler::Dither), ("sTransformPalette", TextureSampler::TransformPalette), ("sRenderTasks", TextureSampler::RenderTasks), + ("sGpuCache", TextureSampler::GpuCache), ("sPrimitiveHeadersF", TextureSampler::PrimitiveHeadersF), ("sPrimitiveHeadersI", TextureSampler::PrimitiveHeadersI), ("sClipMask", TextureSampler::ClipMask), diff --git a/gfx/wr/webrender/src/renderer/vertex.rs b/gfx/wr/webrender/src/renderer/vertex.rs @@ -479,8 +479,8 @@ pub mod desc { // specific clip attributes VertexAttribute { name: "aClipDataResourceAddress", - count: 1, - kind: VertexAttributeKind::I32, + count: 2, + kind: VertexAttributeKind::U16, }, VertexAttribute { name: "aClipSrcRectSize", @@ -505,6 +505,22 @@ pub mod desc { ], }; + pub const GPU_CACHE_UPDATE: VertexDescriptor = VertexDescriptor { + vertex_attributes: &[ + VertexAttribute { + name: "aPosition", + count: 2, + kind: VertexAttributeKind::U16Norm, + }, + VertexAttribute { + name: "aValue", + count: 4, + kind: VertexAttributeKind::F32, + }, + ], + instance_attributes: &[], + }; + pub const RESOLVE: VertexDescriptor = VertexDescriptor { vertex_attributes: &[VertexAttribute { name: "aPosition", @@ -562,8 +578,8 @@ pub mod desc { }, VertexAttribute { name: "aFilterExtraDataAddress", - count: 1, - kind: VertexAttributeKind::I32, + count: 2, + kind: VertexAttributeKind::U16, }, ], }; @@ -612,8 +628,8 @@ pub mod desc { }, VertexAttribute { name: "aFilterExtraDataAddress", - count: 1, - kind: VertexAttributeKind::I32, + count: 2, + kind: VertexAttributeKind::U16, }, ], }; diff --git a/gfx/wr/webrender/src/resource_cache.rs b/gfx/wr/webrender/src/resource_cache.rs @@ -27,6 +27,7 @@ use crate::glyph_cache::{GlyphCache, CachedGlyphInfo}; use crate::glyph_cache::GlyphCacheEntry; use glyph_rasterizer::{GLYPH_FLASHING, FontInstance, GlyphFormat, GlyphKey, GlyphRasterizer, GlyphRasterJob}; use glyph_rasterizer::{SharedFontResources, BaseFontInstance}; +use crate::gpu_cache::{GpuCache, GpuCacheAddress, GpuCacheHandle}; use crate::gpu_types::UvRectKind; use crate::internal_types::{ CacheTextureId, FastHashMap, FastHashSet, TextureSource, ResourceUpdateList, @@ -36,7 +37,7 @@ use crate::profiler::{self, TransactionProfile, bytes_to_mb}; use crate::render_task_graph::{RenderTaskId, RenderTaskGraphBuilder}; use crate::render_task_cache::{RenderTaskCache, RenderTaskCacheKey, RenderTaskParent}; use crate::render_task_cache::{RenderTaskCacheEntry, RenderTaskCacheEntryHandle}; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferBuilderF}; +use crate::renderer::GpuBufferBuilderF; use crate::surface::SurfaceBuilder; use euclid::point2; use smallvec::SmallVec; @@ -63,7 +64,7 @@ static NEXT_NATIVE_SURFACE_ID: AtomicUsize = AtomicUsize::new(0); #[cfg_attr(feature = "replay", derive(Deserialize))] pub struct GlyphFetchResult { pub index_in_text_run: i32, - pub uv_rect_address: GpuBufferAddress, + pub uv_rect_address: GpuCacheAddress, pub offset: DevicePoint, pub size: DeviceIntSize, pub scale: f32, @@ -83,7 +84,7 @@ pub struct GlyphFetchResult { #[cfg_attr(feature = "replay", derive(Deserialize))] pub struct CacheItem { pub texture_id: TextureSource, - pub uv_rect_handle: GpuBufferAddress, + pub uv_rect_handle: GpuCacheHandle, pub uv_rect: DeviceIntRect, pub user_data: [f32; 4], } @@ -92,7 +93,7 @@ impl CacheItem { pub fn invalid() -> Self { CacheItem { texture_id: TextureSource::Invalid, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), uv_rect: DeviceIntRect::zero(), user_data: [0.0; 4], } @@ -631,16 +632,18 @@ impl ResourceCache { key: Option<RenderTaskCacheKey>, is_opaque: bool, parent: RenderTaskParent, + gpu_cache: &mut GpuCache, gpu_buffer_builder: &mut GpuBufferBuilderF, rg_builder: &mut RenderTaskGraphBuilder, surface_builder: &mut SurfaceBuilder, - f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId, + f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId, ) -> RenderTaskId { self.cached_render_tasks.request_render_task( key.clone(), &mut self.texture_cache, is_opaque, parent, + gpu_cache, gpu_buffer_builder, rg_builder, surface_builder, @@ -654,12 +657,13 @@ impl ResourceCache { size: DeviceIntSize, rg_builder: &mut RenderTaskGraphBuilder, gpu_buffer_builder: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, is_opaque: bool, adjustment: &AdjustedImageSource, - f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId, + f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId, ) -> RenderTaskId { - let task_id = f(rg_builder, gpu_buffer_builder); + let task_id = f(rg_builder, gpu_buffer_builder, gpu_cache); let render_task = rg_builder.get_task_mut(task_id); @@ -723,7 +727,7 @@ impl ResourceCache { None, user_data, DirtyRect::All, - gpu_buffer_builder, + gpu_cache, None, render_task.uv_rect_kind(), Eviction::Manual, @@ -1097,7 +1101,7 @@ impl ResourceCache { pub fn request_image( &mut self, mut request: ImageRequest, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, ) -> DeviceIntSize { debug_assert_eq!(self.state, State::AddResources); @@ -1198,7 +1202,7 @@ impl ResourceCache { ImageResult::Err(_) => panic!("Errors should already have been handled"), }; - let needs_upload = self.texture_cache.request(&entry.texture_cache_handle, gpu_buffer); + let needs_upload = self.texture_cache.request(&entry.texture_cache_handle, gpu_cache); if !needs_upload && entry.dirty_rect.is_empty() { return size; @@ -1272,7 +1276,7 @@ impl ResourceCache { &mut self, mut font: FontInstance, glyph_keys: &[GlyphKey], - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, ) { debug_assert_eq!(self.state, State::AddResources); @@ -1286,7 +1290,8 @@ impl ResourceCache { if let Some(entry) = glyph_key_cache.try_get(key) { match entry { GlyphCacheEntry::Cached(ref glyph) => { - if !texture_cache.request(&glyph.texture_cache_handle, gpu_buffer) { + // Skip the glyph if it is already has a valid texture cache handle. + if !texture_cache.request(&glyph.texture_cache_handle, gpu_cache) { return false; } // This case gets hit when we already rasterized the glyph, but the @@ -1317,6 +1322,7 @@ impl ResourceCache { mut font: FontInstance, glyph_keys: &[GlyphKey], fetch_buffer: &mut Vec<GlyphFetchResult>, + gpu_cache: &mut GpuCache, mut f: F, ) where F: FnMut(TextureSource, GlyphFormat, &[GlyphFetchResult]), @@ -1348,7 +1354,7 @@ impl ResourceCache { } fetch_buffer.push(GlyphFetchResult { index_in_text_run: loop_index as i32, - uv_rect_address: cache_item.uv_rect_handle, + uv_rect_address: gpu_cache.get_address(&cache_item.uv_rect_handle), offset: DevicePoint::new(cache_item.user_data[0], cache_item.user_data[1]), size: cache_item.uv_rect.size(), scale: cache_item.user_data[2], @@ -1463,7 +1469,7 @@ impl ResourceCache { }) } - pub fn begin_frame(&mut self, stamp: FrameStamp, profile: &mut TransactionProfile) { + pub fn begin_frame(&mut self, stamp: FrameStamp, gpu_cache: &mut GpuCache, profile: &mut TransactionProfile) { profile_scope!("begin_frame"); debug_assert_eq!(self.state, State::Idle); self.state = State::AddResources; @@ -1484,12 +1490,12 @@ impl ResourceCache { v.clear(); self.deleted_blob_keys.push_back(v); - self.texture_cache.run_compaction(); + self.texture_cache.run_compaction(gpu_cache); } pub fn block_until_all_resources_added( &mut self, - gpu_buffer: &mut GpuBufferBuilder, + gpu_cache: &mut GpuCache, profile: &mut TransactionProfile, ) { profile_scope!("block_until_all_resources_added"); @@ -1511,7 +1517,7 @@ impl ResourceCache { } Ok(glyph) => { let mut texture_cache_handle = TextureCacheHandle::invalid(); - texture_cache.request(&texture_cache_handle, &mut gpu_buffer.f32); + texture_cache.request(&texture_cache_handle, gpu_cache); texture_cache.update( &mut texture_cache_handle, ImageDescriptor { @@ -1525,7 +1531,7 @@ impl ResourceCache { Some(CachedImageData::Raw(Arc::new(glyph.bytes))), [glyph.left, -glyph.top, glyph.scale, 0.0], DirtyRect::All, - &mut gpu_buffer.f32, + gpu_cache, Some(glyph_key_cache.eviction_notice()), UvRectKind::Rect, Eviction::Auto, @@ -1544,10 +1550,10 @@ impl ResourceCache { ); // Apply any updates of new / updated images (incl. blobs) to the texture cache. - self.update_texture_cache(gpu_buffer); + self.update_texture_cache(gpu_cache); } - fn update_texture_cache(&mut self, gpu_buffer: &mut GpuBufferBuilder) { + fn update_texture_cache(&mut self, gpu_cache: &mut GpuCache) { profile_scope!("update_texture_cache"); if self.fallback_handle == TextureCacheHandle::invalid() { @@ -1569,7 +1575,7 @@ impl ResourceCache { Some(CachedImageData::Raw(Arc::new(fallback_color))), [0.0; 4], DirtyRect::All, - &mut gpu_buffer.f32, + gpu_cache, None, UvRectKind::Rect, Eviction::Manual, @@ -1687,7 +1693,7 @@ impl ResourceCache { Some(image_data), [0.0; 4], dirty_rect, - &mut gpu_buffer.f32, + gpu_cache, None, UvRectKind::Rect, eviction, diff --git a/gfx/wr/webrender/src/texture_cache.rs b/gfx/wr/webrender/src/texture_cache.rs @@ -9,6 +9,7 @@ use api::units::*; use api::{DocumentId, IdNamespace}; use crate::device::{TextureFilter, TextureFormatPair}; use crate::freelist::{FreeList, FreeListHandle, WeakFreeListHandle}; +use crate::gpu_cache::{GpuCache, GpuCacheHandle}; use crate::gpu_types::{ImageSource, UvRectKind}; use crate::internal_types::{ CacheTextureId, Swizzle, SwizzleSettings, FrameStamp, FrameId, @@ -17,7 +18,6 @@ use crate::internal_types::{ }; use crate::lru_cache::LRUCache; use crate::profiler::{self, TransactionProfile}; -use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF}; use crate::resource_cache::{CacheItem, CachedImageData}; use crate::texture_pack::{ AllocatorList, AllocId, AtlasAllocatorList, ShelfAllocator, ShelfAllocatorOptions, @@ -102,8 +102,8 @@ pub struct CacheEntry { // in the glyph cache eviction code. We could probably remove it // entirely in future (or move to PictureCacheEntry). pub last_access: FrameStamp, - /// Address of the resource rect in the GPU cache. - pub uv_rect_handle: GpuBufferAddress, + /// Handle to the resource rect in the GPU cache. + pub uv_rect_handle: GpuCacheHandle, /// Image format of the data that the entry expects. pub input_format: ImageFormat, pub filter: TextureFilter, @@ -143,7 +143,7 @@ impl CacheEntry { input_format: params.descriptor.format, filter: params.filter, swizzle, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), eviction_notice: None, uv_rect_kind: params.uv_rect_kind, shader: TargetShader::Default, @@ -154,15 +154,17 @@ impl CacheEntry { // This ensures that the UV rect, and texture layer index // are up to date in the GPU cache for vertex shaders // to fetch from. - fn write_gpu_blocks(&mut self, gpu_buffer: &mut GpuBufferBuilderF) { - let origin = self.details.describe(); - let image_source = ImageSource { - p0: origin.to_f32(), - p1: (origin + self.size).to_f32(), - user_data: self.user_data, - uv_rect_kind: self.uv_rect_kind, - }; - self.uv_rect_handle = image_source.write_gpu_blocks(gpu_buffer); + fn update_gpu_cache(&mut self, gpu_cache: &mut GpuCache) { + if let Some(mut request) = gpu_cache.request(&mut self.uv_rect_handle) { + let origin = self.details.describe(); + let image_source = ImageSource { + p0: origin.to_f32(), + p1: (origin + self.size).to_f32(), + user_data: self.user_data, + uv_rect_kind: self.uv_rect_kind, + }; + image_source.write_gpu_blocks(&mut request); + } } fn evict(&self) { @@ -550,9 +552,11 @@ impl TextureCacheConfig { /// frame in which they are requested, and may be evicted. The API supports /// querying whether an entry is still available. /// -/// The texture cache can be visualized, which is a good way to understand how -/// it works. Enabling gfx.webrender.debug.texture-cache shows a live view of -/// its contents in Firefox. +/// The TextureCache is different from the GpuCache in that the former stores +/// images, whereas the latter stores data and parameters for use in the shaders. +/// This means that the texture cache can be visualized, which is a good way to +/// understand how it works. Enabling gfx.webrender.debug.texture-cache shows a +/// live view of its contents in Firefox. #[cfg_attr(feature = "capture", derive(Serialize))] #[cfg_attr(feature = "replay", derive(Deserialize))] pub struct TextureCache { @@ -752,7 +756,7 @@ impl TextureCache { self.now = FrameStamp::INVALID; } - pub fn run_compaction(&mut self) { + pub fn run_compaction(&mut self, gpu_cache: &mut GpuCache) { // Use the same order as BudgetType::VALUES so that we can index self.bytes_allocated // with the same index. let allocator_lists = [ @@ -803,7 +807,8 @@ impl TextureCache { allocated_size_in_bytes: new_bytes, }; - entry.uv_rect_handle = GpuBufferAddress::INVALID; + gpu_cache.invalidate(&entry.uv_rect_handle); + entry.uv_rect_handle = GpuCacheHandle::new(); let src_rect = DeviceIntRect::from_origin_and_size(change.old_rect.min, entry.size); let dst_rect = DeviceIntRect::from_origin_and_size(change.new_rect.min, entry.size); @@ -832,7 +837,7 @@ impl TextureCache { // Returns true if the image needs to be uploaded to the // texture cache (either never uploaded, or has been // evicted on a previous frame). - pub fn request(&mut self, handle: &TextureCacheHandle, gpu_buffer: &mut GpuBufferBuilderF) -> bool { + pub fn request(&mut self, handle: &TextureCacheHandle, gpu_cache: &mut GpuCache) -> bool { let now = self.now; let entry = match handle { TextureCacheHandle::Empty => None, @@ -847,9 +852,9 @@ impl TextureCache { }; entry.map_or(true, |entry| { // If an image is requested that is already in the cache, - // refresh the GPU buffer data associated with this item. + // refresh the GPU cache data associated with this item. entry.last_access = now; - entry.write_gpu_blocks(gpu_buffer); + entry.update_gpu_cache(gpu_cache); false }) } @@ -908,7 +913,7 @@ impl TextureCache { data: Option<CachedImageData>, user_data: [f32; 4], mut dirty_rect: ImageDirtyRect, - gpu_buffer: &mut GpuBufferBuilderF, + gpu_cache: &mut GpuCache, eviction_notice: Option<&EvictionNotice>, uv_rect_kind: UvRectKind, eviction: Eviction, @@ -948,8 +953,14 @@ impl TextureCache { entry.eviction_notice = eviction_notice.cloned(); entry.uv_rect_kind = uv_rect_kind; + // Invalidate the contents of the resource rect in the GPU cache. + // This ensures that the update_gpu_cache below will add + // the new information to the GPU cache. + //TODO: only invalidate if the parameters change? + gpu_cache.invalidate(&entry.uv_rect_handle); + // Upload the resource rect and texture array layer. - entry.write_gpu_blocks(gpu_buffer); + entry.update_gpu_cache(gpu_cache); // Create an update command, which the render thread processes // to upload the new image data into the correct location @@ -1022,7 +1033,7 @@ impl TextureCache { pub fn try_get_cache_location( &self, handle: &TextureCacheHandle, - ) -> Option<(CacheTextureId, DeviceIntRect, Swizzle, GpuBufferAddress, [f32; 4])> { + ) -> Option<(CacheTextureId, DeviceIntRect, Swizzle, GpuCacheHandle, [f32; 4])> { let entry = self.get_entry_opt(handle)?; let origin = entry.details.describe(); Some(( @@ -1042,7 +1053,7 @@ impl TextureCache { pub fn get_cache_location( &self, handle: &TextureCacheHandle, - ) -> (CacheTextureId, DeviceIntRect, Swizzle, GpuBufferAddress, [f32; 4]) { + ) -> (CacheTextureId, DeviceIntRect, Swizzle, GpuCacheHandle, [f32; 4]) { self.try_get_cache_location(handle).expect("BUG: was dropped from cache or not updated!") } @@ -1349,7 +1360,7 @@ impl TextureCache { alloc_id, allocated_size_in_bytes, }, - uv_rect_handle: GpuBufferAddress::INVALID, + uv_rect_handle: GpuCacheHandle::new(), input_format: params.descriptor.format, filter: params.filter, swizzle, @@ -1647,8 +1658,6 @@ impl TextureCacheUpdate { #[cfg(test)] mod test_texture_cache { - use crate::renderer::GpuBufferBuilderF; - #[test] fn check_allocation_size_balance() { // Allocate some glyphs, observe the total allocation size, and free @@ -1656,15 +1665,14 @@ mod test_texture_cache { // original value. use crate::texture_cache::{TextureCache, TextureCacheHandle, Eviction, TargetShader}; + use crate::gpu_cache::GpuCache; use crate::device::TextureFilter; use crate::gpu_types::UvRectKind; - use crate::frame_allocator::FrameMemory; use api::{ImageDescriptor, ImageDescriptorFlags, ImageFormat, DirtyRect}; use api::units::*; use euclid::size2; let mut texture_cache = TextureCache::new_for_testing(2048, ImageFormat::BGRA8); - let memory = FrameMemory::fallback(); - let mut gpu_buffer = GpuBufferBuilderF::new(&memory); + let mut gpu_cache = GpuCache::new_for_testing(); let sizes: &[DeviceIntSize] = &[ size2(23, 27), @@ -1685,7 +1693,7 @@ mod test_texture_cache { let handles: Vec<TextureCacheHandle> = sizes.iter().map(|size| { let mut texture_cache_handle = TextureCacheHandle::invalid(); - texture_cache.request(&texture_cache_handle, &mut gpu_buffer); + texture_cache.request(&texture_cache_handle, &mut gpu_cache); texture_cache.update( &mut texture_cache_handle, ImageDescriptor { @@ -1699,7 +1707,7 @@ mod test_texture_cache { None, [0.0; 4], DirtyRect::All, - &mut gpu_buffer, + &mut gpu_cache, None, UvRectKind::Rect, Eviction::Manual, diff --git a/gfx/wr/webrender/src/visibility.rs b/gfx/wr/webrender/src/visibility.rs @@ -13,10 +13,10 @@ use std::usize; use crate::clip::ClipStore; use crate::composite::CompositeState; use crate::profiler::TransactionProfile; -use crate::renderer::GpuBufferBuilder; use crate::spatial_tree::{SpatialTree, SpatialNodeIndex}; use crate::clip::{ClipChainInstance, ClipTree}; use crate::frame_builder::FrameBuilderConfig; +use crate::gpu_cache::GpuCache; use crate::picture::{PictureCompositeMode, ClusterFlags, SurfaceInfo, TileCacheInstance}; use crate::picture::{SurfaceIndex, RasterConfig, SubSliceIndex}; use crate::prim_store::{ClipTaskIndex, PictureIndex, PrimitiveInstanceKind}; @@ -41,7 +41,7 @@ pub struct FrameVisibilityContext<'a> { pub struct FrameVisibilityState<'a> { pub clip_store: &'a mut ClipStore, pub resource_cache: &'a mut ResourceCache, - pub frame_gpu_data: &'a mut GpuBufferBuilder, + pub gpu_cache: &'a mut GpuCache, pub data_stores: &'a mut DataStores, pub clip_tree: &'a mut ClipTree, pub composite_state: &'a mut CompositeState, @@ -321,7 +321,7 @@ pub fn update_prim_visibility( &map_local_to_picture, &map_surface_to_vis, &frame_context.spatial_tree, - &mut frame_state.frame_gpu_data.f32, + frame_state.gpu_cache, frame_state.resource_cache, device_pixel_scale, &surface_culling_rect, @@ -363,7 +363,7 @@ pub fn update_prim_visibility( &store.color_bindings, &frame_state.surface_stack, &mut frame_state.composite_state, - &mut frame_state.frame_gpu_data.f32, + &mut frame_state.gpu_cache, &mut frame_state.scratch.primitive, is_root_tile_cache, frame_state.surfaces, diff --git a/gfx/wr/webrender_api/src/units.rs b/gfx/wr/webrender_api/src/units.rs @@ -191,15 +191,6 @@ impl TexelRect { uv1: DevicePoint::new(-1.0, -1.0), } } - - pub fn to_array(&self) -> [f32; 4] { - [ - self.uv0.x, - self.uv0.y, - self.uv1.x, - self.uv1.y, - ] - } } impl Into<TexelRect> for DeviceIntRect { diff --git a/gfx/wr/webrender_build/src/shader.rs b/gfx/wr/webrender_build/src/shader.rs @@ -195,8 +195,8 @@ pub fn build_shader_prefix_string<F: FnMut(&str)>( // detect which platform we're targeting let is_macos = match std::env::var("CARGO_CFG_TARGET_OS") { Ok(os) => os == "macos", - // if this is not called from build.rs (e.g. if the optimized shader - // pref is disabled) we want to use the runtime value + // if this is not called from build.rs (e.g. the gpu_cache_update shader or + // if the optimized shader pref is disabled) we want to use the runtime value Err(_) => cfg!(target_os = "macos"), }; let is_android = match std::env::var("CARGO_CFG_TARGET_OS") { diff --git a/gfx/wr/wrench/src/main.rs b/gfx/wr/wrench/src/main.rs @@ -787,6 +787,7 @@ pub fn main() { } else if let Some(subargs) = args.subcommand_matches("png") { let surface = match subargs.value_of("surface") { Some("screen") | None => png::ReadSurface::Screen, + Some("gpu-cache") => png::ReadSurface::GpuCache, _ => panic!("Unknown surface argument value") }; let output_path = subargs.value_of("OUTPUT").map(PathBuf::from); diff --git a/gfx/wr/wrench/src/png.rs b/gfx/wr/wrench/src/png.rs @@ -14,6 +14,7 @@ use crate::yaml_frame_reader::YamlFrameReader; pub enum ReadSurface { Screen, + GpuCache, } pub struct SaveSettings { @@ -97,6 +98,14 @@ pub fn png( try_crop: true, }) } + ReadSurface::GpuCache => { + let (size, data) = wrench.renderer + .read_gpu_cache(); + (size, data, SaveSettings { + flip_vertical: false, + try_crop: false, + }) + } }; let out_path = out_path.unwrap_or_else(|| {