[ tor-browser ].git.dasho

commit 88ab8c9600f2ad4f1fa40fadffd83aa675f360d7
parent fda063630b13dfe14a41394a2eee3c22d66bfde8
Author: Serban Stanca <sstanca@mozilla.com>
Date:   Wed, 12 Nov 2025 19:10:49 +0200

Revert "Bug 1996818 - Store gpu addresses as a u32. r=gw" for causing mochitests-plain failures in test_ext_web_accessible_resources.html

This reverts commit f04bdbe632dd6c596bce383ddc76e2b26ef0c3f9.

This reverts commit c4c1d2a01faeae04eabd837aa393dc621c46304a.

This reverts commit de3703a4ecf74306332c88cea762c265dda69b0e.

This reverts commit 5e29586453931121d577a3c3fe7f80e8dbc5a304.

This reverts commit 0f3a303c01170f22166784f30d2e1d135942296d.

Diffstat:
M gfx/layers/ipc/CompositorBridgeParent.cpp  | 5 +++++
M gfx/layers/ipc/PCompositorBridge.ipdl  | 1 +
M gfx/layers/wr/WebRenderBridgeParent.cpp  | 1 +
M gfx/layers/wr/WebRenderMessageUtils.h  | 3 +++
M gfx/thebes/gfxPlatform.cpp  | 9 +++++++--
M gfx/wr/webrender/res/blend.glsl  | 14 ++++++--------
M gfx/wr/webrender/res/brush.glsl  | 10 +++++-----
M gfx/wr/webrender/res/brush_blend.glsl  | 2 +-
M gfx/wr/webrender/res/brush_image.glsl  | 4 ++--
M gfx/wr/webrender/res/brush_linear_gradient.glsl  | 2 +-
M gfx/wr/webrender/res/brush_mix_blend.glsl  | 2 +-
M gfx/wr/webrender/res/brush_solid.glsl  | 2 +-
M gfx/wr/webrender/res/brush_yuv_image.glsl  | 4 ++--
M gfx/wr/webrender/res/clip_shared.glsl  | 2 +-
M gfx/wr/webrender/res/cs_clip_box_shadow.glsl  | 8 ++++----
M gfx/wr/webrender/res/cs_conic_gradient.glsl  | 2 +-
M gfx/wr/webrender/res/cs_linear_gradient.glsl  | 2 +-
M gfx/wr/webrender/res/cs_radial_gradient.glsl  | 2 +-
M gfx/wr/webrender/res/cs_svg_filter.glsl  | 30 +++++++++++++-----------------
M gfx/wr/webrender/res/cs_svg_filter_node.glsl  | 34 +++++++++++++++-------------------
M gfx/wr/webrender/res/gpu_buffer.glsl  | 46 +++++++++++++---------------------------------
A gfx/wr/webrender/res/gpu_cache.glsl  | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M gfx/wr/webrender/res/gradient.glsl  | 2 +-
D gfx/wr/webrender/res/image_source.glsl  | 51 ---------------------------------------------------
M gfx/wr/webrender/res/prim_shared.glsl  | 2 +-
M gfx/wr/webrender/res/ps_quad.glsl  | 4 ++--
M gfx/wr/webrender/res/ps_split_composite.glsl  | 15 +++++++++------
M gfx/wr/webrender/res/ps_text_run.glsl  | 8 ++++----
M gfx/wr/webrender/src/batch.rs  | 141 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M gfx/wr/webrender/src/clip.rs  | 12 ++++++------
M gfx/wr/webrender/src/command_buffer.rs  | 27 ++++++++++++++++++---------
M gfx/wr/webrender/src/composite.rs  | 20 ++++++++++----------
M gfx/wr/webrender/src/filterdata.rs  | 35 ++++++++++++++++++-----------------
M gfx/wr/webrender/src/frame_builder.rs  | 93 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
A gfx/wr/webrender/src/gpu_cache.rs  | 945 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M gfx/wr/webrender/src/gpu_types.rs  | 47 +++++++++++++++++++----------------------------
M gfx/wr/webrender/src/image_source.rs  | 10 +++++-----
M gfx/wr/webrender/src/internal_types.rs  | 2 ++
M gfx/wr/webrender/src/lib.rs  | 1 +
M gfx/wr/webrender/src/picture.rs  | 171 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M gfx/wr/webrender/src/picture_textures.rs  | 37 +++++++++++++++++++------------------
M gfx/wr/webrender/src/prepare.rs  | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
M gfx/wr/webrender/src/prim_store/borders.rs  | 62 +++++++++++++++++++++++++++++++++++---------------------------
M gfx/wr/webrender/src/prim_store/gradient/conic.rs  | 41 +++++++++++++++++++++++------------------
M gfx/wr/webrender/src/prim_store/gradient/linear.rs  | 81 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M gfx/wr/webrender/src/prim_store/gradient/mod.rs  | 6 +++---
M gfx/wr/webrender/src/prim_store/gradient/radial.rs  | 42 +++++++++++++++++++++++-------------------
M gfx/wr/webrender/src/prim_store/image.rs  | 45 +++++++++++++++++++++++----------------------
M gfx/wr/webrender/src/prim_store/line_dec.rs  | 20 ++++++++++----------
M gfx/wr/webrender/src/prim_store/mod.rs  | 35 +++++++++++++++++------------------
M gfx/wr/webrender/src/prim_store/text_run.rs  | 55 ++++++++++++++++++++++++++++---------------------------
M gfx/wr/webrender/src/profiler.rs  | 379 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M gfx/wr/webrender/src/quad.rs  | 4 ++--
M gfx/wr/webrender/src/render_api.rs  | 5 +++++
M gfx/wr/webrender/src/render_backend.rs  | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M gfx/wr/webrender/src/render_target.rs  | 19 +++++++++++++------
M gfx/wr/webrender/src/render_task.rs  | 176 +++++++++++++++++++++++++++++++++++++++++++------------------------------------
M gfx/wr/webrender/src/render_task_cache.rs  | 20 ++++++++++++--------
M gfx/wr/webrender/src/render_task_graph.rs  | 30 +++++++++++++++---------------
M gfx/wr/webrender/src/renderer/gpu_buffer.rs  | 81 ++++++++++++++++++++++++++-----------------------------------------------------
A gfx/wr/webrender/src/renderer/gpu_cache.rs  | 541 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M gfx/wr/webrender/src/renderer/init.rs  | 28 ++++++++++++++++++++++++++--
M gfx/wr/webrender/src/renderer/mod.rs  | 215 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M gfx/wr/webrender/src/renderer/shade.rs  | 2 ++
M gfx/wr/webrender/src/renderer/vertex.rs  | 28 ++++++++++++++++++++++------
M gfx/wr/webrender/src/resource_cache.rs  | 50 ++++++++++++++++++++++++++++----------------------
M gfx/wr/webrender/src/texture_cache.rs  | 74 +++++++++++++++++++++++++++++++++++++++++---------------------------------
M gfx/wr/webrender/src/visibility.rs  | 8 ++++----
M gfx/wr/webrender_api/src/units.rs  | 9 ---------
M gfx/wr/webrender_build/src/shader.rs  | 4 ++--
M gfx/wr/wrench/src/main.rs  | 1 +
M gfx/wr/wrench/src/png.rs  | 9 +++++++++

72 files changed, 3116 insertions(+), 1058 deletions(-)
diff --git a/gfx/layers/ipc/CompositorBridgeParent.cpp b/gfx/layers/ipc/CompositorBridgeParent.cpp
@@ -1860,6 +1860,11 @@ int32_t RecordContentFrameTime(
           .AccumulateSingleSample(
               static_cast<unsigned long long>(fracLatencyNorm));
 
+      if (aStats) {
+        latencyMs -= (double(aStats->gpu_cache_upload_time) / 1000000.0);
+        latencyNorm = latencyMs / aVsyncRate.ToMilliseconds();
+        fracLatencyNorm = lround(latencyNorm * 100.0);
+      }
       mozilla::glean::gfx_content_frame_time::without_resource_upload
           .AccumulateSingleSample(
               static_cast<unsigned long long>(fracLatencyNorm));
diff --git a/gfx/layers/ipc/PCompositorBridge.ipdl b/gfx/layers/ipc/PCompositorBridge.ipdl
@@ -52,6 +52,7 @@ struct FrameStats {
   TimeStamp compositeEnd;
   int32_t contentFrameTime;
   double resourceUploadTime;
+  double gpuCacheUploadTime;
   TimeStamp transactionStart;
   TimeStamp refreshStart;
   TimeStamp fwdTime;
diff --git a/gfx/layers/wr/WebRenderBridgeParent.cpp b/gfx/layers/wr/WebRenderBridgeParent.cpp
@@ -2671,6 +2671,7 @@ void WebRenderBridgeParent::FlushTransactionIdsForEpoch(
             transactionId.mId, aCompositeStartTime, aRenderStartTime, aEndTime,
             contentFrameTime,
             aStats ? (double(aStats->resource_upload_time) / 1000000.0) : 0.0,
+            aStats ? (double(aStats->gpu_cache_upload_time) / 1000000.0) : 0.0,
             transactionId.mTxnStartTime, transactionId.mRefreshStartTime,
             transactionId.mFwdTime, transactionId.mSceneBuiltTime,
             transactionId.mSkippedComposites, transactionId.mTxnURL));
diff --git a/gfx/layers/wr/WebRenderMessageUtils.h b/gfx/layers/wr/WebRenderMessageUtils.h
@@ -296,6 +296,8 @@ inline auto TiedFields<mozilla::wr::MemoryReport>(
   // clang-format off
   return std::tie(
     a.clip_stores,
+    a.gpu_cache_metadata,
+    a.gpu_cache_cpu_mirror,
     a.hit_testers,
     a.fonts,
     a.weak_fonts,
@@ -308,6 +310,7 @@ inline auto TiedFields<mozilla::wr::MemoryReport>(
     a.swgl,
     a.frame_allocator,
     a.render_tasks,
+    a.gpu_cache_textures,
     a.vertex_data_textures,
     a.render_target_textures,
     a.picture_tile_textures,
diff --git a/gfx/thebes/gfxPlatform.cpp b/gfx/thebes/gfxPlatform.cpp
@@ -537,6 +537,7 @@ static void WebRenderDebugPrefChangeCallback(const char* aPrefName, void*) {
   GFX_WEBRENDER_DEBUG(".echo-driver-messages",
                       wr::DebugFlags::ECHO_DRIVER_MESSAGES)
   GFX_WEBRENDER_DEBUG(".show-overdraw", wr::DebugFlags::SHOW_OVERDRAW)
+  GFX_WEBRENDER_DEBUG(".gpu-cache", wr::DebugFlags::GPU_CACHE_DBG)
   GFX_WEBRENDER_DEBUG(".texture-cache.clear-evicted",
                       wr::DebugFlags::TEXTURE_CACHE_DBG_CLEAR_EVICTED)
   GFX_WEBRENDER_DEBUG(".picture-caching", wr::DebugFlags::PICTURE_CACHING_DBG)
@@ -729,6 +730,8 @@ WebRenderMemoryReporter::CollectReports(nsIHandleReportCallback* aHandleReport,
       [=](wr::MemoryReport aReport) {
         // CPU Memory.
         helper.Report(aReport.clip_stores, "clip-stores");
+        helper.Report(aReport.gpu_cache_metadata, "gpu-cache/metadata");
+        helper.Report(aReport.gpu_cache_cpu_mirror, "gpu-cache/cpu-mirror");
         helper.Report(aReport.hit_testers, "hit-testers");
         helper.Report(aReport.fonts, "resource-cache/fonts");
         helper.Report(aReport.weak_fonts, "resource-cache/weak-fonts");
@@ -748,6 +751,7 @@ WebRenderMemoryReporter::CollectReports(nsIHandleReportCallback* aHandleReport,
         WEBRENDER_FOR_EACH_INTERNER(REPORT_DATA_STORE, );
 
         // GPU Memory.
+        helper.ReportTexture(aReport.gpu_cache_textures, "gpu-cache");
         helper.ReportTexture(aReport.vertex_data_textures, "vertex-data");
         helper.ReportTexture(aReport.render_target_textures, "render-targets");
         helper.ReportTexture(aReport.depth_target_textures, "depth-targets");
@@ -3644,7 +3648,8 @@ void gfxPlatform::GetFrameStats(mozilla::widget::InfoObject& aObj) {
         "Frame %" PRIu64
         "(%s) CONTENT_FRAME_TIME %d - Transaction start %f, main-thread time "
         "%f, full paint time %f, Skipped composites %u, Composite start %f, "
-        "Resource upload time %f, Render time %f, Composite time %f",
+        "Resource upload time %f, GPU cache upload time %f, Render time %f, "
+        "Composite time %f",
         f.id().mId, f.url().get(), f.contentFrameTime(),
         (f.transactionStart() - f.refreshStart()).ToMilliseconds(),
         (f.fwdTime() - f.transactionStart()).ToMilliseconds(),
@@ -3653,7 +3658,7 @@ void gfxPlatform::GetFrameStats(mozilla::widget::InfoObject& aObj) {
             : 0.0,
         f.skippedComposites(),
         (f.compositeStart() - f.refreshStart()).ToMilliseconds(),
-        f.resourceUploadTime(),
+        f.resourceUploadTime(), f.gpuCacheUploadTime(),
         (f.compositeEnd() - f.renderStart()).ToMilliseconds(),
         (f.compositeEnd() - f.compositeStart()).ToMilliseconds());
     aObj.DefineProperty(name.get(), value.get());
diff --git a/gfx/wr/webrender/res/blend.glsl b/gfx/wr/webrender/res/blend.glsl
@@ -2,8 +2,6 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include gpu_buffer
-
 #define COMPONENT_TRANSFER_IDENTITY 0
 #define COMPONENT_TRANSFER_TABLE 1
 #define COMPONENT_TRANSFER_DISCRETE 2
@@ -77,14 +75,14 @@ void SetupFilterParams(
         );
         color_offset = vec4(0.0);
     } else if (op == FILTER_COLOR_MATRIX) {
-        vec4 mat_data[4] = fetch_from_gpu_buffer_4f(gpu_data_address);
-        vec4 offset_data = fetch_from_gpu_buffer_1f(gpu_data_address + 4);
+        vec4 mat_data[4] = fetch_from_gpu_cache_4(gpu_data_address);
+        vec4 offset_data = fetch_from_gpu_cache_1(gpu_data_address + 4);
         color_mat = mat4(mat_data[0], mat_data[1], mat_data[2], mat_data[3]);
         color_offset = offset_data;
     } else if (op == FILTER_COMPONENT_TRANSFER) {
         table_address = gpu_data_address;
     } else if (op == FILTER_FLOOD) {
-        color_offset = fetch_from_gpu_buffer_1f(gpu_data_address);
+        color_offset = fetch_from_gpu_cache_1(gpu_data_address);
     }
 }
 #endif
@@ -156,7 +154,7 @@ vec4 ComponentTransfer(vec4 colora, vec4 vfuncs, highp int table_address) {
             case COMPONENT_TRANSFER_DISCRETE: {
                 // fetch value from lookup table
                 k = int(floor(colora[i]*255.0 + 0.5));
-                texel = fetch_from_gpu_buffer_1f(table_address + offset + k/4);
+                texel = fetch_from_gpu_cache_1(table_address + offset + k/4);
                 colora[i] = clamp(texel[k % 4], 0.0, 1.0);
                 // offset plus 256/4 blocks
                 offset = offset + 64;
@@ -164,7 +162,7 @@ vec4 ComponentTransfer(vec4 colora, vec4 vfuncs, highp int table_address) {
             }
             case COMPONENT_TRANSFER_LINEAR: {
                 // fetch the two values for use in the linear equation
-                texel = fetch_from_gpu_buffer_1f(table_address + offset);
+                texel = fetch_from_gpu_cache_1(table_address + offset);
                 colora[i] = clamp(texel[0] * colora[i] + texel[1], 0.0, 1.0);
                 // offset plus 1 block
                 offset = offset + 1;
@@ -172,7 +170,7 @@ vec4 ComponentTransfer(vec4 colora, vec4 vfuncs, highp int table_address) {
             }
             case COMPONENT_TRANSFER_GAMMA: {
                 // fetch the three values for use in the gamma equation
-                texel = fetch_from_gpu_buffer_1f(table_address + offset);
+                texel = fetch_from_gpu_cache_1(table_address + offset);
                 colora[i] = clamp(texel[0] * pow(colora[i], texel[1]) + texel[2], 0.0, 1.0);
                 // offset plus 1 block
                 offset = offset + 1;
diff --git a/gfx/wr/webrender/res/brush.glsl b/gfx/wr/webrender/res/brush.glsl
@@ -24,15 +24,15 @@
 /// | z: flags                   |    |   |   |    local_clip_rect  |  +-----------------------+ | |
 /// |    segment_index           |    |   |   +---------------------+                            | |
 /// | w: resource_address       +--+  |   |                                                      | |
-/// +----------------------------+ |  |   |                             (float gpu buffer)       | |
-///                                |  |   |   (float gpu buffer)         +------------+          | |
+/// +----------------------------+ |  |   |                                 (sGpuCache)          | |
+///                                |  |   |         (sGpuCache)          +------------+          | |
 ///                                |  |   |   +---------------+          | Transform  | <--------+ |
-///           (float gpu buffer)   |  |   +-> | Picture task  |          +------------+            |
+///                (sGpuCache)     |  |   +-> | Picture task  |          +------------+            |
 ///            +-------------+     |  |       |               |                                    |
 ///            |  Resource   | <---+  |       |         ...   |                                    |
 ///            |             |        |       +---------------+   +--------------------------------+
 ///            |             |        |                           |
-///            +-------------+        |       (float gpu buffer)  v                (float gpu buffer)
+///            +-------------+        |             (sGpuCache)   v                        (sGpuCache)
 ///                                   |       +---------------+  +--------------+---------------+-+-+
 ///                                   +-----> | Clip area     |  | Brush data   |  Segment data | | |
 ///                                           |               |  |              |               | | |
@@ -113,7 +113,7 @@ void brush_shader_main_vs(
                               VECS_PER_SPECIFIC_BRUSH +
                               instance.segment_index * VECS_PER_SEGMENT;
 
-        vec4[2] segment_info = fetch_from_gpu_buffer_2f(segment_address);
+        vec4[2] segment_info = fetch_from_gpu_cache_2(segment_address);
         segment_rect = RectWithEndpoint(segment_info[0].xy, segment_info[0].zw);
         segment_rect.p0 += ph.local_rect.p0;
         segment_rect.p1 += ph.local_rect.p0;
diff --git a/gfx/wr/webrender/res/brush_blend.glsl b/gfx/wr/webrender/res/brush_blend.glsl
@@ -5,7 +5,7 @@
 #define VECS_PER_SPECIFIC_BRUSH 3
 #define WR_FEATURE_TEXTURE_2D
 
-#include shared,prim_shared,brush,blend,image_source
+#include shared,prim_shared,brush,blend
 
 // Interpolated UV coordinates to sample.
 varying highp vec2 v_uv;
diff --git a/gfx/wr/webrender/res/brush_image.glsl b/gfx/wr/webrender/res/brush_image.glsl
@@ -4,7 +4,7 @@
 
 #define VECS_PER_SPECIFIC_BRUSH 3
 
-#include shared,prim_shared,brush,image_source
+#include shared,prim_shared,brush
 
 // Interpolated UV coordinates to sample.
 varying highp vec2 v_uv;
@@ -38,7 +38,7 @@ struct ImageBrushData {
 };
 
 ImageBrushData fetch_image_data(int address) {
-    vec4[3] raw_data = fetch_from_gpu_buffer_3f(address);
+    vec4[3] raw_data = fetch_from_gpu_cache_3(address);
     ImageBrushData data = ImageBrushData(
         raw_data[0],
         raw_data[1],
diff --git a/gfx/wr/webrender/res/brush_linear_gradient.glsl b/gfx/wr/webrender/res/brush_linear_gradient.glsl
@@ -20,7 +20,7 @@ struct Gradient {
 };
 
 Gradient fetch_gradient(int address) {
-    vec4 data[2] = fetch_from_gpu_buffer_2f(address);
+    vec4 data[2] = fetch_from_gpu_cache_2(address);
     return Gradient(
         data[0],
         int(data[1].x),
diff --git a/gfx/wr/webrender/res/brush_mix_blend.glsl b/gfx/wr/webrender/res/brush_mix_blend.glsl
@@ -5,7 +5,7 @@
 #define VECS_PER_SPECIFIC_BRUSH 3
 #define WR_FEATURE_TEXTURE_2D
 
-#include shared,prim_shared,brush,image_source
+#include shared,prim_shared,brush
 
 // UV and bounds for the source image
 varying highp vec2 v_src_uv;
diff --git a/gfx/wr/webrender/res/brush_solid.glsl b/gfx/wr/webrender/res/brush_solid.glsl
@@ -15,7 +15,7 @@ struct SolidBrush {
 };
 
 SolidBrush fetch_solid_primitive(int address) {
-    vec4 data = fetch_from_gpu_buffer_1f(address);
+    vec4 data = fetch_from_gpu_cache_1(address);
     return SolidBrush(data);
 }
 
diff --git a/gfx/wr/webrender/res/brush_yuv_image.glsl b/gfx/wr/webrender/res/brush_yuv_image.glsl
@@ -4,7 +4,7 @@
 
 #define VECS_PER_SPECIFIC_BRUSH 1
 
-#include shared,prim_shared,brush,yuv,image_source
+#include shared,prim_shared,brush,yuv
 
 varying highp vec2 vUv_Y;
 flat varying highp vec4 vUvBounds_Y;
@@ -28,7 +28,7 @@ flat varying mediump int vRescaleFactor;
 #ifdef WR_VERTEX_SHADER
 
 YuvPrimitive fetch_yuv_primitive(int address) {
-    vec4 data = fetch_from_gpu_buffer_1f(address);
+    vec4 data = fetch_from_gpu_cache_1(address);
     // From YuvImageData.write_prim_gpu_blocks:
     int channel_bit_depth = int(data.x);
     int color_space = int(data.y);
diff --git a/gfx/wr/webrender/res/clip_shared.glsl b/gfx/wr/webrender/res/clip_shared.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include rect,render_task,transform
+#include rect,render_task,gpu_cache,transform
 
 #ifdef WR_VERTEX_SHADER
 
diff --git a/gfx/wr/webrender/res/cs_clip_box_shadow.glsl b/gfx/wr/webrender/res/cs_clip_box_shadow.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include shared,clip_shared,image_source
+#include shared,clip_shared
 
 varying highp vec4 vLocalPos;
 varying highp vec2 vUv;
@@ -17,7 +17,7 @@ flat varying mediump vec2 vClipMode;
 
 #ifdef WR_VERTEX_SHADER
 
-PER_INSTANCE in int aClipDataResourceAddress;
+PER_INSTANCE in ivec2 aClipDataResourceAddress;
 PER_INSTANCE in vec2 aClipSrcRectSize;
 PER_INSTANCE in int aClipMode;
 PER_INSTANCE in ivec2 aStretchMode;
@@ -25,7 +25,7 @@ PER_INSTANCE in vec4 aClipDestRect;
 
 struct ClipMaskInstanceBoxShadow {
     ClipMaskInstanceCommon base;
-    int resource_address;
+    ivec2 resource_address;
 };
 
 ClipMaskInstanceBoxShadow fetch_clip_item() {
@@ -61,7 +61,7 @@ void main(void) {
     Transform clip_transform = fetch_transform(cmi.base.clip_transform_id);
     Transform prim_transform = fetch_transform(cmi.base.prim_transform_id);
     BoxShadowData bs_data = fetch_data();
-    ImageSource res = fetch_image_source(cmi.resource_address);
+    ImageSource res = fetch_image_source_direct(cmi.resource_address);
 
     RectWithEndpoint dest_rect = bs_data.dest_rect;
 
diff --git a/gfx/wr/webrender/res/cs_conic_gradient.glsl b/gfx/wr/webrender/res/cs_conic_gradient.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include shared,rect,render_task,gpu_buffer,gradient
+#include shared,rect,render_task,gpu_cache,gpu_buffer,gradient
 
 #define PI                  3.141592653589793
 
diff --git a/gfx/wr/webrender/res/cs_linear_gradient.glsl b/gfx/wr/webrender/res/cs_linear_gradient.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include shared,rect,render_task,gpu_buffer,gradient
+#include shared,rect,render_task,gpu_cache,gpu_buffer,gradient
 
 varying highp vec2 v_pos;
 
diff --git a/gfx/wr/webrender/res/cs_radial_gradient.glsl b/gfx/wr/webrender/res/cs_radial_gradient.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include shared,rect,render_task,gpu_buffer,gradient
+#include shared,rect,render_task,gpu_cache,gpu_buffer,gradient
 
 varying highp vec2 v_pos;
 
diff --git a/gfx/wr/webrender/res/cs_svg_filter.glsl b/gfx/wr/webrender/res/cs_svg_filter.glsl
@@ -4,7 +4,7 @@
 
 #define WR_FEATURE_TEXTURE_2D
 
-#include shared,prim_shared,gpu_buffer
+#include shared,prim_shared
 
 varying highp vec2 vInput1Uv;
 varying highp vec2 vInput2Uv;
@@ -53,7 +53,7 @@ PER_INSTANCE in int aFilterInput2TaskAddress;
 PER_INSTANCE in int aFilterKind;
 PER_INSTANCE in int aFilterInputCount;
 PER_INSTANCE in int aFilterGenericInt;
-PER_INSTANCE in int aFilterExtraDataAddress;
+PER_INSTANCE in ivec2 aFilterExtraDataAddress;
 
 struct FilterTask {
     RectWithEndpoint task_rect;
@@ -126,20 +126,18 @@ void main(void) {
             vData = ivec4(aFilterGenericInt, 0, 0, 0);
             break;
         case FILTER_FLOOD:
-            vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress);
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress);
             break;
         case FILTER_OPACITY:
             vFloat0.x = filter_task.user_data.x;
             break;
-        case FILTER_COLOR_MATRIX: {
-            ivec2 buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress);
-            vec4 mat_data[4] = fetch_from_gpu_buffer_4f_direct(buffer_uv);
+        case FILTER_COLOR_MATRIX:
+            vec4 mat_data[4] = fetch_from_gpu_cache_4_direct(aFilterExtraDataAddress);
             vColorMat = mat4(mat_data[0], mat_data[1], mat_data[2], mat_data[3]);
-            vFilterData0 = fetch_from_gpu_buffer_1f_direct(buffer_uv + ivec2(4, 0));
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress + ivec2(4, 0));
             break;
-        }
         case FILTER_DROP_SHADOW:
-            vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress);
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress);
             break;
         case FILTER_OFFSET:
             vec2 texture_size = vec2(TEX_SIZE(sColor0).xy);
@@ -150,15 +148,13 @@ void main(void) {
             clipRect /= texture_size.xyxy;
             vFilterData1 = clipRect;
             break;
-        case FILTER_COMPONENT_TRANSFER: {
-            ivec2 buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress);
-            vData = ivec4(buffer_uv, 0, 0);
+        case FILTER_COMPONENT_TRANSFER:
+            vData = ivec4(aFilterExtraDataAddress, 0, 0);
             break;
-        }
         case FILTER_COMPOSITE:
             vData = ivec4(aFilterGenericInt, 0, 0, 0);
             if (aFilterGenericInt == COMPOSITE_ARITHMETIC) {
-              vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress);
+              vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress);
             }
             break;
         default:
@@ -447,21 +443,21 @@ vec4 ComponentTransfer(vec4 colora) {
             case COMPONENT_TRANSFER_DISCRETE:
                 // fetch value from lookup table
                 k = int(floor(colora[i]*255.0 + 0.5));
-                texel = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(offset + k/4, 0));
+                texel = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(offset + k/4, 0));
                 colora[i] = clamp(texel[k % 4], 0.0, 1.0);
                 // offset plus 256/4 blocks
                 offset = offset + 64;
                 break;
             case COMPONENT_TRANSFER_LINEAR:
                 // fetch the two values for use in the linear equation
-                texel = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(offset, 0));
+                texel = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(offset, 0));
                 colora[i] = clamp(texel[0] * colora[i] + texel[1], 0.0, 1.0);
                 // offset plus 1 block
                 offset = offset + 1;
                 break;
             case COMPONENT_TRANSFER_GAMMA:
                 // fetch the three values for use in the gamma equation
-                texel = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(offset, 0));
+                texel = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(offset, 0));
                 colora[i] = clamp(texel[0] * pow(colora[i], texel[1]) + texel[2], 0.0, 1.0);
                 // offset plus 1 block
                 offset = offset + 1;
diff --git a/gfx/wr/webrender/res/cs_svg_filter_node.glsl b/gfx/wr/webrender/res/cs_svg_filter_node.glsl
@@ -38,7 +38,7 @@ Notes about specific filter kinds:
 
 #define WR_FEATURE_TEXTURE_2D
 
-#include shared,prim_shared,gpu_buffer
+#include shared,prim_shared
 
 varying highp vec2 vInput1Uv;
 varying highp vec2 vInput2Uv;
@@ -172,7 +172,7 @@ PER_INSTANCE in int aFilterInput1TaskAddress;
 PER_INSTANCE in int aFilterInput2TaskAddress;
 PER_INSTANCE in int aFilterKind;
 PER_INSTANCE in int aFilterInputCount;
-PER_INSTANCE in int aFilterExtraDataAddress;
+PER_INSTANCE in ivec2 aFilterExtraDataAddress;
 
 // used for feFlood and feDropShadow colors
 // this is based on SrgbToLinear below, but that version hits SWGL compile
@@ -270,23 +270,19 @@ void main(void) {
         case FILTER_BLEND_SOFT_LIGHT_CONVERTSRGB:
             break;
         case FILTER_COLOR_MATRIX:
-        case FILTER_COLOR_MATRIX_CONVERTSRGB: {
-            ivec2 gpu_buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress);
-            vec4 mat_data[4] = fetch_from_gpu_buffer_4f_direct(gpu_buffer_uv);
+        case FILTER_COLOR_MATRIX_CONVERTSRGB:
+            vec4 mat_data[4] = fetch_from_gpu_cache_4_direct(aFilterExtraDataAddress);
             vColorMat = mat4(mat_data[0], mat_data[1], mat_data[2], mat_data[3]);
-            vFilterData0 = fetch_from_gpu_buffer_1f_direct(gpu_buffer_uv + ivec2(4, 0));
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress + ivec2(4, 0));
             break;
-        }
         case FILTER_COMPONENT_TRANSFER:
-        case FILTER_COMPONENT_TRANSFER_CONVERTSRGB: {
-            ivec2 gpu_buffer_uv = get_gpu_buffer_uv(aFilterExtraDataAddress);
-            vData = ivec4(gpu_buffer_uv, 0, 0);
+        case FILTER_COMPONENT_TRANSFER_CONVERTSRGB:
+            vData = ivec4(aFilterExtraDataAddress, 0, 0);
             break;
-        }
         case FILTER_COMPOSITE_ARITHMETIC:
         case FILTER_COMPOSITE_ARITHMETIC_CONVERTSRGB:
             // arithmetic parameters
-            vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress);
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress);
             break;
         case FILTER_COMPOSITE_ATOP:
         case FILTER_COMPOSITE_ATOP_CONVERTSRGB:
@@ -330,12 +326,12 @@ void main(void) {
             // TODO
             break;
         case FILTER_DROP_SHADOW:
-            vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress);
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress);
             // premultiply the color
             vFilterData0.rgb = vFilterData0.rgb * vFilterData0.a;
             break;
         case FILTER_DROP_SHADOW_CONVERTSRGB:
-            vFilterData0 = fetch_from_gpu_buffer_1f(aFilterExtraDataAddress);
+            vFilterData0 = fetch_from_gpu_cache_1_direct(aFilterExtraDataAddress);
             // convert from sRGB to linearRGB and premultiply by alpha
             vFilterData0.rgb = vertexSrgbToLinear(vFilterData0.rgb);
             vFilterData0.rgb = vFilterData0.rgb * vFilterData0.a;
@@ -605,7 +601,7 @@ void main(void) {
     vec4 result = vec4(1.0, 0.0, 0.0, 1.0);
 
     // This would produce more efficient code for swgl if we used a switch statement.
-    // However, the glsl-optimizer pass produces awful code for switch statements,
+    // However, the glsl-optimizer pass produces awful code for switch statements, 
     // resulting in the optimized fragment shader taking half a minute to compile on
     // some Adreno devices. See bug 1929209.
     // We should fix the optimizer to produce more sensible output for switch
@@ -686,10 +682,10 @@ void main(void) {
         result = floor(clamp(Ns * 255.0, vec4(0.0), vec4(255.0)));
         // SWGL doesn't have an intrinsic for ivec4(vec4)
         k = ivec4(int(result.r), int(result.g), int(result.b), int(result.a));
-        result.r = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.r, 0)).r;
-        result.g = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.g, 0)).g;
-        result.b = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.b, 0)).b;
-        result.a = fetch_from_gpu_buffer_1f_direct(vData.xy + ivec2(k.a, 0)).a;
+        result.r = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.r, 0)).r;
+        result.g = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.g, 0)).g;
+        result.b = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.b, 0)).b;
+        result.a = fetch_from_gpu_cache_1_direct(vData.xy + ivec2(k.a, 0)).a;
         result.rgb = result.rgb * result.a;
     } else if (vFilterKind == FILTER_COMPOSITE_ARITHMETIC || vFilterKind == FILTER_COMPOSITE_ARITHMETIC_CONVERTSRGB) {
         result = Rs * Rb * vFilterData0.x + Rs * vFilterData0.y + Rb * vFilterData0.z + vec4(vFilterData0.w);
diff --git a/gfx/wr/webrender/res/gpu_buffer.glsl b/gfx/wr/webrender/res/gpu_buffer.glsl
@@ -10,14 +10,21 @@ ivec2 get_gpu_buffer_uv(HIGHP_FS_ADDRESS int address) {
                  uint(address) / WR_MAX_VERTEX_TEXTURE_WIDTH);
 }
 
-vec4[2] fetch_from_gpu_buffer_2f_direct(ivec2 uv) {
+vec4 fetch_from_gpu_buffer_1f(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_buffer_uv(address);
+    return texelFetch(sGpuBufferF, uv, 0);
+}
+
+vec4[2] fetch_from_gpu_buffer_2f(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_buffer_uv(address);
     return vec4[2](
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)),
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0))
     );
 }
 
-vec4[3] fetch_from_gpu_buffer_3f_direct(ivec2 uv) {
+vec4[3] fetch_from_gpu_buffer_3f(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_buffer_uv(address);
     return vec4[3](
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)),
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)),
@@ -25,7 +32,8 @@ vec4[3] fetch_from_gpu_buffer_3f_direct(ivec2 uv) {
     );
 }
 
-vec4[4] fetch_from_gpu_buffer_4f_direct(ivec2 uv) {
+vec4[4] fetch_from_gpu_buffer_4f(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_buffer_uv(address);
     return vec4[4](
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)),
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)),
@@ -34,7 +42,8 @@ vec4[4] fetch_from_gpu_buffer_4f_direct(ivec2 uv) {
     );
 }
 
-vec4[5] fetch_from_gpu_buffer_5f_direct(ivec2 uv) {
+vec4[5] fetch_from_gpu_buffer_5f(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_buffer_uv(address);
     return vec4[5](
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(0, 0)),
         TEXEL_FETCH(sGpuBufferF, uv, 0, ivec2(1, 0)),
@@ -44,35 +53,6 @@ vec4[5] fetch_from_gpu_buffer_5f_direct(ivec2 uv) {
     );
 }
 
-vec4 fetch_from_gpu_buffer_1f(HIGHP_FS_ADDRESS int address) {
-    ivec2 uv = get_gpu_buffer_uv(address);
-    return texelFetch(sGpuBufferF, uv, 0);
-}
-
-vec4[2] fetch_from_gpu_buffer_2f(HIGHP_FS_ADDRESS int address) {
-    ivec2 uv = get_gpu_buffer_uv(address);
-    return fetch_from_gpu_buffer_2f_direct(uv);
-}
-
-vec4[3] fetch_from_gpu_buffer_3f(HIGHP_FS_ADDRESS int address) {
-    ivec2 uv = get_gpu_buffer_uv(address);
-    return fetch_from_gpu_buffer_3f_direct(uv);
-}
-
-vec4[4] fetch_from_gpu_buffer_4f(HIGHP_FS_ADDRESS int address) {
-    ivec2 uv = get_gpu_buffer_uv(address);
-    return fetch_from_gpu_buffer_4f_direct(uv);
-}
-
-vec4[5] fetch_from_gpu_buffer_5f(HIGHP_FS_ADDRESS int address) {
-    ivec2 uv = get_gpu_buffer_uv(address);
-    return fetch_from_gpu_buffer_5f_direct(uv);
-}
-
-vec4 fetch_from_gpu_buffer_1f_direct(ivec2 uv) {
-    return texelFetch(sGpuBufferF, uv, 0);
-}
-
 ivec4 fetch_from_gpu_buffer_1i(HIGHP_FS_ADDRESS int address) {
     ivec2 uv = get_gpu_buffer_uv(address);
     return texelFetch(sGpuBufferI, uv, 0);
diff --git a/gfx/wr/webrender/res/gpu_cache.glsl b/gfx/wr/webrender/res/gpu_cache.glsl
@@ -0,0 +1,137 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+uniform HIGHP_SAMPLER_FLOAT sampler2D sGpuCache;
+
+#define VECS_PER_IMAGE_RESOURCE     2
+
+// TODO(gw): This is here temporarily while we have
+//           both GPU store and cache. When the GPU
+//           store code is removed, we can change the
+//           PrimitiveInstance instance structure to
+//           use 2x unsigned shorts as vertex attributes
+//           instead of an int, and encode the UV directly
+//           in the vertices.
+ivec2 get_gpu_cache_uv(HIGHP_FS_ADDRESS int address) {
+    return ivec2(uint(address) % WR_MAX_VERTEX_TEXTURE_WIDTH,
+                 uint(address) / WR_MAX_VERTEX_TEXTURE_WIDTH);
+}
+
+vec4[2] fetch_from_gpu_cache_2_direct(ivec2 address) {
+    return vec4[2](
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(1, 0))
+    );
+}
+
+vec4[2] fetch_from_gpu_cache_2(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_cache_uv(address);
+    return vec4[2](
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0))
+    );
+}
+
+vec4 fetch_from_gpu_cache_1_direct(ivec2 address) {
+    return texelFetch(sGpuCache, address, 0);
+}
+
+vec4 fetch_from_gpu_cache_1(HIGHP_FS_ADDRESS int address) {
+    ivec2 uv = get_gpu_cache_uv(address);
+    return texelFetch(sGpuCache, uv, 0);
+}
+
+#ifdef WR_VERTEX_SHADER
+
+vec4[8] fetch_from_gpu_cache_8(int address) {
+    ivec2 uv = get_gpu_cache_uv(address);
+    return vec4[8](
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(2, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(3, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(4, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(5, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(6, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(7, 0))
+    );
+}
+
+vec4[3] fetch_from_gpu_cache_3(int address) {
+    ivec2 uv = get_gpu_cache_uv(address);
+    return vec4[3](
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(2, 0))
+    );
+}
+
+vec4[3] fetch_from_gpu_cache_3_direct(ivec2 address) {
+    return vec4[3](
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(1, 0)),
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(2, 0))
+    );
+}
+
+vec4[4] fetch_from_gpu_cache_4_direct(ivec2 address) {
+    return vec4[4](
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(1, 0)),
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(2, 0)),
+        TEXEL_FETCH(sGpuCache, address, 0, ivec2(3, 0))
+    );
+}
+
+vec4[4] fetch_from_gpu_cache_4(int address) {
+    ivec2 uv = get_gpu_cache_uv(address);
+    return vec4[4](
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(2, 0)),
+        TEXEL_FETCH(sGpuCache, uv, 0, ivec2(3, 0))
+    );
+}
+
+//TODO: image resource is too specific for this module
+
+struct ImageSource {
+    RectWithEndpoint uv_rect;
+    vec4 user_data;
+};
+
+ImageSource fetch_image_source(int address) {
+    //Note: number of blocks has to match `renderer::BLOCKS_PER_UV_RECT`
+    vec4 data[2] = fetch_from_gpu_cache_2(address);
+    RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw);
+    return ImageSource(uv_rect, data[1]);
+}
+
+ImageSource fetch_image_source_direct(ivec2 address) {
+    vec4 data[2] = fetch_from_gpu_cache_2_direct(address);
+    RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw);
+    return ImageSource(uv_rect, data[1]);
+}
+
+// Fetch optional extra data for a texture cache resource. This can contain
+// a polygon defining a UV rect within the texture cache resource.
+// Note: the polygon coordinates are in homogeneous space.
+struct ImageSourceExtra {
+    vec4 st_tl;
+    vec4 st_tr;
+    vec4 st_bl;
+    vec4 st_br;
+};
+
+ImageSourceExtra fetch_image_source_extra(int address) {
+    vec4 data[4] = fetch_from_gpu_cache_4(address + VECS_PER_IMAGE_RESOURCE);
+    return ImageSourceExtra(
+        data[0],
+        data[1],
+        data[2],
+        data[3]
+    );
+}
+
+#endif //WR_VERTEX_SHADER
diff --git a/gfx/wr/webrender/res/gradient.glsl b/gfx/wr/webrender/res/gradient.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include dithering,gpu_buffer
+#include dithering
 
 // Gradient GPU cache address.
 // Packed in to a vector to work around bug 1630356.
diff --git a/gfx/wr/webrender/res/image_source.glsl b/gfx/wr/webrender/res/image_source.glsl
@@ -1,51 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#include gpu_buffer
-
-#define VECS_PER_IMAGE_RESOURCE     2
-
-#ifdef WR_VERTEX_SHADER
-
-#include rect
-
-struct ImageSource {
-    RectWithEndpoint uv_rect;
-    vec4 user_data;
-};
-
-ImageSource fetch_image_source(int address) {
-    //Note: number of blocks has to match `renderer::BLOCKS_PER_UV_RECT`
-    vec4 data[2] = fetch_from_gpu_buffer_2f(address);
-    RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw);
-    return ImageSource(uv_rect, data[1]);
-}
-
-ImageSource fetch_image_source_direct(ivec2 address) {
-    vec4 data[2] = fetch_from_gpu_buffer_2f_direct(address);
-    RectWithEndpoint uv_rect = RectWithEndpoint(data[0].xy, data[0].zw);
-    return ImageSource(uv_rect, data[1]);
-}
-
-// Fetch optional extra data for a texture cache resource. This can contain
-// a polygon defining a UV rect within the texture cache resource.
-// Note: the polygon coordinates are in homogeneous space.
-struct ImageSourceExtra {
-    vec4 st_tl;
-    vec4 st_tr;
-    vec4 st_bl;
-    vec4 st_br;
-};
-
-ImageSourceExtra fetch_image_source_extra(int address) {
-    vec4 data[4] = fetch_from_gpu_buffer_4f(address + VECS_PER_IMAGE_RESOURCE);
-    return ImageSourceExtra(
-        data[0],
-        data[1],
-        data[2],
-        data[3]
-    );
-}
-
-#endif // WR_VERTEX_SHADER
diff --git a/gfx/wr/webrender/res/prim_shared.glsl b/gfx/wr/webrender/res/prim_shared.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include rect,render_task,transform,image_source
+#include rect,render_task,gpu_cache,transform
 
 #define EXTEND_MODE_CLAMP  0
 #define EXTEND_MODE_REPEAT 1
diff --git a/gfx/wr/webrender/res/ps_quad.glsl b/gfx/wr/webrender/res/ps_quad.glsl
@@ -10,14 +10,14 @@
 ///
 ///```ascii
 ///                                       (int gpu buffer)
-///                                       +---------------+   (float gpu buffer)
+///                                       +---------------+    (sGpuCache)
 ///  (instance-step vertex attr)          |  Int header   |   +-----------+
 /// +-----------------------------+       |               |   | Transform |
 /// |    Quad instance (uvec4)    |  +--> | transform id +--> +-----------+
 /// |                             |  |    | z id          |
 /// | x: int prim address        +---+    +---------------+   (float gpu buffer)
 /// | y: float prim address      +--------------------------> +-----------+--------------+-+-+
-/// | z: quad flags               |     (float gpu buffer)    | Quad Prim | Quad Segment | | |
+/// | z: quad flags               |      (sGpuCache)          | Quad Prim | Quad Segment | | |
 /// |    edge flags               |   +--------------------+  |           |              | | |
 /// |    part index               |   |     Picture task   |  | bounds    | rect         | | |
 /// |    segment index            |   |                    |  | clip      | uv rect      | | |
diff --git a/gfx/wr/webrender/res/ps_split_composite.glsl b/gfx/wr/webrender/res/ps_split_composite.glsl
@@ -4,7 +4,7 @@
 
 #define WR_FEATURE_TEXTURE_2D
 
-#include shared,prim_shared,image_source
+#include shared,prim_shared
 
 // interpolated UV coordinates to sample.
 varying highp vec2 vUv;
@@ -21,14 +21,17 @@ struct SplitGeometry {
 };
 
 SplitGeometry fetch_split_geometry(int address) {
-    vec4[2] data = fetch_from_gpu_buffer_2f(address);
+    ivec2 uv = get_gpu_cache_uv(address);
+
+    vec4 data0 = TEXEL_FETCH(sGpuCache, uv, 0, ivec2(0, 0));
+    vec4 data1 = TEXEL_FETCH(sGpuCache, uv, 0, ivec2(1, 0));
 
     SplitGeometry geo;
     geo.local = vec2[4](
-        data[0].xy,
-        data[0].zw,
-        data[1].xy,
-        data[1].zw
+        data0.xy,
+        data0.zw,
+        data1.xy,
+        data1.zw
     );
 
     return geo;
diff --git a/gfx/wr/webrender/res/ps_text_run.glsl b/gfx/wr/webrender/res/ps_text_run.glsl
@@ -2,7 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include shared,prim_shared,gpu_buffer
+#include shared,prim_shared
 
 flat varying mediump vec4 v_color;
 flat varying mediump vec3 v_mask_swizzle;
@@ -45,7 +45,7 @@ Glyph fetch_glyph(int specific_prim_address,
     int glyph_address = specific_prim_address +
                         VECS_PER_TEXT_RUN +
                         int(uint(glyph_index) / GLYPHS_PER_GPU_BLOCK);
-    vec4 data = fetch_from_gpu_buffer_1f(glyph_address);
+    vec4 data = fetch_from_gpu_cache_1(glyph_address);
     // Select XY or ZW based on glyph index.
     vec2 glyph = mix(data.xy, data.zw,
                      bvec2(uint(glyph_index) % GLYPHS_PER_GPU_BLOCK == 1U));
@@ -60,7 +60,7 @@ struct GlyphResource {
 };
 
 GlyphResource fetch_glyph_resource(int address) {
-    vec4 data[2] = fetch_from_gpu_buffer_2f(address);
+    vec4 data[2] = fetch_from_gpu_cache_2(address);
     return GlyphResource(data[0], data[1].xy, data[1].z);
 }
 
@@ -69,7 +69,7 @@ struct TextRun {
 };
 
 TextRun fetch_text_run(int address) {
-    vec4 data = fetch_from_gpu_buffer_1f(address);
+    vec4 data = fetch_from_gpu_cache_1(address);
     return TextRun(data);
 }
 
diff --git a/gfx/wr/webrender/src/batch.rs b/gfx/wr/webrender/src/batch.rs
@@ -11,7 +11,8 @@ use crate::composite::CompositorSurfaceKind;
 use crate::pattern::PatternKind;
 use crate::spatial_tree::{SpatialTree, SpatialNodeIndex, CoordinateSystemId};
 use glyph_rasterizer::{GlyphFormat, SubpixelDirection};
-use crate::gpu_types::{BrushFlags, BrushInstance, ImageSource, PrimitiveHeaders, UvRectKind, ZBufferId, ZBufferIdGenerator};
+use crate::gpu_cache::{GpuBlockData, GpuCache, GpuCacheAddress};
+use crate::gpu_types::{BrushFlags, BrushInstance, PrimitiveHeaders, ZBufferId, ZBufferIdGenerator};
 use crate::gpu_types::SplitCompositeInstance;
 use crate::gpu_types::{PrimitiveInstanceData, RasterizationSpace, GlyphInstance};
 use crate::gpu_types::{PrimitiveHeader, PrimitiveHeaderIndex, TransformPaletteId, TransformPalette};
@@ -27,7 +28,7 @@ use crate::quad;
 use crate::render_target::RenderTargetContext;
 use crate::render_task_graph::{RenderTaskId, RenderTaskGraph};
 use crate::render_task::{RenderTaskAddress, RenderTaskKind, SubPass};
-use crate::renderer::{BlendMode, GpuBufferAddress, GpuBufferBlockF, GpuBufferBuilder, ShaderColorMode};
+use crate::renderer::{BlendMode, GpuBufferBuilder, ShaderColorMode};
 use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH;
 use crate::resource_cache::{GlyphFetchResult, ImageProperties};
 use crate::space::SpaceMapper;
@@ -37,7 +38,6 @@ use std::{f32, i32, usize};
 use crate::util::{project_rect, MaxRect, TransformedRectKind, ScaleOffset};
 use crate::segment::EdgeAaSegmentMask;
 
-
 // Special sentinel value recognized by the shader. It is considered to be
 // a dummy task that doesn't mask out anything.
 const OPAQUE_TASK_ADDRESS: RenderTaskAddress = RenderTaskAddress(0x7fffffff);
@@ -820,6 +820,7 @@ impl BatchBuilder {
         cmd: &PrimitiveCommand,
         prim_spatial_node_index: SpatialNodeIndex,
         ctx: &RenderTargetContext,
+        gpu_cache: &mut GpuCache,
         render_tasks: &RenderTaskGraph,
         prim_headers: &mut PrimitiveHeaders,
         transforms: &mut TransformPalette,
@@ -991,7 +992,7 @@ impl BatchBuilder {
             }
 
             let blend_mode = BlendMode::PremultipliedAlpha;
-            let prim_cache_address = ctx.globals.default_image_data;
+            let prim_cache_address = gpu_cache.get_address(&ctx.globals.default_image_handle);
 
             match picture.raster_config {
                 Some(ref raster_config) => {
@@ -1039,7 +1040,7 @@ impl BatchBuilder {
                     let picture_prim_header = PrimitiveHeader {
                         local_rect: prim_rect,
                         local_clip_rect,
-                        specific_prim_address: prim_cache_address.as_int(),
+                        specific_prim_address: prim_cache_address,
                         transform_id,
                         z: z_id,
                         render_task_address: self.batcher.render_task_address,
@@ -1084,7 +1085,7 @@ impl BatchBuilder {
 
                     let (uv_rect_address, texture) = render_tasks.resolve_location(
                         pic_task_id,
-
+                        gpu_cache,
                     ).unwrap();
 
                     // The set of input textures that most composite modes use,
@@ -1146,7 +1147,7 @@ impl BatchBuilder {
                                     let shadow_textures = textures;
 
                                     let content_uv_rect_address = render_tasks[secondary_id]
-                                        .get_texture_address()
+                                        .get_texture_address(gpu_cache)
                                         .as_int();
 
                                     // Build BatchTextures for shadow/content
@@ -1159,12 +1160,15 @@ impl BatchBuilder {
                                     let shadow_key = BatchKey::new(kind, blend_mode, shadow_textures);
                                     let content_key = BatchKey::new(kind, blend_mode, content_textures);
 
-                                    for (shadow, shadow_prim_address) in shadows.iter().zip(picture.extra_gpu_data.iter()) {
+                                    for (shadow, shadow_gpu_data) in shadows.iter().zip(picture.extra_gpu_data_handles.iter()) {
+                                        // Get the GPU cache address of the extra data handle.
+                                        let shadow_prim_address = gpu_cache.get_address(shadow_gpu_data);
+
                                         let shadow_rect = picture_prim_header.local_rect.translate(shadow.offset);
 
                                         let shadow_prim_header = PrimitiveHeader {
                                             local_rect: shadow_rect,
-                                            specific_prim_address: shadow_prim_address.as_int(),
+                                            specific_prim_address: shadow_prim_address,
                                             z: z_id,
                                             user_data: ImageBrushData {
                                                 color_mode: ShaderColorMode::Alpha,
@@ -1239,10 +1243,10 @@ impl BatchBuilder {
                                             (0.01745329251 * angle * 65536.0) as i32
                                         }
                                         Filter::ColorMatrix(_) => {
-                                            picture.extra_gpu_data[0].as_int()
+                                            picture.extra_gpu_data_handles[0].as_int(gpu_cache)
                                         }
                                         Filter::Flood(_) => {
-                                            picture.extra_gpu_data[0].as_int()
+                                            picture.extra_gpu_data_handles[0].as_int(gpu_cache)
                                         }
 
                                         // These filters are handled via different paths.
@@ -1293,7 +1297,7 @@ impl BatchBuilder {
                                   filter_data.data.b_func.to_int() << 20 |
                                   filter_data.data.a_func.to_int() << 16) as i32);
 
-                            let user_data = filter_data.gpu_buffer_address.as_int();
+                            let user_data = filter_data.gpu_cache_handle.as_int(gpu_cache);
 
                             let key = BatchKey::new(
                                 BatchKind::Brush(BrushBatchKind::Blend),
@@ -1380,8 +1384,8 @@ impl BatchBuilder {
                                     clip_mask: clip_mask_texture_id,
                                 },
                             );
-                            let src_uv_address = render_tasks[pic_task_id].get_texture_address();
-                            let readback_uv_address = render_tasks[backdrop_id].get_texture_address();
+                            let src_uv_address = render_tasks[pic_task_id].get_texture_address(gpu_cache);
+                            let readback_uv_address = render_tasks[backdrop_id].get_texture_address(gpu_cache);
                             let prim_header = PrimitiveHeader {
                                 user_data: [
                                     mode as u32 as i32,
@@ -1489,7 +1493,7 @@ impl BatchBuilder {
                                     );
 
                                     let prim_header = PrimitiveHeader {
-                                        specific_prim_address: prim_cache_address.as_int(),
+                                        specific_prim_address: prim_cache_address,
                                         user_data: batch_params.prim_user_data,
                                         ..picture_prim_header
                                     };
@@ -1596,7 +1600,7 @@ impl BatchBuilder {
             transform_id,
             z: z_id,
             render_task_address: self.batcher.render_task_address,
-            specific_prim_address: GpuBufferAddress::INVALID.as_int(), // Will be overridden by most uses
+            specific_prim_address: GpuCacheAddress::INVALID, // Will be overridden by most uses
             user_data: [0; 4], // Will be overridden by most uses
         };
 
@@ -1620,11 +1624,11 @@ impl BatchBuilder {
         };
 
         let (prim_cache_address, segments) = if segment_instance_index == SegmentInstanceIndex::UNUSED {
-            (common_data.gpu_buffer_address, None)
+            (gpu_cache.try_get_address(&common_data.gpu_cache_handle), None)
         } else {
             let segment_instance = &ctx.scratch.segment_instances[segment_instance_index];
             let segments = Some(&ctx.scratch.segments[segment_instance.segments_range]);
-            (segment_instance.gpu_data, segments)
+            (Some(gpu_cache.get_address(&segment_instance.gpu_cache_handle)), segments)
         };
 
         // The following primitives lower to the image brush shader in the same way.
@@ -1649,7 +1653,7 @@ impl BatchBuilder {
         };
 
         if let Some((src_color, visible_tiles_range, brush_segments)) = img_brush_data {
-            let src_color = render_tasks.resolve_location(src_color);
+            let src_color = render_tasks.resolve_location(src_color, gpu_cache);
 
             let (uv_rect_address, texture_source) = match src_color {
                 Some(src) => src,
@@ -1668,7 +1672,7 @@ impl BatchBuilder {
             }.encode();
 
             let prim_header = PrimitiveHeader {
-                specific_prim_address: common_data.gpu_buffer_address.as_int(),
+                specific_prim_address: gpu_cache.get_address(&common_data.gpu_cache_handle),
                 user_data: prim_user_data,
                 ..base_prim_header
             };
@@ -1771,7 +1775,7 @@ impl BatchBuilder {
                 //           use of interning.
 
                 let prim_header = PrimitiveHeader {
-                    specific_prim_address: prim_cache_address.as_int(),
+                    specific_prim_address: prim_cache_address.unwrap(),
                     user_data: [get_shader_opacity(1.0), 0, 0, 0],
                     ..base_prim_header
                 };
@@ -1805,7 +1809,7 @@ impl BatchBuilder {
                 // task for each valid edge / corner of the border.
 
                 for task_id in task_ids {
-                    if let Some((uv_rect_address, texture)) = render_tasks.resolve_location(*task_id) {
+                    if let Some((uv_rect_address, texture)) = render_tasks.resolve_location(*task_id, gpu_cache) {
                         segment_data.push(
                             SegmentInstanceData {
                                 textures: TextureSet::prim_textured(texture),
@@ -1830,7 +1834,7 @@ impl BatchBuilder {
                 );
 
                 let prim_header = PrimitiveHeader {
-                    specific_prim_address: prim_cache_address.as_int(),
+                    specific_prim_address: prim_cache_address.unwrap(),
                     user_data: batch_params.prim_user_data,
                     ..base_prim_header
                 };
@@ -1878,7 +1882,7 @@ impl BatchBuilder {
                         min: prim_rect.min - run.reference_frame_relative_offset,
                         max: run.snapped_reference_frame_relative_offset.to_point(),
                     },
-                    specific_prim_address: prim_cache_address.as_int(),
+                    specific_prim_address: prim_cache_address.unwrap(),
                     user_data: [
                         (run.raster_scale * 65535.0).round() as i32,
                         0,
@@ -1906,6 +1910,7 @@ impl BatchBuilder {
                     font,
                     &glyph_keys,
                     &mut self.glyph_fetch_buffer,
+                    gpu_cache,
                     |texture_id, glyph_format, glyphs| {
                         debug_assert_ne!(texture_id, TextureSource::Invalid);
 
@@ -2083,7 +2088,7 @@ impl BatchBuilder {
 
                 let (batch_kind, textures, prim_user_data, specific_resource_address) = match render_task {
                     Some(task_id) => {
-                        let (uv_rect_address, texture) = render_tasks.resolve_location(*task_id).unwrap();
+                        let (uv_rect_address, texture) = render_tasks.resolve_location(*task_id, gpu_cache).unwrap();
                         let textures = BatchTextures::prim_textured(
                             texture,
                             clip_mask_texture_id,
@@ -2111,7 +2116,7 @@ impl BatchBuilder {
                 };
 
                 let prim_header = PrimitiveHeader {
-                    specific_prim_address: prim_cache_address.as_int(),
+                    specific_prim_address: prim_cache_address.unwrap(),
                     user_data: prim_user_data,
                     ..base_prim_header
                 };
@@ -2146,7 +2151,7 @@ impl BatchBuilder {
                 );
 
                 let prim_header = PrimitiveHeader {
-                    specific_prim_address: prim_cache_address.as_int(),
+                    specific_prim_address: prim_cache_address.unwrap(),
                     user_data: batch_params.prim_user_data,
                     ..base_prim_header
                 };
@@ -2179,6 +2184,7 @@ impl BatchBuilder {
                         z_id,
                         bounding_rect,
                         ctx,
+                        gpu_cache,
                         render_tasks,
                         prim_headers,
                     );
@@ -2195,7 +2201,7 @@ impl BatchBuilder {
                 debug_assert!(channel_count <= 3);
                 for channel in 0 .. channel_count {
 
-                    let src_channel = render_tasks.resolve_location(yuv_image_data.src_yuv[channel]);
+                    let src_channel = render_tasks.resolve_location(yuv_image_data.src_yuv[channel], gpu_cache);
 
                     let (uv_rect_address, texture_source) = match src_channel {
                         Some(src) => src,
@@ -2240,7 +2246,7 @@ impl BatchBuilder {
                 debug_assert_ne!(segment_instance_index, SegmentInstanceIndex::INVALID);
 
                 let prim_header = PrimitiveHeader {
-                    specific_prim_address: prim_cache_address.as_int(),
+                    specific_prim_address: prim_cache_address.unwrap(),
                     user_data: batch_params.prim_user_data,
                     ..base_prim_header
                 };
@@ -2273,6 +2279,7 @@ impl BatchBuilder {
                         z_id,
                         bounding_rect,
                         ctx,
+                        gpu_cache,
                         render_tasks,
                         prim_headers,
                     );
@@ -2306,7 +2313,7 @@ impl BatchBuilder {
                         }
                     }
 
-                    let src_color = render_tasks.resolve_location(image_instance.src_color);
+                    let src_color = render_tasks.resolve_location(image_instance.src_color, gpu_cache);
 
                     let (uv_rect_address, texture_source) = match src_color {
                         Some(src) => src,
@@ -2324,11 +2331,11 @@ impl BatchBuilder {
 
                     debug_assert_ne!(image_instance.segment_instance_index, SegmentInstanceIndex::INVALID);
                     let (prim_cache_address, segments) = if image_instance.segment_instance_index == SegmentInstanceIndex::UNUSED {
-                        (prim_cache_address, None)
+                        (prim_cache_address.unwrap(), None)
                     } else {
                         let segment_instance = &ctx.scratch.segment_instances[image_instance.segment_instance_index];
                         let segments = Some(&ctx.scratch.segments[segment_instance.segments_range]);
-                        (segment_instance.gpu_data, segments)
+                        (gpu_cache.get_address(&segment_instance.gpu_cache_handle), segments)
                     };
 
                     let local_rect = image_instance.adjustment.map_local_rect(&prim_rect);
@@ -2338,7 +2345,7 @@ impl BatchBuilder {
                     let prim_header = PrimitiveHeader {
                         local_rect,
                         local_clip_rect,
-                        specific_prim_address: prim_cache_address.as_int(),
+                        specific_prim_address: prim_cache_address,
                         user_data: batch_params.prim_user_data,
                         ..base_prim_header
                     };
@@ -2376,7 +2383,7 @@ impl BatchBuilder {
                     ).unwrap();
 
                     // use temporary block storage since we don't know the number of visible tiles beforehand
-                    let mut gpu_blocks = Vec::<GpuBufferBlockF>::with_capacity(3 + max_tiles_per_header * 2);
+                    let mut gpu_blocks = Vec::<GpuBlockData>::with_capacity(3 + max_tiles_per_header * 2);
                     for chunk in image_instance.visible_tiles.chunks(max_tiles_per_header) {
                         gpu_blocks.clear();
                         gpu_blocks.push(image_data.color.premultiplied().into()); //color
@@ -2386,25 +2393,20 @@ impl BatchBuilder {
                         for tile in chunk {
                             let tile_rect = tile.local_rect.translate(-prim_rect.min.to_vector());
                             gpu_blocks.push(tile_rect.into());
-                            gpu_blocks.push([0.0; 4].into());
+                            gpu_blocks.push(GpuBlockData::EMPTY);
                         }
 
-                        let mut writer = gpu_buffer_builder.f32.write_blocks(gpu_blocks.len());
-                        for block in &gpu_blocks {
-                            writer.push_one(*block);
-                        }
-                        let specific_prim_address = writer.finish();
-
+                        let gpu_handle = gpu_cache.push_per_frame_blocks(&gpu_blocks);
                         let prim_header = PrimitiveHeader {
                             local_clip_rect: image_instance.tight_local_clip_rect,
-                            specific_prim_address: specific_prim_address.as_int(),
+                            specific_prim_address: gpu_cache.get_address(&gpu_handle),
                             user_data: prim_user_data,
                             ..base_prim_header
                         };
                         let prim_header_index = prim_headers.push(&prim_header);
 
                         for (i, tile) in chunk.iter().enumerate() {
-                            let (uv_rect_address, texture) = match render_tasks.resolve_location(tile.src_color) {
+                            let (uv_rect_address, texture) = match render_tasks.resolve_location(tile.src_color, gpu_cache) {
                                 Some(result) => result,
                                 None => {
                                     return;
@@ -2453,7 +2455,7 @@ impl BatchBuilder {
 
                     let prim_header = PrimitiveHeader {
                         user_data: user_data,
-                        specific_prim_address: prim_data.gpu_buffer_address.as_int(),
+                        specific_prim_address: gpu_cache.get_address(&prim_data.gpu_cache_handle),
                         ..base_prim_header
                     };
                     let prim_header_index = prim_headers.push(&prim_header);
@@ -2495,7 +2497,7 @@ impl BatchBuilder {
 
                     for tile in visible_tiles {
                         let tile_prim_header = PrimitiveHeader {
-                            specific_prim_address: tile.address.as_int(),
+                            specific_prim_address: gpu_cache.get_address(&tile.handle),
                             local_rect: tile.local_rect,
                             local_clip_rect: tile.local_clip_rect,
                             user_data: user_data,
@@ -2531,7 +2533,10 @@ impl BatchBuilder {
                 let kind = BatchKind::Brush(
                     BrushBatchKind::Image(ImageBufferKind::Texture2D)
                 );
-                let (_, texture) = render_tasks.resolve_location(pic_task_id).unwrap();
+                let (_, texture) = render_tasks.resolve_location(
+                    pic_task_id,
+                    gpu_cache,
+                ).unwrap();
                 let textures = BatchTextures::prim_textured(
                     texture,
                     clip_mask_texture_id,
@@ -2543,7 +2548,7 @@ impl BatchBuilder {
                 );
 
                 let prim_header = PrimitiveHeader {
-                    specific_prim_address: ctx.globals.default_image_data.as_int(),
+                    specific_prim_address: gpu_cache.get_address(&ctx.globals.default_image_handle),
                     user_data: ImageBrushData {
                         color_mode: ShaderColorMode::Image,
                         alpha_type: AlphaType::PremultipliedAlpha,
@@ -2591,19 +2596,23 @@ impl BatchBuilder {
                     calculate_screen_uv(points[3].unwrap() * pic_info.device_pixel_scale, backdrop_rect),
                 ];
 
-                let source = ImageSource {
-                    p0: target_rect.min.to_f32(),
-                    p1: target_rect.max.to_f32(),
-                    user_data: [0.0; 4],
-                    uv_rect_kind: UvRectKind::Quad {
-                        top_left: uvs[0],
-                        top_right: uvs[1],
-                        bottom_left: uvs[2],
-                        bottom_right: uvs[3],
-                    },
-                };
-
-                let uv_rect_address = source.write_gpu_blocks(&mut gpu_buffer_builder.f32);
+                // TODO (gw): This is a hack that provides the GPU cache blocks for an
+                //            ImageSource. We should update the GPU cache interfaces to
+                //            allow pushing per-frame blocks via a request interface.
+                let gpu_blocks = &[
+                    GpuBlockData::from([
+                        target_rect.min.x as f32,
+                        target_rect.min.y as f32,
+                        target_rect.max.x as f32,
+                        target_rect.max.y as f32,
+                    ]),
+                    GpuBlockData::from([0.0; 4]),
+                    GpuBlockData::from(uvs[0]),
+                    GpuBlockData::from(uvs[1]),
+                    GpuBlockData::from(uvs[2]),
+                    GpuBlockData::from(uvs[3]),
+                ];
+                let uv_rect_handle = gpu_cache.push_per_frame_blocks(gpu_blocks);
 
                 self.add_brush_instance_to_batches(
                     key,
@@ -2615,7 +2624,7 @@ impl BatchBuilder {
                     clip_task_address,
                     brush_flags,
                     prim_header_index,
-                    uv_rect_address.as_int(),
+                    uv_rect_handle.as_int(gpu_cache),
                 );
             }
         }
@@ -2632,9 +2641,12 @@ impl BatchBuilder {
         z_id: ZBufferId,
         bounding_rect: &PictureRect,
         ctx: &RenderTargetContext,
+        gpu_cache: &mut GpuCache,
         render_tasks: &RenderTaskGraph,
         prim_headers: &mut PrimitiveHeaders,
     ) {
+        let prim_cache_address = gpu_cache.get_address(&ctx.globals.default_black_rect_handle);
+
         let (clip_task_address, clip_mask_texture_id) = ctx.get_prim_clip_task_and_texture(
             clip_task_index,
             render_tasks,
@@ -2643,7 +2655,7 @@ impl BatchBuilder {
         let prim_header = PrimitiveHeader {
             local_rect: prim_rect,
             local_clip_rect,
-            specific_prim_address: ctx.globals.default_black_rect_address.as_int(),
+            specific_prim_address: prim_cache_address,
             transform_id,
             z: z_id,
             render_task_address: self.batcher.render_task_address,
@@ -3145,6 +3157,7 @@ impl ClipBatcher {
         clip_node_range: ClipNodeRange,
         root_spatial_node_index: SpatialNodeIndex,
         render_tasks: &RenderTaskGraph,
+        gpu_cache: &GpuCache,
         clip_store: &ClipStore,
         transforms: &mut TransformPalette,
         actual_rect: DeviceRect,
@@ -3189,7 +3202,7 @@ impl ClipBatcher {
                     let task_id = source
                         .render_task
                         .expect("bug: render task handle not allocated");
-                    let (uv_rect_address, texture) = render_tasks.resolve_location(task_id).unwrap();
+                    let (uv_rect_address, texture) = render_tasks.resolve_location(task_id, gpu_cache).unwrap();
 
                     self.get_batch_list(is_first_clip)
                         .box_shadows
@@ -3197,7 +3210,7 @@ impl ClipBatcher {
                         .or_insert_with(|| ctx.frame_memory.new_vec())
                         .push(ClipMaskInstanceBoxShadow {
                             common,
-                            resource_address: uv_rect_address.as_int(),
+                            resource_address: uv_rect_address,
                             shadow_data: BoxShadowData {
                                 src_rect_size: source.original_alloc_size,
                                 clip_mode: source.clip_mode as i32,
diff --git a/gfx/wr/webrender/src/clip.rs b/gfx/wr/webrender/src/clip.rs
@@ -98,9 +98,9 @@ use api::units::*;
 use crate::image_tiling::{self, Repetition};
 use crate::border::{ensure_no_corner_overlap, BorderRadiusAu};
 use crate::box_shadow::{BLUR_SAMPLE_SCALE, BoxShadowClipSource, BoxShadowCacheKey};
-use crate::renderer::GpuBufferBuilderF;
 use crate::spatial_tree::{SpatialTree, SpatialNodeIndex};
 use crate::ellipse::Ellipse;
+use crate::gpu_cache::GpuCache;
 use crate::gpu_types::{BoxShadowStretchMode};
 use crate::intern;
 use crate::internal_types::{FastHashMap, FastHashSet, LayoutPrimitiveInfo};
@@ -1092,7 +1092,7 @@ impl ClipNodeInfo {
         &self,
         node: &ClipNode,
         clipped_rect: &LayoutRect,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         resource_cache: &mut ResourceCache,
         mask_tiles: &mut Vec<VisibleMaskImageTile>,
         spatial_tree: &SpatialTree,
@@ -1158,7 +1158,7 @@ impl ClipNodeInfo {
                             if request_resources {
                                 resource_cache.request_image(
                                     req,
-                                    gpu_buffer,
+                                    gpu_cache,
                                 );
                             }
 
@@ -1176,7 +1176,7 @@ impl ClipNodeInfo {
                     visible_tiles = Some(tile_range_start..mask_tiles.len());
                 } else {
                     if request_resources {
-                        resource_cache.request_image(request, gpu_buffer);
+                        resource_cache.request_image(request, gpu_cache);
                     }
 
                     let tile_range_start = mask_tiles.len();
@@ -1499,7 +1499,7 @@ impl ClipStore {
         prim_to_pic_mapper: &SpaceMapper<LayoutPixel, PicturePixel>,
         pic_to_vis_mapper: &SpaceMapper<PicturePixel, VisPixel>,
         spatial_tree: &SpatialTree,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         resource_cache: &mut ResourceCache,
         device_pixel_scale: DevicePixelScale,
         culling_rect: &VisRect,
@@ -1567,7 +1567,7 @@ impl ClipStore {
                     if let Some(instance) = node_info.create_instance(
                         node,
                         &local_bounding_rect,
-                        gpu_buffer,
+                        gpu_cache,
                         resource_cache,
                         &mut self.mask_tiles,
                         spatial_tree,
diff --git a/gfx/wr/webrender/src/command_buffer.rs b/gfx/wr/webrender/src/command_buffer.rs
@@ -4,7 +4,7 @@
 
 use api::units::PictureRect;
 use crate::pattern::{PatternKind, PatternShaderInput};
-use crate::{spatial_tree::SpatialNodeIndex, render_task_graph::RenderTaskId, surface::SurfaceTileDescriptor, picture::TileKey, renderer::GpuBufferAddress, FastHashMap, prim_store::PrimitiveInstanceIndex};
+use crate::{spatial_tree::SpatialNodeIndex, render_task_graph::RenderTaskId, surface::SurfaceTileDescriptor, picture::TileKey, renderer::GpuBufferAddress, FastHashMap, prim_store::PrimitiveInstanceIndex, gpu_cache::GpuCacheAddress};
 use crate::gpu_types::{QuadSegment, TransformPaletteId};
 use crate::segment::EdgeAaSegmentMask;
 
@@ -112,7 +112,7 @@ pub enum PrimitiveCommand {
     },
     Complex {
         prim_instance_index: PrimitiveInstanceIndex,
-        gpu_address: GpuBufferAddress,
+        gpu_address: GpuCacheAddress,
     },
     Instance {
         prim_instance_index: PrimitiveInstanceIndex,
@@ -142,7 +142,7 @@ impl PrimitiveCommand {
 
     pub fn complex(
         prim_instance_index: PrimitiveInstanceIndex,
-        gpu_address: GpuBufferAddress,
+        gpu_address: GpuCacheAddress,
     ) -> Self {
         PrimitiveCommand::Complex {
             prim_instance_index,
@@ -239,11 +239,11 @@ impl CommandBuffer {
             }
             PrimitiveCommand::Complex { prim_instance_index, gpu_address } => {
                 self.commands.push(Command::draw_complex_prim(prim_instance_index));
-                self.commands.push(Command::data(gpu_address.as_u32()));
+                self.commands.push(Command::data((gpu_address.u as u32) << 16 | gpu_address.v as u32));
             }
             PrimitiveCommand::Instance { prim_instance_index, gpu_buffer_address } => {
                 self.commands.push(Command::draw_instance(prim_instance_index));
-                self.commands.push(Command::data(gpu_buffer_address.as_u32()));
+                self.commands.push(Command::data((gpu_buffer_address.u as u32) << 16 | gpu_buffer_address.v as u32));
             }
             PrimitiveCommand::Quad { pattern, pattern_input, prim_instance_index, gpu_buffer_address, transform_id, quad_flags, edge_flags, src_color_task_id } => {
                 self.commands.push(Command::draw_quad(prim_instance_index));
@@ -251,7 +251,7 @@ impl CommandBuffer {
                 self.commands.push(Command::data(pattern_input.0 as u32));
                 self.commands.push(Command::data(pattern_input.1 as u32));
                 self.commands.push(Command::data(src_color_task_id.index));
-                self.commands.push(Command::data(gpu_buffer_address.as_u32()));
+                self.commands.push(Command::data((gpu_buffer_address.u as u32) << 16 | gpu_buffer_address.v as u32));
                 self.commands.push(Command::data(transform_id.0));
                 self.commands.push(Command::data((quad_flags.bits() as u32) << 16 | edge_flags.bits() as u32));
             }
@@ -284,7 +284,10 @@ impl CommandBuffer {
                 Command::CMD_DRAW_COMPLEX_PRIM => {
                     let prim_instance_index = PrimitiveInstanceIndex(param);
                     let data = cmd_iter.next().unwrap();
-                    let gpu_address = GpuBufferAddress::from_u32(data.0);
+                    let gpu_address = GpuCacheAddress {
+                        u: (data.0 >> 16) as u16,
+                        v: (data.0 & 0xffff) as u16,
+                    };
                     let cmd = PrimitiveCommand::complex(
                         prim_instance_index,
                         gpu_address,
@@ -304,7 +307,10 @@ impl CommandBuffer {
                     let bits = cmd_iter.next().unwrap().0;
                     let quad_flags = QuadFlags::from_bits((bits >> 16) as u8).unwrap();
                     let edge_flags = EdgeAaSegmentMask::from_bits((bits & 0xff) as u8).unwrap();
-                    let gpu_buffer_address = GpuBufferAddress::from_u32(data.0);
+                    let gpu_buffer_address = GpuBufferAddress {
+                        u: (data.0 >> 16) as u16,
+                        v: (data.0 & 0xffff) as u16,
+                    };
                     let cmd = PrimitiveCommand::quad(
                         pattern,
                         pattern_input,
@@ -321,7 +327,10 @@ impl CommandBuffer {
                 Command::CMD_DRAW_INSTANCE => {
                     let prim_instance_index = PrimitiveInstanceIndex(param);
                     let data = cmd_iter.next().unwrap();
-                    let gpu_buffer_address = GpuBufferAddress::from_u32(data.0);
+                    let gpu_buffer_address = GpuBufferAddress {
+                        u: (data.0 >> 16) as u16,
+                        v: (data.0 & 0xffff) as u16,
+                    };
                     let cmd = PrimitiveCommand::instance(
                         prim_instance_index,
                         gpu_buffer_address,
diff --git a/gfx/wr/webrender/src/composite.rs b/gfx/wr/webrender/src/composite.rs
@@ -6,8 +6,8 @@ use api::{BorderRadius, ColorF, ExternalImageId, ImageBufferKind, ImageKey, Imag
 use api::units::*;
 use api::ColorDepth;
 use crate::image_source::resolve_image;
-use crate::renderer::GpuBufferBuilderF;
 use euclid::Box2D;
+use crate::gpu_cache::GpuCache;
 use crate::gpu_types::{ZBufferId, ZBufferIdGenerator};
 use crate::internal_types::{FrameAllocator, FrameMemory, FrameVec, TextureSource};
 use crate::picture::{ImageDependency, ResolvedSurfaceTexture, TileCacheInstance, TileId, TileSurface};
@@ -854,7 +854,7 @@ impl CompositeState {
         is_opaque: bool,
         device_clip_rect: DeviceRect,
         resource_cache: &ResourceCache,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         deferred_resolves: &mut FrameVec<DeferredResolve>,
         clip_index: Option<CompositorClipIndex>,
     ) {
@@ -905,7 +905,7 @@ impl CompositeState {
                 &image_dependencies,
                 required_plane_count,
                 resource_cache,
-                gpu_buffer,
+                gpu_cache,
                 deferred_resolves,
             );
             if external_surface_index == ResolvedExternalSurfaceIndex::INVALID {
@@ -967,7 +967,7 @@ impl CompositeState {
         tile_cache: &TileCacheInstance,
         device_clip_rect: DeviceRect,
         resource_cache: &ResourceCache,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         deferred_resolves: &mut FrameVec<DeferredResolve>,
     ) {
         let slice_transform = self.get_compositor_transform(tile_cache.transform_index);
@@ -983,7 +983,7 @@ impl CompositeState {
                 tile_cache.compositor_clip,
                 backdrop_surface.device_rect,
             );
-
+    
             // Use the backdrop native surface we created and add that to the composite state.
             self.descriptor.surfaces.push(
                 CompositeSurfaceDescriptor {
@@ -1006,7 +1006,7 @@ impl CompositeState {
                 true,
                 device_clip_rect,
                 resource_cache,
-                gpu_buffer,
+                gpu_cache,
                 deferred_resolves,
                 tile_cache.compositor_clip,
             );
@@ -1092,7 +1092,7 @@ impl CompositeState {
                     compositor_surface.is_opaque,
                     device_clip_rect,
                     resource_cache,
-                    gpu_buffer,
+                    gpu_cache,
                     deferred_resolves,
                     tile_cache.compositor_clip,
                 );
@@ -1136,7 +1136,7 @@ impl CompositeState {
         image_dependencies: &[ImageDependency; 3],
         required_plane_count: usize,
         resource_cache: &ResourceCache,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         deferred_resolves: &mut FrameVec<DeferredResolve>,
     ) -> ResolvedExternalSurfaceIndex {
         let mut planes = [
@@ -1156,7 +1156,7 @@ impl CompositeState {
             let cache_item = resolve_image(
                 request,
                 resource_cache,
-                gpu_buffer,
+                gpu_cache,
                 deferred_resolves,
                 true,
             );
@@ -1801,7 +1801,7 @@ impl Occluders {
             occluders: memory.new_vec(),
             scratch: OccludersScratchBuffers {
                 events: memory.new_vec(),
-                active: memory.new_vec(),
+                active: memory.new_vec(),    
             }
         }
     }
diff --git a/gfx/wr/webrender/src/filterdata.rs b/gfx/wr/webrender/src/filterdata.rs
@@ -3,8 +3,9 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use std::hash;
+use crate::gpu_cache::{GpuCache, GpuCacheHandle};
+use crate::gpu_cache::GpuDataRequest;
 use crate::intern;
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF, GpuBufferWriterF};
 use api::ComponentTransferFuncType;
 
 
@@ -108,14 +109,14 @@ impl intern::InternDebug for SFilterDataKey {}
 #[derive(MallocSizeOf)]
 pub struct SFilterDataTemplate {
     pub data: SFilterData,
-    pub gpu_buffer_address: GpuBufferAddress,
+    pub gpu_cache_handle: GpuCacheHandle,
 }
 
 impl From<SFilterDataKey> for SFilterDataTemplate {
     fn from(item: SFilterDataKey) -> Self {
         SFilterDataTemplate {
             data: item.data,
-            gpu_buffer_address: GpuBufferAddress::INVALID,
+            gpu_cache_handle: GpuCacheHandle::new(),
         }
     }
 }
@@ -128,14 +129,12 @@ impl SFilterData {
             && self.a_func == SFilterDataComponent::Identity
     }
 
-    pub fn write_gpu_blocks(&self, gpu_buffer: &mut GpuBufferBuilderF) -> GpuBufferAddress {
+    pub fn update(&self, mut request: GpuDataRequest) {
+        push_component_transfer_data(&self.r_func, &mut request);
+        push_component_transfer_data(&self.g_func, &mut request);
+        push_component_transfer_data(&self.b_func, &mut request);
+        push_component_transfer_data(&self.a_func, &mut request);
         assert!(!self.is_identity());
-        let mut writer = gpu_buffer.write_blocks(1024);
-        push_component_transfer_data(&self.r_func, &mut writer);
-        push_component_transfer_data(&self.g_func, &mut writer);
-        push_component_transfer_data(&self.b_func, &mut writer);
-        push_component_transfer_data(&self.a_func, &mut writer);
-        writer.finish()
     }
 }
 
@@ -144,11 +143,13 @@ impl SFilterDataTemplate {
     /// times per frame, by each primitive reference that refers to this interned
     /// template. The initial request call to the GPU cache ensures that work is only
     /// done if the cache entry is invalid (due to first use or eviction).
-    pub fn write_gpu_blocks(
+    pub fn update(
         &mut self,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
     ) {
-        self.gpu_buffer_address = self.data.write_gpu_blocks(gpu_buffer);
+        if let Some(request) = gpu_cache.request(&mut self.gpu_cache_handle) {
+            self.data.update(request);
+        }
     }
 }
 
@@ -165,7 +166,7 @@ impl intern::Internable for FilterDataIntern {
 
 fn push_component_transfer_data(
     func_comp: &SFilterDataComponent,
-    writer: &mut GpuBufferWriterF,
+    request: &mut GpuDataRequest,
 ) {
     match func_comp {
         SFilterDataComponent::Identity => {}
@@ -204,14 +205,14 @@ fn push_component_transfer_data(
                     }
                 }
 
-                writer.push_one(arr);
+                request.push(arr);
             }
         }
         SFilterDataComponent::Linear(a, b) => {
-            writer.push_one([*a, *b, 0.0, 0.0]);
+            request.push([*a, *b, 0.0, 0.0]);
         }
         SFilterDataComponent::Gamma(a, b, c) => {
-            writer.push_one([*a, *b, *c, 0.0]);
+            request.push([*a, *b, *c, 0.0]);
         }
     }
 }
diff --git a/gfx/wr/webrender/src/frame_builder.rs b/gfx/wr/webrender/src/frame_builder.rs
@@ -13,9 +13,10 @@ use crate::spatial_node::SpatialNodeType;
 use crate::spatial_tree::{SpatialTree, SpatialNodeIndex};
 use crate::composite::{CompositorKind, CompositeState, CompositeStatePreallocator};
 use crate::debug_item::DebugItem;
+use crate::gpu_cache::{GpuCache, GpuCacheHandle};
 use crate::gpu_types::{PrimitiveHeaders, TransformPalette, ZBufferIdGenerator};
 use crate::gpu_types::{QuadSegment, TransformData};
-use crate::internal_types::{FastHashMap, PlaneSplitter, FrameStamp};
+use crate::internal_types::{FastHashMap, PlaneSplitter, FrameId, FrameStamp};
 use crate::picture::{DirtyRegion, SliceId, TileCacheInstance};
 use crate::picture::{SurfaceInfo, SurfaceIndex, ResolvedSurfaceTexture};
 use crate::picture::{SubpixelMode, RasterConfig, PictureCompositeMode};
@@ -24,7 +25,7 @@ use crate::prim_store::{PictureIndex, PrimitiveScratchBuffer};
 use crate::prim_store::{DeferredResolve, PrimitiveInstance};
 use crate::profiler::{self, TransactionProfile};
 use crate::render_backend::{DataStores, ScratchBuffer};
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferBuilderF, GpuBufferBuilderI, GpuBufferF, GpuBufferI};
+use crate::renderer::{GpuBufferF, GpuBufferBuilderF, GpuBufferI, GpuBufferBuilderI, GpuBufferBuilder};
 use crate::render_target::{PictureCacheTarget, PictureCacheTargetKind};
 use crate::render_target::{RenderTargetContext, RenderTargetKind, RenderTarget};
 use crate::render_task_graph::{Pass, RenderTaskGraph, RenderTaskId, SubPassSurface};
@@ -80,40 +81,40 @@ pub struct FrameBuilderConfig {
 pub struct FrameGlobalResources {
     /// The image shader block for the most common / default
     /// set of image parameters (color white, stretch == rect.size).
-    pub default_image_data: GpuBufferAddress,
+    pub default_image_handle: GpuCacheHandle,
 
     /// A GPU cache config for drawing cut-out rectangle primitives.
     /// This is used to 'cut out' overlay tiles where a compositor
     /// surface exists.
-    pub default_black_rect_address: GpuBufferAddress,
+    pub default_black_rect_handle: GpuCacheHandle,
 }
 
 impl FrameGlobalResources {
     pub fn empty() -> Self {
         FrameGlobalResources {
-            default_image_data: GpuBufferAddress::INVALID,
-            default_black_rect_address: GpuBufferAddress::INVALID,
+            default_image_handle: GpuCacheHandle::new(),
+            default_black_rect_handle: GpuCacheHandle::new(),
         }
     }
 
     pub fn update(
         &mut self,
-        gpu_buffers: &mut GpuBufferBuilder,
+        gpu_cache: &mut GpuCache,
     ) {
-        let mut writer = gpu_buffers.f32.write_blocks(3);
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one([
-            -1.0,       // -ve means use prim rect for stretch size
-            0.0,
-            0.0,
-            0.0,
-        ]);
-        self.default_image_data = writer.finish();
-
-        let mut writer = gpu_buffers.f32.write_blocks(1);
-        writer.push_one(PremultipliedColorF::BLACK);
-        self.default_black_rect_address = writer.finish();
+        if let Some(mut request) = gpu_cache.request(&mut self.default_image_handle) {
+            request.push(PremultipliedColorF::WHITE);
+            request.push(PremultipliedColorF::WHITE);
+            request.push([
+                -1.0,       // -ve means use prim rect for stretch size
+                0.0,
+                0.0,
+                0.0,
+            ]);
+        }
+
+        if let Some(mut request) = gpu_cache.request(&mut self.default_black_rect_handle) {
+            request.push(PremultipliedColorF::BLACK);
+        }
     }
 }
 
@@ -165,6 +166,7 @@ pub struct FrameBuildingState<'a> {
     pub rg_builder: &'a mut RenderTaskGraphBuilder,
     pub clip_store: &'a mut ClipStore,
     pub resource_cache: &'a mut ResourceCache,
+    pub gpu_cache: &'a mut GpuCache,
     pub transforms: &'a mut TransformPalette,
     pub segment_builder: SegmentBuilder,
     pub surfaces: &'a mut Vec<SurfaceInfo>,
@@ -281,6 +283,7 @@ impl FrameBuilder {
         present: bool,
         global_screen_world_rect: WorldRect,
         resource_cache: &mut ResourceCache,
+        gpu_cache: &mut GpuCache,
         rg_builder: &mut RenderTaskGraphBuilder,
         global_device_pixel_scale: DevicePixelScale,
         scene_properties: &SceneProperties,
@@ -392,7 +395,7 @@ impl FrameBuilder {
                 let mut visibility_state = FrameVisibilityState {
                     clip_store: &mut scene.clip_store,
                     resource_cache,
-                    frame_gpu_data,
+                    gpu_cache,
                     data_stores,
                     clip_tree: &mut scene.clip_tree,
                     composite_state,
@@ -452,7 +455,7 @@ impl FrameBuilder {
                         let mut visibility_state = FrameVisibilityState {
                             clip_store: &mut scene.clip_store,
                             resource_cache,
-                            frame_gpu_data,
+                            gpu_cache,
                             data_stores,
                             clip_tree: &mut scene.clip_tree,
                             composite_state,
@@ -524,6 +527,7 @@ impl FrameBuilder {
             rg_builder,
             clip_store: &mut scene.clip_store,
             resource_cache,
+            gpu_cache,
             transforms: transform_palette,
             segment_builder: SegmentBuilder::new(),
             surfaces: &mut scene.surfaces,
@@ -627,7 +631,7 @@ impl FrameBuilder {
             profile_marker!("BlockOnResources");
 
             resource_cache.block_until_all_resources_added(
-                frame_gpu_data,
+                gpu_cache,
                 profile,
             );
         }
@@ -638,6 +642,7 @@ impl FrameBuilder {
         scene: &mut BuiltScene,
         present: bool,
         resource_cache: &mut ResourceCache,
+        gpu_cache: &mut GpuCache,
         rg_builder: &mut RenderTaskGraphBuilder,
         stamp: FrameStamp,
         device_origin: DeviceIntPoint,
@@ -656,22 +661,18 @@ impl FrameBuilder {
         profile_marker!("BuildFrame");
 
         let mut frame_memory = FrameMemory::new(chunk_pool, stamp.frame_id());
-        // TODO(gw): Recycle backing vec buffers for gpu buffer builder between frames
-        let mut gpu_buffer_builder = GpuBufferBuilder {
-            f32: GpuBufferBuilderF::new(&frame_memory),
-            i32: GpuBufferBuilderI::new(&frame_memory),
-        };
 
         profile.set(profiler::PRIMITIVES, scene.prim_instances.len());
         profile.set(profiler::PICTURE_CACHE_SLICES, scene.tile_cache_config.picture_cache_slice_count);
         scratch.begin_frame();
-        resource_cache.begin_frame(stamp, profile);
+        gpu_cache.begin_frame(stamp);
+        resource_cache.begin_frame(stamp, gpu_cache, profile);
 
         // TODO(gw): Follow up patches won't clear this, as they'll be assigned
         //           statically during scene building.
         scene.surfaces.clear();
 
-        self.globals.update(&mut gpu_buffer_builder);
+        self.globals.update(gpu_cache);
 
         spatial_tree.update_tree(scene_properties);
         let mut transform_palette = spatial_tree.build_transform_palette(&frame_memory);
@@ -697,11 +698,18 @@ impl FrameBuilder {
 
         let mut cmd_buffers = CommandBufferList::new();
 
+        // TODO(gw): Recycle backing vec buffers for gpu buffer builder between frames
+        let mut gpu_buffer_builder = GpuBufferBuilder {
+            f32: GpuBufferBuilderF::new(&frame_memory),
+            i32: GpuBufferBuilderI::new(&frame_memory),
+        };
+
         self.build_layer_screen_rects_and_cull_layers(
             scene,
             present,
             screen_world_rect,
             resource_cache,
+            gpu_cache,
             rg_builder,
             global_device_pixel_scale,
             scene_properties,
@@ -727,7 +735,7 @@ impl FrameBuilder {
         // Finish creating the frame graph and build it.
         let render_tasks = rg_builder.end_frame(
             resource_cache,
-            &mut gpu_buffer_builder,
+            gpu_cache,
             &mut deferred_resolves,
             scene.config.max_shared_surface_size,
             &frame_memory,
@@ -771,6 +779,7 @@ impl FrameBuilder {
                     pass,
                     output_size,
                     &mut ctx,
+                    gpu_cache,
                     &mut gpu_buffer_builder,
                     &render_tasks,
                     &scene.clip_store,
@@ -812,7 +821,7 @@ impl FrameBuilder {
                 self.build_composite_pass(
                     scene,
                     &mut ctx,
-                    &mut gpu_buffer_builder,
+                    gpu_cache,
                     &mut deferred_resolves,
                     &mut composite_state,
                 );
@@ -821,6 +830,8 @@ impl FrameBuilder {
 
         profile.end_time(profiler::FRAME_BATCHING_TIME);
 
+        let gpu_cache_frame_id = gpu_cache.end_frame(profile).frame_id();
+
         resource_cache.end_frame(profile);
 
         self.prim_headers_prealloc.record_vec(&prim_headers.headers_int);
@@ -843,6 +854,7 @@ impl FrameBuilder {
             transform_palette: transform_palette.finish(),
             render_tasks,
             deferred_resolves,
+            gpu_cache_frame_id,
             has_been_rendered: false,
             has_texture_cache_tasks,
             prim_headers,
@@ -994,7 +1006,7 @@ impl FrameBuilder {
         &self,
         scene: &BuiltScene,
         ctx: &RenderTargetContext,
-        gpu_buffers: &mut GpuBufferBuilder,
+        gpu_cache: &mut GpuCache,
         deferred_resolves: &mut FrameVec<DeferredResolve>,
         composite_state: &mut CompositeState,
     ) {
@@ -1022,7 +1034,7 @@ impl FrameBuilder {
                         tile_cache,
                         device_clip_rect,
                         ctx.resource_cache,
-                        &mut gpu_buffers.f32,
+                        gpu_cache,
                         deferred_resolves,
                     );
                 }
@@ -1043,6 +1055,7 @@ pub fn build_render_pass(
     src_pass: &Pass,
     screen_size: DeviceIntSize,
     ctx: &mut RenderTargetContext,
+    gpu_cache: &mut GpuCache,
     gpu_buffer_builder: &mut GpuBufferBuilder,
     render_tasks: &RenderTaskGraph,
     clip_store: &ClipStore,
@@ -1080,6 +1093,7 @@ pub fn build_render_pass(
                             target.add_task(
                                 *task_id,
                                 ctx,
+                                gpu_cache,
                                 gpu_buffer_builder,
                                 render_tasks,
                                 clip_store,
@@ -1104,6 +1118,7 @@ pub fn build_render_pass(
                             target.add_task(
                                 *task_id,
                                 ctx,
+                                gpu_cache,
                                 gpu_buffer_builder,
                                 render_tasks,
                                 clip_store,
@@ -1152,6 +1167,7 @@ pub fn build_render_pass(
                                 cmd,
                                 spatial_node_index,
                                 ctx,
+                                gpu_cache,
                                 render_tasks,
                                 prim_headers,
                                 transforms,
@@ -1240,6 +1256,7 @@ pub fn build_render_pass(
                     texture.add_task(
                         *task_id,
                         ctx,
+                        gpu_cache,
                         gpu_buffer_builder,
                         render_tasks,
                         clip_store,
@@ -1255,6 +1272,7 @@ pub fn build_render_pass(
 
     pass.color.build(
         ctx,
+        gpu_cache,
         render_tasks,
         prim_headers,
         transforms,
@@ -1265,6 +1283,7 @@ pub fn build_render_pass(
     );
     pass.alpha.build(
         ctx,
+        gpu_cache,
         render_tasks,
         prim_headers,
         transforms,
@@ -1277,6 +1296,7 @@ pub fn build_render_pass(
     for target in &mut pass.texture_cache.values_mut() {
         target.build(
             ctx,
+            gpu_cache,
             render_tasks,
             prim_headers,
             transforms,
@@ -1309,6 +1329,9 @@ pub struct Frame {
     pub render_tasks: RenderTaskGraph,
     pub prim_headers: PrimitiveHeaders,
 
+    /// The GPU cache frame that the contents of Self depend on
+    pub gpu_cache_frame_id: FrameId,
+
     /// List of textures that we don't know about yet
     /// from the backend thread. The render thread
     /// will use a callback to resolve these and
diff --git a/gfx/wr/webrender/src/gpu_cache.rs b/gfx/wr/webrender/src/gpu_cache.rs
@@ -0,0 +1,945 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+//! Overview of the GPU cache.
+//!
+//! The main goal of the GPU cache is to allow on-demand
+//! allocation and construction of GPU resources for the
+//! vertex shaders to consume.
+//!
+//! Every item that wants to be stored in the GPU cache
+//! should create a GpuCacheHandle that is used to refer
+//! to a cached GPU resource. Creating a handle is a
+//! cheap operation, that does *not* allocate room in the
+//! cache.
+//!
+//! On any frame when that data is required, the caller
+//! must request that handle, via ```request```. If the
+//! data is not in the cache, the user provided closure
+//! will be invoked to build the data.
+//!
+//! After ```end_frame``` has occurred, callers can
+//! use the ```get_address``` API to get the allocated
+//! address in the GPU cache of a given resource slot
+//! for this frame.
+
+use api::{DebugFlags, DocumentId, PremultipliedColorF};
+#[cfg(test)]
+use api::IdNamespace;
+use api::units::*;
+use euclid::{HomogeneousVector, Box2D};
+use crate::internal_types::{FastHashMap, FastHashSet, FrameStamp, FrameId};
+use crate::profiler::{self, TransactionProfile};
+use crate::prim_store::VECS_PER_SEGMENT;
+use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH;
+use crate::util::VecHelper;
+use std::{u16, u32};
+use std::num::NonZeroU32;
+use std::ops::Add;
+use std::time::{Duration, Instant};
+
+
+/// Initial height, in rows, of the GPU cache texture.
+///
+/// At the time of this writing, Firefox uses about 15 GPU cache rows on
+/// startup, and then gradually works its way up to the mid-30s with normal
+/// browsing.
+pub const GPU_CACHE_INITIAL_HEIGHT: i32 = 20;
+/// Number of rows the texture height grows by when a new row is needed and
+/// the row count has already reached the current height.
+const NEW_ROWS_PER_RESIZE: i32 = 10;
+
+/// The number of frames an entry can go unused before being evicted.
+const FRAMES_BEFORE_EVICTION: u64 = 10;
+
+/// The ratio of utilized blocks to total blocks below which we start the
+/// clock on reclaiming memory.
+const RECLAIM_THRESHOLD: f32 = 0.2;
+
+/// The amount of time, in seconds, utilization must stay below the above
+/// threshold before we blow away the cache and rebuild it.
+const RECLAIM_DELAY_S: u64 = 5;
+
+/// Generation counter stored on blocks and in handle locations; the two are
+/// compared to tell whether a handle still refers to the block's current
+/// contents.
+#[derive(Debug, Copy, Clone, Eq, MallocSizeOf, PartialEq)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+struct Epoch(u32);
+
+impl Epoch {
+    /// Moves to the following epoch in place, wrapping around on overflow.
+    fn next(&mut self) {
+        self.0 = self.0.wrapping_add(1);
+    }
+}
+
+/// Where a handle's data lives in the cache: the block it was assigned plus
+/// the epoch that block had at assignment time. The handle is treated as
+/// current only while the block's epoch still equals the one recorded here.
+#[derive(Debug, Copy, Clone, MallocSizeOf)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+struct CacheLocation {
+    // Index of the block in the texture's block array.
+    block_index: BlockIndex,
+    // Epoch of the block when this location was handed out.
+    epoch: Epoch,
+}
+
+/// One RGBAF32 texel of the cache texture: four `f32` values, 16 bytes.
+#[derive(Copy, Clone, Debug, MallocSizeOf)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+pub struct GpuBlockData {
+    data: [f32; 4],
+}
+
+impl GpuBlockData {
+    /// A block with all four components set to zero.
+    pub const EMPTY: Self = Self { data: [0.0, 0.0, 0.0, 0.0] };
+}
+
+/// Conversion helpers for GpuBlockData: a premultiplied color becomes one
+/// block holding its components in `r, g, b, a` order.
+impl From<PremultipliedColorF> for GpuBlockData {
+    fn from(c: PremultipliedColorF) -> Self {
+        let data = [c.r, c.g, c.b, c.a];
+        Self { data }
+    }
+}
+
+/// Wraps four raw floats as a block, unmodified.
+impl From<[f32; 4]> for GpuBlockData {
+    fn from(data: [f32; 4]) -> Self {
+        Self { data }
+    }
+}
+
+/// Packs a box into one block as `[min.x, min.y, max.x, max.y]`.
+impl<P> From<Box2D<f32, P>> for GpuBlockData {
+    fn from(r: Box2D<f32, P>) -> Self {
+        let (min, max) = (r.min, r.max);
+        Self {
+            data: [min.x, min.y, max.x, max.y],
+        }
+    }
+}
+
+/// Packs a homogeneous vector into one block as `[x, y, z, w]`.
+impl<P> From<HomogeneousVector<f32, P>> for GpuBlockData {
+    fn from(v: HomogeneousVector<f32, P>) -> Self {
+        let data = [v.x, v.y, v.z, v.w];
+        Self { data }
+    }
+}
+
+/// Packs a texel rect into one block as `[uv0.x, uv0.y, uv1.x, uv1.y]`.
+impl From<TexelRect> for GpuBlockData {
+    fn from(tr: TexelRect) -> Self {
+        let (uv0, uv1) = (tr.uv0, tr.uv1);
+        Self {
+            data: [uv0.x, uv0.y, uv1.x, uv1.y],
+        }
+    }
+}
+
+
+// A handle to a GPU resource.
+//
+// A freshly created handle has no location. One is assigned when the
+// `GpuDataRequest` obtained for the handle is dropped, or when the handle is
+// produced by one of the per-frame push functions on `GpuCache`.
+#[derive(Debug, Copy, Clone, MallocSizeOf)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+pub struct GpuCacheHandle {
+    // `None` until the handle has been given a block in the cache.
+    location: Option<CacheLocation>,
+}
+
+impl GpuCacheHandle {
+    /// Creates a handle that does not yet refer to any cache location.
+    pub fn new() -> Self {
+        Self { location: None }
+    }
+
+    /// Resolves this handle through `gpu_cache` and returns its address in the
+    /// single-integer encoding produced by `GpuCacheAddress::as_int`.
+    ///
+    /// Panics (inside `GpuCache::get_address`) if the handle has no location.
+    pub fn as_int(self, gpu_cache: &GpuCache) -> i32 {
+        let address = gpu_cache.get_address(&self);
+        address.as_int()
+    }
+}
+
+// A unique address in the GPU cache. These are uploaded
+// as part of the primitive instances, to allow the vertex
+// shader to fetch the specific data.
+#[repr(C)]
+#[derive(Copy, Debug, Clone, MallocSizeOf, Eq, PartialEq)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+pub struct GpuCacheAddress {
+    // Position of the first block within the row (column index).
+    pub u: u16,
+    // Index of the row in the cache texture.
+    pub v: u16,
+}
+
+impl GpuCacheAddress {
+    /// Builds an address from a column (`u`) and a row (`v`) of the cache
+    /// texture.
+    ///
+    /// Both coordinates are stored as `u16`. Debug builds assert that the
+    /// values fit (mirroring the range check in `BlockIndex::new`) instead of
+    /// letting the `as` casts truncate silently; release builds behave exactly
+    /// as before.
+    fn new(u: usize, v: usize) -> Self {
+        debug_assert!(u <= u16::MAX as usize, "GPU cache column does not fit in u16");
+        debug_assert!(v <= u16::MAX as usize, "GPU cache row does not fit in u16");
+        GpuCacheAddress {
+            u: u as u16,
+            v: v as u16,
+        }
+    }
+
+    /// Sentinel address with both coordinates set to `u16::MAX`.
+    pub const INVALID: GpuCacheAddress = GpuCacheAddress {
+        u: u16::MAX,
+        v: u16::MAX,
+    };
+
+    /// Encodes the address as a single integer: `v * MAX_VERTEX_TEXTURE_WIDTH + u`.
+    pub fn as_int(self) -> i32 {
+        // TODO(gw): Temporarily encode GPU Cache addresses as a single int.
+        //           In the future, we can change the PrimitiveInstanceData struct
+        //           to use 2x u16 for the vertex attribute instead of an i32.
+        self.v as i32 * MAX_VERTEX_TEXTURE_WIDTH as i32 + self.u as i32
+    }
+}
+
+impl Add<usize> for GpuCacheAddress {
+    type Output = GpuCacheAddress;
+
+    /// Offsets the address by `other` blocks along its row; the row (`v`) is
+    /// left unchanged.
+    fn add(self, other: usize) -> GpuCacheAddress {
+        let offset = other as u16;
+        GpuCacheAddress {
+            u: self.u + offset,
+            ..self
+        }
+    }
+}
+
+// An entry in one of the intrusive block lists of the GPU cache
+// (a free-list or a per-document occupied list).
+#[derive(Debug, MallocSizeOf)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+struct Block {
+    // The location in the cache of this block.
+    address: GpuCacheAddress,
+    // The current epoch (generation) of this block. It is advanced when the
+    // block is evicted or invalidated, which makes older handles stale.
+    epoch: Epoch,
+    // Index of the next block in the list this block
+    // belongs to (either a free-list or the
+    // occupied list).
+    next: Option<BlockIndex>,
+    // The last frame this block was referenced.
+    last_access_time: FrameId,
+}
+
+impl Block {
+    /// Builds a block at `address`, linked to `next`, stamped with the given
+    /// frame as its last access time and starting at `epoch`.
+    fn new(
+        address: GpuCacheAddress,
+        next: Option<BlockIndex>,
+        frame_id: FrameId,
+        epoch: Epoch,
+    ) -> Self {
+        let last_access_time = frame_id;
+        Self { address, epoch, next, last_access_time }
+    }
+
+    /// Bumps this block's epoch and raises `max_epoch` to it when the new
+    /// epoch is the larger of the two.
+    fn advance_epoch(&mut self, max_epoch: &mut Epoch) {
+        self.epoch.next();
+        max_epoch.0 = max_epoch.0.max(self.epoch.0);
+    }
+
+    /// Placeholder block with a zero address, epoch zero, no successor and an
+    /// invalid frame id. `Texture::new` stores it at index zero of the block
+    /// array so that no real block ever gets index zero.
+    pub const INVALID: Block = Block {
+        address: GpuCacheAddress { u: 0, v: 0 },
+        epoch: Epoch(0),
+        next: None,
+        last_access_time: FrameId::INVALID,
+    };
+}
+
+/// Index of a `Block` inside the texture's block array. Such values are only
+/// created for blocks that mark the start of a chunk.
+///
+/// `Option<BlockIndex>` is used in many places, so the index is kept in a
+/// `NonZeroU32` and index zero is never handed out.
+#[derive(Debug, Copy, Clone, MallocSizeOf)]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+struct BlockIndex(NonZeroU32);
+
+impl BlockIndex {
+    /// Wraps `idx`. Panics when `idx` is zero; debug builds also assert that
+    /// it fits in a `u32`.
+    fn new(idx: usize) -> Self {
+        debug_assert!(idx <= u32::MAX as usize);
+        let non_zero = NonZeroU32::new(idx as u32);
+        BlockIndex(non_zero.expect("Index zero forbidden"))
+    }
+
+    /// Returns the index as a `usize`, ready for slice indexing.
+    fn get(&self) -> usize {
+        let raw: u32 = self.0.get();
+        raw as usize
+    }
+}
+
+// Metadata for one row of the cache texture.
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
+struct Row {
+    // Size, in blocks, of every item stored in this row. A row acts as a
+    // slab allocator for a single item size, so there is no fragmentation
+    // to deal with inside a row as items are allocated and freed.
+    block_count_per_item: usize,
+}
+
+impl Row {
+    // Creates the metadata for a row holding items of `block_count_per_item` blocks.
+    fn new(block_count_per_item: usize) -> Self {
+        Self { block_count_per_item }
+    }
+}
+
+// A list of update operations that can be applied on the cache
+// this frame. The list of updates is created by the render backend
+// during frame construction. It's passed to the render thread
+// where GL commands can be applied.
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
+pub enum GpuCacheUpdate {
+    // Copy a run of pending blocks to an address in the cache texture.
+    Copy {
+        // Index of the first source block in the pending block list that
+        // accompanies this update (`GpuCacheUpdateList::blocks`).
+        block_index: usize,
+        // Number of consecutive blocks to copy.
+        block_count: usize,
+        // Destination address in the cache texture.
+        address: GpuCacheAddress,
+    },
+}
+
+/// Command to inform the debug display in the renderer when chunks are allocated
+/// or freed. These are only recorded while `DebugFlags::GPU_CACHE_DBG` is set.
+#[derive(MallocSizeOf)]
+pub enum GpuCacheDebugCmd {
+    /// Describes an allocated chunk.
+    Alloc(GpuCacheDebugChunk),
+    /// Describes a freed chunk, identified by its address.
+    Free(GpuCacheAddress),
+}
+
+/// An allocated chunk as reported to the debug display.
+#[derive(Clone, MallocSizeOf)]
+pub struct GpuCacheDebugChunk {
+    /// Address of the chunk in the cache texture.
+    pub address: GpuCacheAddress,
+    /// Number of blocks that were requested for the chunk (the requested
+    /// count, not the rounded-up bucket size).
+    pub size: usize,
+}
+
+/// Everything the render thread needs to bring the device-side cache texture
+/// up to date; produced by `GpuCache::extract_updates`.
+#[must_use]
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
+pub struct GpuCacheUpdateList {
+    /// The frame the current update list was generated from.
+    pub frame_id: FrameId,
+    /// Whether the texture should be cleared before updates
+    /// are applied.
+    pub clear: bool,
+    /// The current height of the texture. The render thread
+    /// should resize the texture if required.
+    pub height: i32,
+    /// List of updates to apply.
+    pub updates: Vec<GpuCacheUpdate>,
+    /// A flat list of GPU blocks that are pending upload
+    /// to GPU memory. `GpuCacheUpdate::Copy::block_index` indexes into it.
+    pub blocks: Vec<GpuBlockData>,
+    /// Chunk alloc/free commands for the GPU cache debug display.
+    /// Marked `serde(skip)` when the `serde` feature is enabled.
+    #[cfg_attr(feature = "serde", serde(skip))]
+    pub debug_commands: Vec<GpuCacheDebugCmd>,
+}
+
+// Holds the free lists of fixed size blocks. Mostly
+// just serves to work around the borrow checker.
+//
+// There is one list head per supported item size (in blocks); `None` means
+// that list currently has no free block.
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
+struct FreeBlockLists {
+    free_list_1: Option<BlockIndex>,
+    free_list_2: Option<BlockIndex>,
+    free_list_4: Option<BlockIndex>,
+    free_list_8: Option<BlockIndex>,
+    free_list_16: Option<BlockIndex>,
+    free_list_32: Option<BlockIndex>,
+    free_list_64: Option<BlockIndex>,
+    free_list_128: Option<BlockIndex>,
+    free_list_256: Option<BlockIndex>,
+    // Three items of this size fit in a 1024-wide row (with one block left over).
+    free_list_341: Option<BlockIndex>,
+    free_list_512: Option<BlockIndex>,
+    free_list_1024: Option<BlockIndex>,
+}
+
+impl FreeBlockLists {
+    /// Creates the set of free lists with every list empty.
+    fn new() -> Self {
+        Self {
+            free_list_1: None,
+            free_list_2: None,
+            free_list_4: None,
+            free_list_8: None,
+            free_list_16: None,
+            free_list_32: None,
+            free_list_64: None,
+            free_list_128: None,
+            free_list_256: None,
+            free_list_341: None,
+            free_list_512: None,
+            free_list_1024: None,
+        }
+    }
+
+    /// Maps a requested `block_count` to the bucket size that will actually
+    /// be allocated, together with the head of the free list for that bucket.
+    ///
+    /// Panics when `block_count` is zero or larger than 1024.
+    fn get_actual_block_count_and_free_list(
+        &mut self,
+        block_count: usize,
+    ) -> (usize, &mut Option<BlockIndex>) {
+        // The bucket sizes below assume a 1024-wide texture.
+        //
+        // The 341 bucket is a small cheat: it is not quite a divisor of 1024,
+        // but purecss-francine allocates many 260-block chunks and there's no
+        // reason not to pack three of them to a row. As a consequence the
+        // allocation statistics under-report by one block for each row using
+        // 341-block buckets, which is fine.
+        debug_assert_eq!(MAX_VERTEX_TEXTURE_WIDTH, 1024, "Need to update bucketing");
+
+        let FreeBlockLists {
+            free_list_1,
+            free_list_2,
+            free_list_4,
+            free_list_8,
+            free_list_16,
+            free_list_32,
+            free_list_64,
+            free_list_128,
+            free_list_256,
+            free_list_341,
+            free_list_512,
+            free_list_1024,
+        } = self;
+
+        match block_count {
+            0 => panic!("Can't allocate zero sized blocks!"),
+            1 => (1, free_list_1),
+            2 => (2, free_list_2),
+            3..=4 => (4, free_list_4),
+            5..=8 => (8, free_list_8),
+            9..=16 => (16, free_list_16),
+            17..=32 => (32, free_list_32),
+            33..=64 => (64, free_list_64),
+            65..=128 => (128, free_list_128),
+            129..=256 => (256, free_list_256),
+            257..=341 => (341, free_list_341),
+            342..=512 => (512, free_list_512),
+            513..=1024 => (1024, free_list_1024),
+            _ => panic!("Can't allocate > MAX_VERTEX_TEXTURE_WIDTH per resource!"),
+        }
+    }
+}
+
+// CPU-side representation of the GPU resource cache texture.
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
+struct Texture {
+    // Current texture height, in rows.
+    height: i32,
+    // All blocks that have been created for this texture. Index zero holds
+    // a dummy block so that no real block uses index zero.
+    blocks: Vec<Block>,
+    // Metadata about each allocated row.
+    rows: Vec<Row>,
+    // The base Epoch for this texture; newly created blocks start at it.
+    base_epoch: Epoch,
+    // The maximum epoch reached. We track this along with the above so
+    // that we can rebuild the Texture and avoid collisions with handles
+    // allocated for the old texture.
+    max_epoch: Epoch,
+    // Free lists of available blocks for each supported
+    // block size in the texture. These are intrusive
+    // linked lists.
+    free_lists: FreeBlockLists,
+    // Heads of the linked lists of currently occupied blocks, one list
+    // per document. This makes it faster to iterate blocks looking for
+    // candidates to be evicted from the cache.
+    occupied_list_heads: FastHashMap<DocumentId, BlockIndex>,
+    // Pending blocks that have been written this frame
+    // and will need to be sent to the GPU.
+    pending_blocks: Vec<GpuBlockData>,
+    // Pending update commands.
+    updates: Vec<GpuCacheUpdate>,
+    // Profile stats: number of blocks currently allocated, counted in
+    // rounded-up bucket sizes. Also the numerator of `utilization()`.
+    allocated_block_count: usize,
+    // The stamp at which we first reached our threshold for reclaiming `GpuCache`
+    // memory, or `None` if the threshold hasn't been reached.
+    #[cfg_attr(feature = "serde", serde(skip))]
+    reached_reclaim_threshold: Option<Instant>,
+    // List of debug commands to be sent to the renderer when the GPU cache
+    // debug display is enabled.
+    #[cfg_attr(feature = "serde", serde(skip))]
+    debug_commands: Vec<GpuCacheDebugCmd>,
+    // The current debug flags for the system.
+    debug_flags: DebugFlags,
+}
+
+impl Texture {
+    // Creates an empty texture at the initial height. Blocks created for it
+    // start at `base_epoch`, which is also the initial `max_epoch`.
+    fn new(base_epoch: Epoch, debug_flags: DebugFlags) -> Self {
+        // Pre-fill the block array with one invalid block so that we never use
+        // 0 for a BlockIndex. This lets us use NonZeroU32 for BlockIndex, which
+        // saves memory.
+        let blocks = vec![Block::INVALID];
+
+        Texture {
+            height: GPU_CACHE_INITIAL_HEIGHT,
+            blocks,
+            rows: Vec::new(),
+            base_epoch,
+            max_epoch: base_epoch,
+            free_lists: FreeBlockLists::new(),
+            pending_blocks: Vec::new(),
+            updates: Vec::new(),
+            occupied_list_heads: FastHashMap::default(),
+            allocated_block_count: 0,
+            reached_reclaim_threshold: None,
+            debug_commands: Vec::new(),
+            debug_flags,
+        }
+    }
+
+    // Push new data into the cache. The ```pending_block_index``` field represents
+    // where the data was pushed into the texture ```pending_blocks``` array, or
+    // is ```None``` when space is only being reserved and no copy is queued.
+    // ```block_count``` is the number of blocks needed; the allocation itself is
+    // rounded up to the size of the matching free-list bucket.
+    // Return the allocated location (block index and epoch) for this data.
+    fn push_data(
+        &mut self,
+        pending_block_index: Option<usize>,
+        block_count: usize,
+        frame_stamp: FrameStamp
+    ) -> CacheLocation {
+        debug_assert!(frame_stamp.is_valid());
+        // Find the appropriate free list to use based on the block size.
+        let (alloc_size, free_list) = self.free_lists
+            .get_actual_block_count_and_free_list(block_count);
+
+        // See if we need a new row (if free-list has nothing available)
+        if free_list.is_none() {
+            // Grow the texture height only when every row up to the current
+            // height has already been handed out.
+            if self.rows.len() as i32 == self.height {
+                self.height += NEW_ROWS_PER_RESIZE;
+            }
+
+            // Create a new row.
+            let items_per_row = MAX_VERTEX_TEXTURE_WIDTH / alloc_size;
+            let row_index = self.rows.len();
+            self.rows.push(Row::new(alloc_size));
+
+            // Create a ```Block``` for each possible allocation address
+            // in this row, and link it in to the free-list for this
+            // block size. Each new block points at the previously created
+            // one, so the last block created ends up as the list head.
+            let mut prev_block_index = None;
+            for i in 0 .. items_per_row {
+                let address = GpuCacheAddress::new(i * alloc_size, row_index);
+                let block_index = BlockIndex::new(self.blocks.len());
+                let block = Block::new(address, prev_block_index, frame_stamp.frame_id(), self.base_epoch);
+                self.blocks.push(block);
+                prev_block_index = Some(block_index);
+            }
+
+            *free_list = prev_block_index;
+        }
+
+        // Given the code above, it's now guaranteed that there is a block
+        // available in the appropriate free-list. Pull a block from the
+        // head of the list.
+        let free_block_index = free_list.take().unwrap();
+        let block = &mut self.blocks[free_block_index.get()];
+        *free_list = block.next;
+
+        // Add the block to the head of the occupied linked list of the
+        // document this frame stamp belongs to.
+        block.next = self.occupied_list_heads.get(&frame_stamp.document_id()).cloned();
+        block.last_access_time = frame_stamp.frame_id();
+        self.occupied_list_heads.insert(frame_stamp.document_id(), free_block_index);
+        // Accounting uses the bucket size, which is also what eviction
+        // subtracts (via the row's block_count_per_item).
+        self.allocated_block_count += alloc_size;
+
+        if let Some(pending_block_index) = pending_block_index {
+            // Add this update to the pending list of blocks that need
+            // to be updated on the GPU.
+            self.updates.push(GpuCacheUpdate::Copy {
+                block_index: pending_block_index,
+                block_count,
+                address: block.address,
+            });
+        }
+
+        // If we're using the debug display, communicate the allocation to the
+        // renderer thread. Note that we do this regardless of whether or not
+        // pending_block_index is None (if it is, the renderer thread will fill
+        // in the data via a deferred resolve, but the block is still considered
+        // allocated).
+        if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) {
+            self.debug_commands.push(GpuCacheDebugCmd::Alloc(GpuCacheDebugChunk {
+                address: block.address,
+                size: block_count,
+            }));
+        }
+
+        CacheLocation {
+            block_index: free_block_index,
+            epoch: block.epoch,
+        }
+    }
+
+    // Run through the list of occupied cache blocks and evict
+    // any old blocks that haven't been referenced for a while.
+    // Only the occupied list of the document identified by
+    // ```frame_stamp``` is visited.
+    fn evict_old_blocks(&mut self, frame_stamp: FrameStamp) {
+        debug_assert!(frame_stamp.is_valid());
+        // Prune any old items from the list to make room.
+        // Traverse the occupied linked list and see
+        // which items have not been used for a long time.
+        let mut current_block = self.occupied_list_heads.get(&frame_stamp.document_id()).map(|x| *x);
+        // Last block that was kept; its `next` is patched when a later block
+        // is unlinked. `None` while we are still at the head of the list.
+        let mut prev_block: Option<BlockIndex> = None;
+
+        while let Some(index) = current_block {
+            let (next_block, should_unlink) = {
+                let block = &mut self.blocks[index.get()];
+
+                // Read the successor before `block.next` is overwritten below.
+                let next_block = block.next;
+                let mut should_unlink = false;
+
+                // If this resource has not been used in the last
+                // few frames, free it from the texture and mark
+                // as empty.
+                if block.last_access_time + FRAMES_BEFORE_EVICTION < frame_stamp.frame_id() {
+                    should_unlink = true;
+
+                    // Get the row metadata from the address.
+                    let row = &mut self.rows[block.address.v as usize];
+
+                    // Use the row metadata to determine which free-list
+                    // this block belongs to.
+                    let (_, free_list) = self.free_lists
+                        .get_actual_block_count_and_free_list(row.block_count_per_item);
+
+                    // Advancing the epoch makes handles that still point at
+                    // this block stale; then push the block onto the free-list.
+                    block.advance_epoch(&mut self.max_epoch);
+                    block.next = *free_list;
+                    *free_list = Some(index);
+
+                    self.allocated_block_count -= row.block_count_per_item;
+
+                    if self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) {
+                        let cmd = GpuCacheDebugCmd::Free(block.address);
+                        self.debug_commands.push(cmd);
+                    }
+                };
+
+                (next_block, should_unlink)
+            };
+
+            // If the block was released, we will need to remove it
+            // from the occupied linked list.
+            if should_unlink {
+                match prev_block {
+                    Some(prev_block) => {
+                        self.blocks[prev_block.get()].next = next_block;
+                    }
+                    None => {
+                        // The released block was the list head: move the head
+                        // to its successor, or drop the entry when the list
+                        // is now empty.
+                        match next_block {
+                            Some(next_block) => {
+                                self.occupied_list_heads.insert(frame_stamp.document_id(), next_block);
+                            }
+                            None => {
+                                self.occupied_list_heads.remove(&frame_stamp.document_id());
+                            }
+                        }
+                    }
+                }
+            } else {
+                prev_block = current_block;
+            }
+
+            current_block = next_block;
+        }
+    }
+
+    /// Returns the ratio of utilized blocks: allocated blocks divided by the
+    /// total capacity of all rows created so far. Debug builds assert that at
+    /// least one row exists and that the ratio lies in `[0, 1]`.
+    fn utilization(&self) -> f32 {
+        let total_blocks = self.rows.len() * MAX_VERTEX_TEXTURE_WIDTH;
+        debug_assert!(total_blocks > 0);
+        let ratio = self.allocated_block_count as f32 / total_blocks as f32;
+        debug_assert!(0.0 <= ratio && ratio <= 1.0, "Bad ratio: {}", ratio);
+        ratio
+    }
+}
+
+
+/// A wrapper object for GPU data requests,
+/// works as a container that can only grow.
+///
+/// Blocks pushed through the request are appended to the texture's pending
+/// block list; when the request is dropped they are allocated in the cache
+/// and the handle receives its location.
+#[must_use]
+pub struct GpuDataRequest<'a> {
+    //TODO: remove this, see
+    // https://bugzilla.mozilla.org/show_bug.cgi?id=1690546
+    // The handle whose location is filled in when the request is dropped.
+    #[allow(dead_code)]
+    handle: &'a mut GpuCacheHandle,
+    // Stamp of the frame the request was made in.
+    frame_stamp: FrameStamp,
+    // Length of the pending block list when the request was created, i.e.
+    // the index of the first block pushed through this request.
+    start_index: usize,
+    // Upper bound on the number of blocks that may be pushed; checked with a
+    // debug assertion on drop.
+    max_block_count: usize,
+    // The texture that receives the pending blocks and the allocation.
+    texture: &'a mut Texture,
+}
+
+impl<'a> GpuDataRequest<'a> {
+    /// Appends one block, converted into `GpuBlockData`, to the pending data
+    /// of this request.
+    pub fn push<B>(&mut self, block: B)
+    where
+        B: Into<GpuBlockData>,
+    {
+        let data: GpuBlockData = block.into();
+        self.texture.pending_blocks.push(data);
+    }
+
+    /// Writes the GPU cache data for an individual segment: its local rect
+    /// followed by one block of extra data.
+    pub fn write_segment(
+        &mut self,
+        local_rect: LayoutRect,
+        extra_data: [f32; 4],
+    ) {
+        let _ = VECS_PER_SEGMENT;
+        self.push(local_rect);
+        self.push(extra_data);
+    }
+
+    /// Number of blocks pushed through this request so far.
+    pub fn current_used_block_num(&self) -> usize {
+        let pending_len = self.texture.pending_blocks.len();
+        pending_len - self.start_index
+    }
+}
+
+impl<'a> Drop for GpuDataRequest<'a> {
+    /// Finalizes the request: allocates room for the blocks that were pushed,
+    /// queues them for upload and stores the resulting location in the handle.
+    fn drop(&mut self) {
+        let block_count = self.current_used_block_num();
+        debug_assert!(block_count <= self.max_block_count);
+
+        // Hand the pushed range over to the texture's pending update list.
+        let first_pending = Some(self.start_index);
+        let stamp = self.frame_stamp;
+        let location = self.texture.push_data(first_pending, block_count, stamp);
+        self.handle.location = Some(location);
+    }
+}
+
+
+/// The main LRU cache interface.
+#[cfg_attr(feature = "capture", derive(Serialize))]
+#[cfg_attr(feature = "replay", derive(Deserialize))]
+#[derive(MallocSizeOf)]
+pub struct GpuCache {
+    /// Stamp of the current frame, as passed to the last `begin_frame` call
+    /// (`FrameStamp::INVALID` before the first one).
+    now: FrameStamp,
+    /// CPU-side texture allocator.
+    texture: Texture,
+    /// Number of blocks requested this frame that don't
+    /// need to be re-uploaded.
+    saved_block_count: usize,
+    /// The current debug flags for the system.
+    debug_flags: DebugFlags,
+    /// Whether there is a pending clear to send with the
+    /// next update.
+    pending_clear: bool,
+    /// Indicates that prepare_for_frames has been called for this group of frames.
+    /// Used for sanity checks.
+    prepared_for_frames: bool,
+    /// This indicates that we performed a cleanup operation which requires all
+    /// documents to build a frame.
+    requires_frame_build: bool,
+    /// The set of documents which still have to build a frame in this update;
+    /// `end_frame` removes the current document from it. Used for sanity
+    /// checks.
+    document_frames_to_build: FastHashSet<DocumentId>,
+}
+
+impl GpuCache {
+    /// Creates an empty cache with no debug flags set and an invalid frame
+    /// stamp; `prepare_for_frames` and `begin_frame` must run before use.
+    pub fn new() -> Self {
+        let debug_flags = DebugFlags::empty();
+        GpuCache {
+            now: FrameStamp::INVALID,
+            texture: Texture::new(Epoch(0), debug_flags),
+            saved_block_count: 0,
+            debug_flags,
+            pending_clear: false,
+            prepared_for_frames: false,
+            requires_frame_build: false,
+            document_frames_to_build: FastHashSet::default(),
+        }
+    }
+
+    /// Creates a GpuCache and sets it up with a valid `FrameStamp`, which
+    /// is useful for avoiding panics when instantiating the `GpuCache`
+    /// directly from unit test code.
+    #[cfg(test)]
+    pub fn new_for_testing() -> Self {
+        let mut cache = Self::new();
+        let mut now = FrameStamp::first(DocumentId::new(IdNamespace(1), 1));
+        now.advance();
+        // begin_frame asserts that the cache was prepared for frames.
+        cache.prepared_for_frames = true;
+        cache.begin_frame(now);
+        cache
+    }
+
+    /// Drops everything in the GPU cache. Must not be called once gpu cache entries
+    /// for the next frame have already been requested.
+    ///
+    /// The replacement texture starts one epoch past the highest epoch the old
+    /// texture reached, so handles allocated from the old texture cannot match
+    /// blocks of the new one. Panics if there are pending updates.
+    pub fn clear(&mut self) {
+        assert!(self.texture.updates.is_empty(), "Clearing with pending updates");
+        let mut next_base_epoch = self.texture.max_epoch;
+        next_base_epoch.next();
+        self.texture = Texture::new(next_base_epoch, self.debug_flags);
+        self.saved_block_count = 0;
+        self.pending_clear = true;
+        self.requires_frame_build = true;
+    }
+
+    /// Whether a cleanup happened that requires all documents to build a
+    /// frame. Reset by `bookkeep_after_frames`.
+    pub fn requires_frame_build(&self) -> bool {
+        self.requires_frame_build
+    }
+
+    /// Marks the cache as prepared for a group of frames and, if utilization
+    /// has been low for long enough, clears the cache.
+    pub fn prepare_for_frames(&mut self) {
+        self.prepared_for_frames = true;
+        if self.should_reclaim_memory() {
+            self.clear();
+            debug_assert!(self.document_frames_to_build.is_empty());
+            // NOTE(review): `clear()` has just replaced `self.texture` with a
+            // fresh `Texture`, whose `occupied_list_heads` map starts out
+            // empty, so this loop has nothing to iterate over and the set
+            // stays empty. If the intent was to record every document that
+            // had data before the clear, the keys would have to be collected
+            // before clearing. Confirm before changing: populating the set
+            // arms the `assert!` in `bookkeep_after_frames`.
+            for &document_id in self.texture.occupied_list_heads.keys() {
+                self.document_frames_to_build.insert(document_id);
+            }
+        }
+    }
+
+    /// Ends a group of frames: resets the `requires_frame_build` and
+    /// `prepared_for_frames` flags. Panics if `prepare_for_frames` was not
+    /// called or if some document is still recorded as needing a frame.
+    pub fn bookkeep_after_frames(&mut self) {
+        assert!(self.document_frames_to_build.is_empty());
+        assert!(self.prepared_for_frames);
+        self.requires_frame_build = false;
+        self.prepared_for_frames = false;
+    }
+
+    /// Begin a new frame: record `stamp` as the current frame, evict blocks
+    /// of that stamp's document that have gone unused for too long, and reset
+    /// the saved-block counter. Panics if `prepare_for_frames` was not called.
+    pub fn begin_frame(&mut self, stamp: FrameStamp) {
+        debug_assert!(self.texture.pending_blocks.is_empty());
+        assert!(self.prepared_for_frames);
+        profile_scope!("begin_frame");
+        self.now = stamp;
+        self.texture.evict_old_blocks(self.now);
+        self.saved_block_count = 0;
+    }
+
+    // Invalidate a (possibly) existing block in the cache.
+    // This means the next call to request() for this location
+    // will rebuild the data and upload it to the GPU.
+    pub fn invalidate(&mut self, handle: &GpuCacheHandle) {
+        if let Some(ref location) = handle.location {
+            // don't invalidate blocks that are already re-assigned
+            if let Some(block) = self.texture.blocks.get_mut(location.block_index.get()) {
+                if block.epoch == location.epoch {
+                    // Bumping the block's epoch makes the handle's recorded
+                    // epoch stale.
+                    block.advance_epoch(&mut self.texture.max_epoch);
+                }
+            }
+        }
+    }
+
+    /// Request a resource be added to the cache. If the resource
+    /// is already in the cache, `None` will be returned and the block is
+    /// marked as used in the current frame. Otherwise a `GpuDataRequest` is
+    /// returned; the caller pushes the data into it, and the allocation
+    /// happens when the request is dropped.
+    pub fn request<'a>(&'a mut self, handle: &'a mut GpuCacheHandle) -> Option<GpuDataRequest<'a>> {
+        // Block limit for the new request. It is only narrowed inside the
+        // early-return branch below, so every returned request carries the
+        // full row width.
+        let mut max_block_count = MAX_VERTEX_TEXTURE_WIDTH;
+        // Check if the allocation for this handle is still valid.
+        if let Some(ref location) = handle.location {
+            if let Some(block) = self.texture.blocks.get_mut(location.block_index.get()) {
+                if block.epoch == location.epoch {
+                    max_block_count = self.texture.rows[block.address.v as usize].block_count_per_item;
+                    if block.last_access_time != self.now.frame_id() {
+                        // Mark last access time to avoid evicting this block.
+                        block.last_access_time = self.now.frame_id();
+                        self.saved_block_count += max_block_count;
+                    }
+                    return None;
+                }
+            }
+        }
+
+        debug_assert!(self.now.is_valid());
+        Some(GpuDataRequest {
+            handle,
+            frame_stamp: self.now,
+            start_index: self.texture.pending_blocks.len(),
+            texture: &mut self.texture,
+            max_block_count,
+        })
+    }
+
+    // Push an array of data blocks to be uploaded to the GPU
+    // unconditionally for this frame. The cache handle will
+    // assert if the caller tries to retrieve the address
+    // of this handle on a subsequent frame. This is typically
+    // used for uploading data that changes every frame, and
+    // therefore makes no sense to try and cache.
+    pub fn push_per_frame_blocks(&mut self, blocks: &[GpuBlockData]) -> GpuCacheHandle {
+        let start_index = self.texture.pending_blocks.len();
+        self.texture.pending_blocks.extend_from_slice(blocks);
+        let location = self.texture
+            .push_data(Some(start_index), blocks.len(), self.now);
+        GpuCacheHandle {
+            location: Some(location),
+        }
+    }
+
+    // Reserve space in the cache for per-frame blocks that
+    // will be resolved by the render thread via the
+    // external image callback. No data is queued for upload
+    // here; only the allocation is made.
+    pub fn push_deferred_per_frame_blocks(&mut self, block_count: usize) -> GpuCacheHandle {
+        let location = self.texture.push_data(None, block_count, self.now);
+        GpuCacheHandle {
+            location: Some(location),
+        }
+    }
+
+    /// End the frame: record profiler counters, start or reset the reclaim
+    /// timer based on utilization, and remove the current document from the
+    /// set of documents that still have to build a frame.
+    ///
+    /// Returns the stamp of the frame that just ended. The pending updates
+    /// themselves are retrieved separately with `extract_updates`.
+    pub fn end_frame(
+        &mut self,
+        profile: &mut TransactionProfile,
+    ) -> FrameStamp {
+        profile_scope!("end_frame");
+        profile.set(profiler::GPU_CACHE_ROWS_TOTAL, self.texture.rows.len());
+        profile.set(profiler::GPU_CACHE_BLOCKS_TOTAL, self.texture.allocated_block_count);
+        profile.set(profiler::GPU_CACHE_BLOCKS_SAVED, self.saved_block_count);
+
+        // Reclaiming is only considered once the texture has grown past its
+        // initial number of rows. The row-count check also guarantees that
+        // `utilization()` is never called with zero rows.
+        let reached_threshold =
+            self.texture.rows.len() > (GPU_CACHE_INITIAL_HEIGHT as usize) &&
+            self.texture.utilization() < RECLAIM_THRESHOLD;
+        if reached_threshold {
+            // Keep the earliest time at which the threshold was reached.
+            self.texture.reached_reclaim_threshold.get_or_insert_with(Instant::now);
+        } else {
+            self.texture.reached_reclaim_threshold = None;
+        }
+
+        self.document_frames_to_build.remove(&self.now.document_id());
+        self.now
+    }
+
+    /// Returns true if utilization has been low enough for long enough that we
+    /// should blow the cache away and rebuild it.
+    pub fn should_reclaim_memory(&self) -> bool {
+        self.texture.reached_reclaim_threshold
+            .map_or(false, |t| t.elapsed() > Duration::from_secs(RECLAIM_DELAY_S))
+    }
+
+    /// Extract the pending updates from the cache. The pending clear flag,
+    /// update commands, pending blocks and debug commands are all taken out
+    /// of the cache, leaving it with none pending.
+    pub fn extract_updates(&mut self) -> GpuCacheUpdateList {
+        let clear = self.pending_clear;
+        self.pending_clear = false;
+        GpuCacheUpdateList {
+            frame_id: self.now.frame_id(),
+            clear,
+            height: self.texture.height,
+            debug_commands: self.texture.debug_commands.take_and_preallocate(),
+            updates: self.texture.updates.take_and_preallocate(),
+            blocks: self.texture.pending_blocks.take_and_preallocate(),
+        }
+    }
+
+    /// Sets the current debug flags for the system, both on the cache and on
+    /// its texture.
+    pub fn set_debug_flags(&mut self, flags: DebugFlags) {
+        self.debug_flags = flags;
+        self.texture.debug_flags = flags;
+    }
+
+    /// Get the actual GPU address in the texture for a given slot ID.
+    /// It's assumed at this point that the given slot has been requested
+    /// and built for this frame. Attempting to get the address for a
+    /// freed or pending slot will panic!
+    pub fn get_address(&self, id: &GpuCacheHandle) -> GpuCacheAddress {
+        self.try_get_address(id).expect("handle not requested or allocated!")
+    }
+
+    /// Get the actual GPU address in the texture for a given slot ID.
+    ///
+    /// Returns None if the handle has no location, i.e. the slot has not
+    /// been requested. Debug builds additionally assert that the block's
+    /// epoch matches the handle and that the block was accessed in the
+    /// current frame.
+    pub fn try_get_address(&self, id: &GpuCacheHandle) -> Option<GpuCacheAddress> {
+        let Some(location) = id.location else { return None; };
+        let block = &self.texture.blocks[location.block_index.get()];
+        debug_assert_eq!(block.epoch, location.epoch);
+        debug_assert_eq!(block.last_access_time, self.now.frame_id());
+        Some(block.address)
+    }
+}
+
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn test_struct_sizes() {
+    // The global block vec can hold a great many entries, so a compact
+    // `Block` keeps memory overhead down; catch accidental growth here.
+    let block_size = std::mem::size_of::<Block>();
+    assert_eq!(block_size, 24, "Block size changed");
+}
diff --git a/gfx/wr/webrender/src/gpu_types.rs b/gfx/wr/webrender/src/gpu_types.rs
@@ -4,15 +4,15 @@
 
 use api::{AlphaType, PremultipliedColorF, YuvFormat, YuvRangedColorSpace};
 use api::units::*;
-use euclid::HomogeneousVector;
 use crate::composite::{CompositeFeatures, CompositorClip};
 use crate::segment::EdgeAaSegmentMask;
 use crate::spatial_tree::{SpatialTree, SpatialNodeIndex};
+use crate::gpu_cache::{GpuCacheAddress, GpuDataRequest};
 use crate::internal_types::{FastHashMap, FrameVec, FrameMemory};
 use crate::prim_store::ClipData;
 use crate::render_task::RenderTaskAddress;
 use crate::render_task_graph::RenderTaskId;
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF, GpuBufferWriterF, ShaderColorMode};
+use crate::renderer::{ShaderColorMode, GpuBufferAddress};
 use std::i32;
 use crate::util::{MatrixHelpers, TransformedRectKind};
 use glyph_rasterizer::SubpixelDirection;
@@ -172,7 +172,7 @@ pub struct SvgFilterInstance {
     pub input_count: u16,
     pub generic_int: u16,
     pub padding: u16,
-    pub extra_data_address: i32,
+    pub extra_data_address: GpuCacheAddress,
 }
 
 #[derive(Clone, Debug)]
@@ -187,7 +187,7 @@ pub struct SVGFEFilterInstance {
     pub input_2_task_address: RenderTaskAddress,
     pub kind: u16,
     pub input_count: u16,
-    pub extra_data_address: i32,
+    pub extra_data_address: GpuCacheAddress,
 }
 
 #[derive(Copy, Clone, Debug, Hash, MallocSizeOf, PartialEq, Eq)]
@@ -261,7 +261,7 @@ pub struct BoxShadowData {
 #[repr(C)]
 pub struct ClipMaskInstanceBoxShadow {
     pub common: ClipMaskInstanceCommon,
-    pub resource_address: i32,
+    pub resource_address: GpuCacheAddress,
     pub shadow_data: BoxShadowData,
 }
 
@@ -505,7 +505,7 @@ impl PrimitiveHeaders {
         self.headers_int.push(PrimitiveHeaderI {
             z: prim_header.z,
             render_task_address: prim_header.render_task_address,
-            specific_prim_address: prim_header.specific_prim_address,
+            specific_prim_address: prim_header.specific_prim_address.as_int(),
             transform_id: prim_header.transform_id,
             user_data: prim_header.user_data,
         });
@@ -520,7 +520,7 @@ impl PrimitiveHeaders {
 pub struct PrimitiveHeader {
     pub local_rect: LayoutRect,
     pub local_clip_rect: LayoutRect,
-    pub specific_prim_address: i32,
+    pub specific_prim_address: GpuCacheAddress,
     pub transform_id: TransformPaletteId,
     pub z: ZBufferId,
     pub render_task_address: RenderTaskAddress,
@@ -571,7 +571,7 @@ impl GlyphInstance {
         clip_task: RenderTaskAddress,
         subpx_dir: SubpixelDirection,
         glyph_index_in_text_run: i32,
-        glyph_uv_rect: GpuBufferAddress,
+        glyph_uv_rect: GpuCacheAddress,
         color_mode: ShaderColorMode,
     ) -> PrimitiveInstanceData {
         PrimitiveInstanceData {
@@ -612,8 +612,8 @@ impl From<SplitCompositeInstance> for PrimitiveInstanceData {
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct QuadInstance {
     pub dst_task_address: RenderTaskAddress,
-    pub prim_address_i: i32,
-    pub prim_address_f: i32,
+    pub prim_address_i: GpuBufferAddress,
+    pub prim_address_f: GpuBufferAddress,
     pub quad_flags: u8,
     pub edge_flags: u8,
     pub part_index: u8,
@@ -631,8 +631,8 @@ impl From<QuadInstance> for PrimitiveInstanceData {
 
         PrimitiveInstanceData {
             data: [
-                instance.prim_address_i,
-                instance.prim_address_f,
+                instance.prim_address_i.as_int(),
+                instance.prim_address_f.as_int(),
 
                 ((instance.quad_flags as i32)    << 24) |
                 ((instance.edge_flags as i32)    << 16) |
@@ -1006,34 +1006,25 @@ pub struct ImageSource {
 }
 
 impl ImageSource {
-    pub fn write_gpu_blocks(&self, gpu_buffer: &mut GpuBufferBuilderF) -> GpuBufferAddress {
-        let mut writer = gpu_buffer.write_blocks(6);
-        self.push_gpu_blocks(&mut writer);
-        writer.finish()
-    }
-
-    pub fn push_gpu_blocks(&self, writer: &mut GpuBufferWriterF) {
+    pub fn write_gpu_blocks(&self, request: &mut GpuDataRequest) {
         // see fetch_image_resource in GLSL
         // has to be VECS_PER_IMAGE_RESOURCE vectors
-        writer.push_one([
+        request.push([
             self.p0.x,
             self.p0.y,
             self.p1.x,
             self.p1.y,
         ]);
-        writer.push_one(self.user_data);
+        request.push(self.user_data);
 
         // If this is a polygon uv kind, then upload the four vertices.
         if let UvRectKind::Quad { top_left, top_right, bottom_left, bottom_right } = self.uv_rect_kind {
             // see fetch_image_resource_extra in GLSL
             //Note: we really need only 3 components per point here: X, Y, and W
-            fn to_array(v: HomogeneousVector<f32, DevicePixel>) -> [f32; 4] {
-                [v.x, v.y, v.z, v.w]
-            }
-            writer.push_one(to_array(top_left));
-            writer.push_one(to_array(top_right));
-            writer.push_one(to_array(bottom_left));
-            writer.push_one(to_array(bottom_right));
+            request.push(top_left);
+            request.push(top_right);
+            request.push(bottom_left);
+            request.push(bottom_right);
         }
     }
 }
diff --git a/gfx/wr/webrender/src/image_source.rs b/gfx/wr/webrender/src/image_source.rs
@@ -11,10 +11,10 @@
 
 use crate::api::ExternalImageType;
 use crate::api::units::*;
+use crate::gpu_cache::GpuCache;
 use crate::prim_store::DeferredResolve;
 use crate::renderer::BLOCKS_PER_UV_RECT;
 use crate::render_task_cache::RenderTaskCacheEntryHandle;
-use crate::renderer::GpuBufferBuilderF;
 use crate::resource_cache::{ResourceCache, ImageRequest, CacheItem};
 use crate::internal_types::{TextureSource, TextureSourceExternal, DeferredResolveIndex, FrameVec};
 
@@ -22,7 +22,7 @@ use crate::internal_types::{TextureSource, TextureSourceExternal, DeferredResolv
 pub fn resolve_image(
     request: ImageRequest,
     resource_cache: &ResourceCache,
-    gpu_buffer: &mut GpuBufferBuilderF,
+    gpu_cache: &mut GpuCache,
     deferred_resolves: &mut FrameVec<DeferredResolve>,
     is_composited: bool,
 ) -> CacheItem {
@@ -35,7 +35,7 @@ pub fn resolve_image(
                     // This is an external texture - we will add it to
                     // the deferred resolves list to be patched by
                     // the render thread...
-                    let uv_rect_address = gpu_buffer.reserve_renderer_deferred_blocks(BLOCKS_PER_UV_RECT);
+                    let cache_handle = gpu_cache.push_deferred_per_frame_blocks(BLOCKS_PER_UV_RECT);
 
                     let deferred_resolve_index = DeferredResolveIndex(deferred_resolves.len() as u32);
 
@@ -56,7 +56,7 @@ pub fn resolve_image(
                             kind: image_buffer_kind,
                             normalized_uvs: external_image.normalized_uvs,
                         }),
-                        uv_rect_handle: uv_rect_address,
+                        uv_rect_handle: cache_handle,
                         uv_rect: DeviceIntRect::from_size(
                             image_properties.descriptor.size,
                         ),
@@ -65,7 +65,7 @@ pub fn resolve_image(
 
                     deferred_resolves.push(DeferredResolve {
                         image_properties,
-                        address: uv_rect_address,
+                        address: gpu_cache.get_address(&cache_handle),
                         rendering: request.rendering,
                         is_composited,
                     });
diff --git a/gfx/wr/webrender/src/internal_types.rs b/gfx/wr/webrender/src/internal_types.rs
@@ -10,6 +10,7 @@ use crate::render_api::DebugCommand;
 use crate::composite::NativeSurfaceOperation;
 use crate::device::TextureFilter;
 use crate::renderer::{FullFrameStats, PipelineInfo};
+use crate::gpu_cache::GpuCacheUpdateList;
 use crate::gpu_types::BlurEdgeMode;
 use crate::frame_builder::Frame;
 use crate::profiler::TransactionProfile;
@@ -1349,6 +1350,7 @@ pub enum ResultMsg {
     DebugCommand(DebugCommand),
     DebugOutput(DebugOutput),
     RefreshShader(PathBuf),
+    UpdateGpuCache(GpuCacheUpdateList),
     UpdateResources {
         resource_updates: ResourceUpdateList,
         memory_pressure: bool,
diff --git a/gfx/wr/webrender/src/lib.rs b/gfx/wr/webrender/src/lib.rs
@@ -102,6 +102,7 @@ mod filterdata;
 mod frame_builder;
 mod freelist;
 mod glyph_cache;
+mod gpu_cache;
 mod gpu_types;
 mod hit_test;
 mod internal_types;
diff --git a/gfx/wr/webrender/src/picture.rs b/gfx/wr/webrender/src/picture.rs
@@ -116,6 +116,7 @@ use crate::intern::ItemUid;
 use crate::internal_types::{FastHashMap, FastHashSet, PlaneSplitter, FilterGraphOp, FilterGraphNode, Filter, FrameId};
 use crate::internal_types::{PlaneSplitterIndex, PlaneSplitAnchor, TextureSource};
 use crate::frame_builder::{FrameBuildingContext, FrameBuildingState, PictureState, PictureContext};
+use crate::gpu_cache::{GpuCache, GpuCacheAddress, GpuCacheHandle};
 use crate::gpu_types::{UvRectKind, ZBufferId, BlurEdgeMode};
 use peek_poke::{PeekPoke, poke_into_vec, peek_from_slice, ensure_red_zone};
 use plane_split::{Clipper, Polygon};
@@ -127,7 +128,7 @@ use crate::render_task_graph::RenderTaskId;
 use crate::render_target::RenderTargetKind;
 use crate::render_task::{BlurTask, RenderTask, RenderTaskLocation, BlurTaskCache};
 use crate::render_task::{StaticRenderTaskSurface, RenderTaskKind};
-use crate::renderer::{BlendMode, GpuBufferAddress};
+use crate::renderer::BlendMode;
 use crate::resource_cache::{ResourceCache, ImageGeneration, ImageRequest};
 use crate::space::SpaceMapper;
 use crate::scene::SceneProperties;
@@ -2193,7 +2194,7 @@ impl TileCacheInstance {
                 &map_local_to_picture,
                 &pic_to_vis_mapper,
                 frame_context.spatial_tree,
-                &mut frame_state.frame_gpu_data.f32,
+                frame_state.gpu_cache,
                 frame_state.resource_cache,
                 frame_context.global_device_pixel_scale,
                 &surface.culling_rect,
@@ -2725,7 +2726,7 @@ impl TileCacheInstance {
         api_keys: &[ImageKey; 3],
         resource_cache: &mut ResourceCache,
         composite_state: &mut CompositeState,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         image_rendering: ImageRendering,
         color_depth: ColorDepth,
         color_space: YuvRangedColorSpace,
@@ -2740,7 +2741,7 @@ impl TileCacheInstance {
                         rendering: image_rendering,
                         tile: None,
                     },
-                    gpu_buffer,
+                    gpu_cache,
                 );
             }
         }
@@ -2781,7 +2782,7 @@ impl TileCacheInstance {
         api_key: ImageKey,
         resource_cache: &mut ResourceCache,
         composite_state: &mut CompositeState,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         image_rendering: ImageRendering,
         is_opaque: bool,
         surface_kind: CompositorSurfaceKind,
@@ -2800,7 +2801,7 @@ impl TileCacheInstance {
                 rendering: image_rendering,
                 tile: None,
             },
-            gpu_buffer,
+            gpu_cache,
         );
 
         self.setup_compositor_surfaces_impl(
@@ -3146,7 +3147,7 @@ impl TileCacheInstance {
         color_bindings: &ColorBindingStorage,
         surface_stack: &[(PictureIndex, SurfaceIndex)],
         composite_state: &mut CompositeState,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         scratch: &mut PrimitiveScratchBuffer,
         is_root_tile_cache: bool,
         surfaces: &mut [SurfaceInfo],
@@ -3390,7 +3391,7 @@ impl TileCacheInstance {
                             image_data.key,
                             resource_cache,
                             composite_state,
-                            gpu_buffer,
+                            gpu_cache,
                             image_data.image_rendering,
                             is_opaque,
                             kind,
@@ -3508,7 +3509,7 @@ impl TileCacheInstance {
                             &prim_data.kind.yuv_key,
                             resource_cache,
                             composite_state,
-                            gpu_buffer,
+                            gpu_cache,
                             prim_data.kind.image_rendering,
                             prim_data.kind.color_depth,
                             prim_data.kind.color_space.with_range(prim_data.kind.color_range),
@@ -4956,7 +4957,7 @@ pub enum Picture3DContext<C> {
 #[cfg_attr(feature = "capture", derive(Serialize))]
 pub struct OrderedPictureChild {
     pub anchor: PlaneSplitAnchor,
-    pub gpu_address: GpuBufferAddress,
+    pub gpu_address: GpuCacheAddress,
 }
 
 bitflags! {
@@ -5217,7 +5218,7 @@ pub struct PicturePrimitive {
     // Optional cache handles for storing extra data
     // in the GPU cache, depending on the type of
     // picture.
-    pub extra_gpu_data: SmallVec<[GpuBufferAddress; 1]>,
+    pub extra_gpu_data_handles: SmallVec<[GpuCacheHandle; 1]>,
 
     /// The spatial node index of this picture when it is
     /// composited into the parent picture.
@@ -5331,7 +5332,7 @@ impl PicturePrimitive {
             composite_mode,
             raster_config: None,
             context_3d,
-            extra_gpu_data: SmallVec::new(),
+            extra_gpu_data_handles: SmallVec::new(),
             is_backface_visible: prim_flags.contains(PrimitiveFlags::IS_BACKFACE_VISIBLE),
             spatial_node_index,
             prev_local_rect: LayoutRect::zero(),
@@ -5509,7 +5510,7 @@ impl PicturePrimitive {
                         if let Some(TileSurface::Texture { descriptor, .. }) = tile.surface.as_ref() {
                             if let SurfaceTextureDescriptor::TextureCache { handle: Some(handle), .. } = descriptor {
                                 frame_state.resource_cache
-                                    .picture_textures.request(handle, &mut frame_state.frame_gpu_data.f32);
+                                    .picture_textures.request(handle, frame_state.gpu_cache);
                             }
                         }
 
@@ -5545,7 +5546,7 @@ impl PicturePrimitive {
                                         // TODO(gw): Consider switching to manual eviction policy?
                                         frame_state.resource_cache
                                             .picture_textures
-                                            .request(handle.as_ref().unwrap(), &mut frame_state.frame_gpu_data.f32);
+                                            .request(handle.as_ref().unwrap(), frame_state.gpu_cache);
                                     } else {
                                         // If the texture was evicted on a previous frame, we need to assume
                                         // that the entire tile rect is dirty.
@@ -5602,7 +5603,7 @@ impl PicturePrimitive {
                                         frame_state.resource_cache.picture_textures.update(
                                             tile_cache.current_tile_size,
                                             handle,
-                                            &mut frame_state.frame_gpu_data.f32,
+                                            frame_state.gpu_cache,
                                             &mut frame_state.resource_cache.texture_cache.next_id,
                                             &mut frame_state.resource_cache.texture_cache.pending_updates,
                                         );
@@ -6027,6 +6028,14 @@ impl PicturePrimitive {
                 //           use of the conservative picture rect for segmenting (which should
                 //           be done during scene building).
                 if local_rect != self.prev_local_rect {
+                    match raster_config.composite_mode {
+                        PictureCompositeMode::Filter(Filter::DropShadows(..)) => {
+                            for handle in &self.extra_gpu_data_handles {
+                                frame_state.gpu_cache.invalidate(handle);
+                            }
+                        }
+                        _ => {}
+                    }
                     // Invalidate any segments built for this picture, since the local
                     // rect has changed.
                     self.segments_are_valid = false;
@@ -6130,7 +6139,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             false,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 RenderTask::new_blur(
                                     blur_std_deviation,
                                     picture_task_id,
@@ -6179,7 +6188,7 @@ impl PicturePrimitive {
 
                         let mut blur_tasks = BlurTaskCache::default();
 
-                        self.extra_gpu_data.resize(shadows.len(), GpuBufferAddress::INVALID);
+                        self.extra_gpu_data_handles.resize(shadows.len(), GpuCacheHandle::new());
 
                         let mut blur_render_task_id = picture_task_id;
                         for shadow in shadows {
@@ -6307,7 +6316,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             is_opaque,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 rg_builder.add().init(
                                     RenderTask::new_dynamic(
                                         task_size,
@@ -6346,7 +6355,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             is_opaque,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 rg_builder.add().init(
                                     RenderTask::new_dynamic(
                                         surface_rects.task_size,
@@ -6385,7 +6394,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             is_opaque,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 rg_builder.add().init(
                                     RenderTask::new_dynamic(
                                         surface_rects.task_size,
@@ -6425,7 +6434,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             is_opaque,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 rg_builder.add().init(
                                     RenderTask::new_dynamic(
                                         surface_rects.task_size,
@@ -6470,7 +6479,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             is_opaque,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 rg_builder.add().init(
                                     RenderTask::new_dynamic(
                                         surface_rects.task_size,
@@ -6529,7 +6538,7 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             is_opaque,
-                            &mut|rg_builder, _| {
+                            &mut|rg_builder, _, _| {
                                 RenderTask::new_svg_filter(
                                     primitives,
                                     filter_datas,
@@ -6604,11 +6613,11 @@ impl PicturePrimitive {
                             &self.snapshot,
                             &surface_rects,
                             false,
-                            &mut|rg_builder, gpu_buffer| {
+                            &mut|rg_builder, _, gpu_cache| {
                                 RenderTask::new_svg_filter_graph(
                                     filters,
                                     rg_builder,
-                                    gpu_buffer,
+                                    gpu_cache,
                                     data_stores,
                                     surface_rects.uv_rect_kind,
                                     picture_task_id,
@@ -6769,7 +6778,7 @@ impl PicturePrimitive {
             PicturePrimitive::resolve_split_planes(
                 splitter,
                 list,
-                &mut frame_state.frame_gpu_data.f32,
+                &mut frame_state.gpu_cache,
                 &frame_context.spatial_tree,
             );
 
@@ -6878,7 +6887,7 @@ impl PicturePrimitive {
     fn resolve_split_planes(
         splitter: &mut PlaneSplitter,
         ordered: &mut Vec<OrderedPictureChild>,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         spatial_tree: &SpatialTree,
     ) {
         ordered.clear();
@@ -6914,11 +6923,12 @@ impl PicturePrimitive {
             let p1 = local_points[1].unwrap();
             let p2 = local_points[2].unwrap();
             let p3 = local_points[3].unwrap();
-
-            let mut writer = gpu_buffer.write_blocks(2);
-            writer.push_one([p0.x, p0.y, p1.x, p1.y]);
-            writer.push_one([p2.x, p2.y, p3.x, p3.y]);
-            let gpu_address = writer.finish();
+            let gpu_blocks = [
+                [p0.x, p0.y, p1.x, p1.y].into(),
+                [p2.x, p2.y, p3.x, p3.y].into(),
+            ];
+            let gpu_handle = gpu_cache.push_per_frame_blocks(&gpu_blocks);
+            let gpu_address = gpu_cache.get_address(&gpu_handle);
 
             ordered.push(OrderedPictureChild {
                 anchor: poly.anchor,
@@ -7242,7 +7252,7 @@ impl PicturePrimitive {
             }
         };
 
-        // TODO(gw): Almost all of the Picture types below use extra_gpu_data
+        // TODO(gw): Almost all of the Picture types below use extra_gpu_cache_data
         //           to store the same type of data. The exception is the filter
         //           with a ColorMatrix, which stores the color matrix here. It's
         //           probably worth tidying this code up to be a bit more consistent.
@@ -7253,68 +7263,67 @@ impl PicturePrimitive {
             PictureCompositeMode::TileCache { .. } => {}
             PictureCompositeMode::Filter(Filter::Blur { .. }) => {}
             PictureCompositeMode::Filter(Filter::DropShadows(ref shadows)) => {
-                self.extra_gpu_data.resize(shadows.len(), GpuBufferAddress::INVALID);
-                for (shadow, extra_handle) in shadows.iter().zip(self.extra_gpu_data.iter_mut()) {
-                    let mut writer = frame_state.frame_gpu_data.f32.write_blocks(5);
-                    let surface = &frame_state.surfaces[raster_config.surface_index.0];
-                    let prim_rect = surface.clipped_local_rect.cast_unit();
-
-                    // Basic brush primitive header is (see end of prepare_prim_for_render_inner in prim_store.rs)
-                    //  [brush specific data]
-                    //  [segment_rect, segment data]
-                    let (blur_inflation_x, blur_inflation_y) = surface.clamp_blur_radius(
-                        shadow.blur_radius,
-                        shadow.blur_radius,
-                    );
+                self.extra_gpu_data_handles.resize(shadows.len(), GpuCacheHandle::new());
+                for (shadow, extra_handle) in shadows.iter().zip(self.extra_gpu_data_handles.iter_mut()) {
+                    if let Some(mut request) = frame_state.gpu_cache.request(extra_handle) {
+                        let surface = &frame_state.surfaces[raster_config.surface_index.0];
+                        let prim_rect = surface.clipped_local_rect.cast_unit();
+
+                        // Basic brush primitive header is (see end of prepare_prim_for_render_inner in prim_store.rs)
+                        //  [brush specific data]
+                        //  [segment_rect, segment data]
+                        let (blur_inflation_x, blur_inflation_y) = surface.clamp_blur_radius(
+                            shadow.blur_radius,
+                            shadow.blur_radius,
+                        );
 
-                    let shadow_rect = prim_rect.inflate(
-                        blur_inflation_x * BLUR_SAMPLE_SCALE,
-                        blur_inflation_y * BLUR_SAMPLE_SCALE,
-                    ).translate(shadow.offset);
-
-                    // ImageBrush colors
-                    writer.push_one(shadow.color.premultiplied());
-                    writer.push_one(PremultipliedColorF::WHITE);
-                    writer.push_one([
-                        shadow_rect.width(),
-                        shadow_rect.height(),
-                        0.0,
-                        0.0,
-                    ]);
-
-                    // segment rect / extra data
-                    writer.push_one(shadow_rect);
-                    writer.push_one([0.0, 0.0, 0.0, 0.0]);
-
-                    *extra_handle = writer.finish();
+                        let shadow_rect = prim_rect.inflate(
+                            blur_inflation_x * BLUR_SAMPLE_SCALE,
+                            blur_inflation_y * BLUR_SAMPLE_SCALE,
+                        ).translate(shadow.offset);
+
+                        // ImageBrush colors
+                        request.push(shadow.color.premultiplied());
+                        request.push(PremultipliedColorF::WHITE);
+                        request.push([
+                            shadow_rect.width(),
+                            shadow_rect.height(),
+                            0.0,
+                            0.0,
+                        ]);
+
+                        // segment rect / extra data
+                        request.push(shadow_rect);
+                        request.push([0.0, 0.0, 0.0, 0.0]);
+                    }
                 }
             }
             PictureCompositeMode::Filter(ref filter) => {
                 match *filter {
                     Filter::ColorMatrix(ref m) => {
-                        if self.extra_gpu_data.is_empty() {
-                            self.extra_gpu_data.push(GpuBufferAddress::INVALID);
+                        if self.extra_gpu_data_handles.is_empty() {
+                            self.extra_gpu_data_handles.push(GpuCacheHandle::new());
                         }
-                        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(5);
-                        for i in 0..5 {
-                            writer.push_one([m[i*4], m[i*4+1], m[i*4+2], m[i*4+3]]);
+                        if let Some(mut request) = frame_state.gpu_cache.request(&mut self.extra_gpu_data_handles[0]) {
+                            for i in 0..5 {
+                                request.push([m[i*4], m[i*4+1], m[i*4+2], m[i*4+3]]);
+                            }
                         }
-                        self.extra_gpu_data[0] = writer.finish();
                     }
                     Filter::Flood(ref color) => {
-                        if self.extra_gpu_data.is_empty() {
-                            self.extra_gpu_data.push(GpuBufferAddress::INVALID);
+                        if self.extra_gpu_data_handles.is_empty() {
+                            self.extra_gpu_data_handles.push(GpuCacheHandle::new());
+                        }
+                        if let Some(mut request) = frame_state.gpu_cache.request(&mut self.extra_gpu_data_handles[0]) {
+                            request.push(color.to_array());
                         }
-                        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(1);
-                        writer.push_one(color.to_array());
-                        self.extra_gpu_data[0] = writer.finish();
                     }
                     _ => {}
                 }
             }
             PictureCompositeMode::ComponentTransferFilter(handle) => {
                 let filter_data = &mut data_stores.filter_data[handle];
-                filter_data.write_gpu_blocks(&mut frame_state.frame_gpu_data.f32);
+                filter_data.update(&mut frame_state.gpu_cache);
             }
             PictureCompositeMode::MixBlend(..) |
             PictureCompositeMode::Blit(_) |
@@ -7326,7 +7335,7 @@ impl PicturePrimitive {
                     match op {
                         FilterGraphOp::SVGFEComponentTransferInterned { handle, creates_pixels: _ } => {
                             let filter_data = &mut data_stores.filter_data[*handle];
-                            filter_data.write_gpu_blocks(&mut frame_state.frame_gpu_data.f32);
+                            filter_data.update(&mut frame_state.gpu_cache);
                         }
                         _ => {}
                     }
@@ -8613,7 +8622,7 @@ fn request_render_task(
     snapshot: &Option<SnapshotInfo>,
     surface_rects: &SurfaceAllocInfo,
     is_opaque: bool,
-    f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId,
+    f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId,
 ) -> RenderTaskId {
 
     let task_id = match snapshot {
@@ -8627,6 +8636,7 @@ fn request_render_task(
                 surface_rects.task_size,
                 frame_state.rg_builder,
                 &mut frame_state.frame_gpu_data.f32,
+                frame_state.gpu_cache,
                 is_opaque,
                 &adjustment,
                 f
@@ -8649,6 +8659,7 @@ fn request_render_task(
             f(
                 frame_state.rg_builder,
                 &mut frame_state.frame_gpu_data.f32,
+                frame_state.gpu_cache
             )
         }
     };
diff --git a/gfx/wr/webrender/src/picture_textures.rs b/gfx/wr/webrender/src/picture_textures.rs
@@ -13,8 +13,8 @@ use crate::internal_types::{
 };
 use crate::profiler::{self, TransactionProfile};
 use crate::gpu_types::{ImageSource, UvRectKind};
+use crate::gpu_cache::{GpuCache, GpuCacheHandle};
 use crate::freelist::{FreeList, FreeListHandle, WeakFreeListHandle};
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF};
 
 
 #[derive(Debug, PartialEq)]
@@ -42,23 +42,24 @@ pub struct PictureCacheEntry {
     //           in the glyph cache eviction code. We could probably remove it
     //           entirely in future (or move to EntryDetails::Picture).
     pub last_access: FrameStamp,
-    /// Handle to the resource rect in the float GPU buffer.
-    pub uv_rect_handle: GpuBufferAddress,
+    /// Handle to the resource rect in the GPU cache.
+    pub uv_rect_handle: GpuCacheHandle,
     /// The actual device texture ID this is part of.
     pub texture_id: CacheTextureId,
 }
 
 impl PictureCacheEntry {
-    fn write_gpu_blocks(&mut self, gpu_buffer: &mut GpuBufferBuilderF) {
-        let origin = DeviceIntPoint::zero();
-        let image_source = ImageSource {
-            p0: origin.to_f32(),
-            p1: (origin + self.size).to_f32(),
-            uv_rect_kind: UvRectKind::Rect,
-            user_data: [0.0; 4],
-        };
-
-        self.uv_rect_handle = image_source.write_gpu_blocks(gpu_buffer);
+    fn update_gpu_cache(&mut self, gpu_cache: &mut GpuCache) {
+        if let Some(mut request) = gpu_cache.request(&mut self.uv_rect_handle) {
+            let origin = DeviceIntPoint::zero();
+            let image_source = ImageSource {
+                p0: origin.to_f32(),
+                p1: (origin + self.size).to_f32(),
+                uv_rect_kind: UvRectKind::Rect,
+                user_data: [0.0; 4],
+            };
+            image_source.write_gpu_blocks(&mut request);
+        }
     }
 }
 
@@ -129,7 +130,7 @@ impl PictureTextures {
         &mut self,
         tile_size: DeviceIntSize,
         handle: &mut Option<PictureCacheTextureHandle>,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         next_texture_id: &mut CacheTextureId,
         pending_updates: &mut TextureUpdateList,
     ) {
@@ -159,7 +160,7 @@ impl PictureTextures {
             self.cache_entries
                 .get_opt_mut(handle)
                 .expect("BUG: handle must be valid now")
-                .write_gpu_blocks(gpu_buffer);
+                .update_gpu_cache(gpu_cache);
         } else {
             panic!("The handle should be valid picture cache handle now")
         }
@@ -218,7 +219,7 @@ impl PictureTextures {
         let cache_entry = PictureCacheEntry {
             size: tile_size,
             last_access: self.now,
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             texture_id,
         };
 
@@ -263,14 +264,14 @@ impl PictureTextures {
         }
     }
 
-    pub fn request(&mut self, handle: &PictureCacheTextureHandle, gpu_buffer: &mut GpuBufferBuilderF) -> bool {
+    pub fn request(&mut self, handle: &PictureCacheTextureHandle, gpu_cache: &mut GpuCache) -> bool {
         let entry = self.cache_entries.get_opt_mut(handle);
         let now = self.now;
         entry.map_or(true, |entry| {
             // If an image is requested that is already in the cache,
             // refresh the GPU cache data associated with this item.
             entry.last_access = now;
-            entry.write_gpu_blocks(gpu_buffer);
+            entry.update_gpu_cache(gpu_cache);
             false
         })
     }
diff --git a/gfx/wr/webrender/src/prepare.rs b/gfx/wr/webrender/src/prepare.rs
@@ -6,7 +6,7 @@
 //!
 //! TODO: document this!
 
-use api::{ColorF, DebugFlags};
+use api::{ColorF, DebugFlags, PropertyBinding};
 use api::{BoxShadowClipMode, BorderStyle, ClipMode};
 use api::units::*;
 use euclid::Scale;
@@ -17,10 +17,10 @@ use crate::image_tiling::{self, Repetition};
 use crate::border::{get_max_scale_for_border, build_border_instances};
 use crate::clip::{ClipStore, ClipNodeRange};
 use crate::pattern::Pattern;
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF, GpuBufferWriterF};
 use crate::spatial_tree::{SpatialNodeIndex, SpatialTree};
 use crate::clip::{ClipDataStore, ClipNodeFlags, ClipChainInstance, ClipItemKind};
 use crate::frame_builder::{FrameBuildingContext, FrameBuildingState, PictureContext, PictureState};
+use crate::gpu_cache::{GpuCacheHandle, GpuDataRequest};
 use crate::gpu_types::BrushFlags;
 use crate::internal_types::{FastHashMap, PlaneSplitAnchor, Filter};
 use crate::picture::{ClusterFlags, PictureCompositeMode, PicturePrimitive, SliceId};
@@ -438,10 +438,11 @@ fn prepare_interned_prim_for_render(
                     }),
                     false,
                     RenderTaskParent::Surface,
+                    frame_state.gpu_cache,
                     &mut frame_state.frame_gpu_data.f32,
                     frame_state.rg_builder,
                     &mut frame_state.surface_builder,
-                    &mut |rg_builder, _| {
+                    &mut |rg_builder, _, _| {
                         rg_builder.add().init(RenderTask::new_dynamic(
                             task_size,
                             RenderTaskKind::new_line_decoration(
@@ -513,11 +514,13 @@ fn prepare_interned_prim_for_render(
                 allow_subpixel,
                 frame_context.fb_config.low_quality_pinch_zoom,
                 frame_state.resource_cache,
-                &mut frame_state.frame_gpu_data.f32,
+                frame_state.gpu_cache,
                 frame_context.spatial_tree,
                 scratch,
             );
 
+            // Update the template this instance references, which may refresh the GPU
+            // cache with any shared template data.
             prim_data.update(frame_state);
         }
         PrimitiveInstanceKind::Clear { data_handle, .. } => {
@@ -589,10 +592,11 @@ fn prepare_interned_prim_for_render(
                     Some(cache_key),
                     false,          // TODO(gw): We don't calculate opacity for borders yet!
                     RenderTaskParent::Surface,
+                    frame_state.gpu_cache,
                     &mut frame_state.frame_gpu_data.f32,
                     frame_state.rg_builder,
                     &mut frame_state.surface_builder,
-                    &mut |rg_builder, _| {
+                    &mut |rg_builder, _, _| {
                         rg_builder.add().init(RenderTask::new_dynamic(
                             cache_size,
                             RenderTaskKind::new_border_segment(
@@ -626,13 +630,35 @@ fn prepare_interned_prim_for_render(
                 frame_state
             );
         }
-        PrimitiveInstanceKind::Rectangle { data_handle, segment_instance_index, use_legacy_path, .. } => {
+        PrimitiveInstanceKind::Rectangle { data_handle, segment_instance_index, color_binding_index, use_legacy_path, .. } => {
             profile_scope!("Rectangle");
 
             if *use_legacy_path {
                 let prim_data = &mut data_stores.prim[*data_handle];
                 prim_data.common.may_need_repetition = false;
 
+                // TODO(gw): Legacy rect rendering path - remove once we support masks on quad prims
+                if *color_binding_index != ColorBindingIndex::INVALID {
+                    match store.color_bindings[*color_binding_index] {
+                        PropertyBinding::Binding(..) => {
+                            // We explicitly invalidate the gpu cache
+                            // if the color is animating.
+                            let gpu_cache_handle =
+                                if *segment_instance_index == SegmentInstanceIndex::INVALID {
+                                    None
+                                } else if *segment_instance_index == SegmentInstanceIndex::UNUSED {
+                                    Some(&prim_data.common.gpu_cache_handle)
+                                } else {
+                                    Some(&scratch.segment_instances[*segment_instance_index].gpu_cache_handle)
+                                };
+                            if let Some(gpu_cache_handle) = gpu_cache_handle {
+                                frame_state.gpu_cache.invalidate(gpu_cache_handle);
+                            }
+                        }
+                        PropertyBinding::Value(..) => {},
+                    }
+                }
+
                 // Update the template this instane references, which may refresh the GPU
                 // cache with any shared template data.
                 prim_data.update(
@@ -695,8 +721,8 @@ fn prepare_interned_prim_for_render(
                 frame_state,
                 &mut scratch.segments,
                 &mut scratch.segment_instances,
-                |writer| {
-                    yuv_image_data.write_prim_gpu_blocks(writer);
+                |request| {
+                    yuv_image_data.write_prim_gpu_blocks(request);
                 }
             );
         }
@@ -778,22 +804,19 @@ fn prepare_interned_prim_for_render(
                     frame_state,
                     &mut scratch.gradient_tiles,
                     &frame_context.spatial_tree,
-                    Some(&mut |_, gpu_buffer| {
-                        let mut writer = gpu_buffer.write_blocks(2);
-                        writer.push_one([
+                    Some(&mut |_, mut request| {
+                        request.push([
                             prim_data.start_point.x,
                             prim_data.start_point.y,
                             prim_data.end_point.x,
                             prim_data.end_point.y,
                         ]);
-                        writer.push_one([
+                        request.push([
                             pack_as_float(prim_data.extend_mode as u32),
                             prim_data.stretch_size.width,
                             prim_data.stretch_size.height,
                             0.0,
                         ]);
-
-                        writer.finish()
                     }),
                 );
 
@@ -1210,22 +1233,23 @@ fn write_segment<F>(
     segments: &mut SegmentStorage,
     segment_instances: &mut SegmentInstanceStorage,
     f: F,
-) where F: Fn(&mut GpuBufferWriterF) {
+) where F: Fn(&mut GpuDataRequest) {
     debug_assert_ne!(segment_instance_index, SegmentInstanceIndex::INVALID);
     if segment_instance_index != SegmentInstanceIndex::UNUSED {
         let segment_instance = &mut segment_instances[segment_instance_index];
 
-        let segments = &segments[segment_instance.segments_range];
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + segments.len() * VECS_PER_SEGMENT);
+        if let Some(mut request) = frame_state.gpu_cache.request(&mut segment_instance.gpu_cache_handle) {
+            let segments = &segments[segment_instance.segments_range];
 
-        f(&mut writer);
+            f(&mut request);
 
-        for segment in segments {
-            writer.push_one(segment.local_rect);
-            writer.push_one([0.0; 4]);
+            for segment in segments {
+                request.write_segment(
+                    segment.local_rect,
+                    [0.0; 4],
+                );
+            }
         }
-
-        segment_instance.gpu_data = writer.finish();
     }
 }
 
@@ -1238,7 +1262,7 @@ fn decompose_repeated_gradient(
     frame_state: &mut FrameBuildingState,
     gradient_tiles: &mut GradientTileStorage,
     spatial_tree: &SpatialTree,
-    mut callback: Option<&mut dyn FnMut(&LayoutRect, &mut GpuBufferBuilderF) -> GpuBufferAddress>,
+    mut callback: Option<&mut dyn FnMut(&LayoutRect, GpuDataRequest)>,
 ) -> GradientTileRange {
     let tile_range = gradient_tiles.open_range();
 
@@ -1262,21 +1286,22 @@ fn decompose_repeated_gradient(
         let repetitions = image_tiling::repetitions(prim_local_rect, &visible_rect, stride);
         gradient_tiles.reserve(repetitions.num_repetitions());
         for Repetition { origin, .. } in repetitions {
+            let mut handle = GpuCacheHandle::new();
             let rect = LayoutRect::from_origin_and_size(
                 origin,
                 *stretch_size,
             );
 
-            let mut address = GpuBufferAddress::INVALID;
-
             if let Some(callback) = &mut callback {
-                address = callback(&rect, &mut frame_state.frame_gpu_data.f32);
+                if let Some(request) = frame_state.gpu_cache.request(&mut handle) {
+                    callback(&rect, request);
+                }
             }
 
             gradient_tiles.push(VisibleGradientTile {
                 local_rect: rect,
                 local_clip_rect: tight_clip_rect,
-                address,
+                handle
             });
         }
     }
@@ -1451,7 +1476,7 @@ fn update_clip_task_for_brush(
                     &pic_state.map_local_to_pic,
                     &pic_state.map_pic_to_vis,
                     &frame_context.spatial_tree,
-                    &mut frame_state.frame_gpu_data.f32,
+                    frame_state.gpu_cache,
                     frame_state.resource_cache,
                     device_pixel_scale,
                     &dirty_rect,
@@ -1548,6 +1573,7 @@ pub fn update_clip_task(
             instance.vis.clip_chain.clips_range,
             root_spatial_node_index,
             frame_state.clip_store,
+            frame_state.gpu_cache,
             &mut frame_state.frame_gpu_data.f32,
             frame_state.resource_cache,
             frame_state.rg_builder,
@@ -1613,6 +1639,7 @@ pub fn update_brush_segment_clip_task(
         clip_chain.clips_range,
         root_spatial_node_index,
         frame_state.clip_store,
+        frame_state.gpu_cache,
         &mut frame_state.frame_gpu_data.f32,
         frame_state.resource_cache,
         frame_state.rg_builder,
@@ -1827,7 +1854,7 @@ fn build_segments_if_needed(
 
             let instance = SegmentedInstance {
                 segments_range,
-                gpu_data: GpuBufferAddress::INVALID,
+                gpu_cache_handle: GpuCacheHandle::new(),
             };
 
             *segment_instance_index = segment_instances_store.push(instance);
diff --git a/gfx/wr/webrender/src/prim_store/borders.rs b/gfx/wr/webrender/src/prim_store/borders.rs
@@ -6,13 +6,16 @@ use api::{NormalBorder, PremultipliedColorF, Shadow, RasterSpace};
 use api::units::*;
 use crate::border::create_border_segments;
 use crate::border::NormalBorderAu;
-use crate::renderer::GpuBufferWriterF;
 use crate::scene_building::{CreateShadow, IsVisible};
 use crate::frame_builder::FrameBuildingState;
+use crate::gpu_cache::GpuDataRequest;
 use crate::intern;
 use crate::internal_types::{LayoutPrimitiveInfo, FrameId};
 use crate::prim_store::{
-    BorderSegmentInfo, BrushSegment, InternablePrimitive, NinePatchDescriptor, PrimKey, PrimTemplate, PrimTemplateCommonData, PrimitiveInstanceKind, PrimitiveOpacity, PrimitiveStore, VECS_PER_SEGMENT
+    BorderSegmentInfo, BrushSegment, NinePatchDescriptor, PrimKey,
+    PrimTemplate, PrimTemplateCommonData,
+    PrimitiveInstanceKind, PrimitiveOpacity,
+    PrimitiveStore, InternablePrimitive,
 };
 use crate::resource_cache::ImageRequest;
 use crate::render_task::RenderTask;
@@ -64,24 +67,25 @@ impl NormalBorderData {
         common: &mut PrimTemplateCommonData,
         frame_state: &mut FrameBuildingState,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT);
-        self.write_prim_gpu_blocks(&mut writer, common.prim_rect.size());
-        self.write_segment_gpu_blocks(&mut writer);
-        common.gpu_buffer_address = writer.finish();
+        if let Some(ref mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) {
+            self.write_prim_gpu_blocks(request, common.prim_rect.size());
+            self.write_segment_gpu_blocks(request);
+        }
+
         common.opacity = PrimitiveOpacity::translucent();
     }
 
     fn write_prim_gpu_blocks(
         &self,
-        writer: &mut GpuBufferWriterF,
+        request: &mut GpuDataRequest,
         prim_size: LayoutSize
     ) {
         // Border primitives currently used for
         // image borders, and run through the
         // normal brush_image shader.
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one([
+        request.push(PremultipliedColorF::WHITE);
+        request.push(PremultipliedColorF::WHITE);
+        request.push([
             prim_size.width,
             prim_size.height,
             0.0,
@@ -91,12 +95,14 @@ impl NormalBorderData {
 
     fn write_segment_gpu_blocks(
         &self,
-        writer: &mut GpuBufferWriterF,
+        request: &mut GpuDataRequest,
     ) {
         for segment in &self.brush_segments {
             // has to match VECS_PER_SEGMENT
-            writer.push_one(segment.local_rect);
-            writer.push_one(segment.extra_data);
+            request.write_segment(
+                segment.local_rect,
+                segment.extra_data,
+            );
         }
     }
 }
@@ -239,10 +245,10 @@ impl ImageBorderData {
         common: &mut PrimTemplateCommonData,
         frame_state: &mut FrameBuildingState,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT);
-        self.write_prim_gpu_blocks(&mut writer, &common.prim_rect.size());
-        self.write_segment_gpu_blocks(&mut writer);
-        common.gpu_buffer_address = writer.finish();
+        if let Some(ref mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) {
+            self.write_prim_gpu_blocks(request, &common.prim_rect.size());
+            self.write_segment_gpu_blocks(request);
+        }
 
         let frame_id = frame_state.rg_builder.frame_id();
         if self.frame_id != frame_id {
@@ -250,7 +256,7 @@ impl ImageBorderData {
 
             let size = frame_state.resource_cache.request_image(
                 self.request,
-                &mut frame_state.frame_gpu_data.f32,
+                frame_state.gpu_cache,
             );
 
             let task_id = frame_state.rg_builder.add().init(
@@ -273,15 +279,15 @@ impl ImageBorderData {
 
     fn write_prim_gpu_blocks(
         &self,
-        writer: &mut GpuBufferWriterF,
+        request: &mut GpuDataRequest,
         prim_size: &LayoutSize,
     ) {
         // Border primitives currently used for
         // image borders, and run through the
         // normal brush_image shader.
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one([
+        request.push(PremultipliedColorF::WHITE);
+        request.push(PremultipliedColorF::WHITE);
+        request.push([
             prim_size.width,
             prim_size.height,
             0.0,
@@ -291,12 +297,14 @@ impl ImageBorderData {
 
     fn write_segment_gpu_blocks(
         &self,
-        writer: &mut GpuBufferWriterF,
+        request: &mut GpuDataRequest,
     ) {
         for segment in &self.brush_segments {
             // has to match VECS_PER_SEGMENT
-            writer.push_one(segment.local_rect);
-            writer.push_one(segment.extra_data);
+            request.write_segment(
+                segment.local_rect,
+                segment.extra_data,
+            );
         }
     }
 }
@@ -369,9 +377,9 @@ fn test_struct_sizes() {
     // (b) You made a structure larger. This is not necessarily a problem, but should only
     //     be done with care, and after checking if talos performance regresses badly.
     assert_eq!(mem::size_of::<NormalBorderPrim>(), 84, "NormalBorderPrim size changed");
-    assert_eq!(mem::size_of::<NormalBorderTemplate>(), 208, "NormalBorderTemplate size changed");
+    assert_eq!(mem::size_of::<NormalBorderTemplate>(), 216, "NormalBorderTemplate size changed");
     assert_eq!(mem::size_of::<NormalBorderKey>(), 104, "NormalBorderKey size changed");
     assert_eq!(mem::size_of::<ImageBorder>(), 68, "ImageBorder size changed");
-    assert_eq!(mem::size_of::<ImageBorderTemplate>(), 96, "ImageBorderTemplate size changed");
+    assert_eq!(mem::size_of::<ImageBorderTemplate>(), 104, "ImageBorderTemplate size changed");
     assert_eq!(mem::size_of::<ImageBorderKey>(), 88, "ImageBorderKey size changed");
 }
diff --git a/gfx/wr/webrender/src/prim_store/gradient/conic.rs b/gfx/wr/webrender/src/prim_store/gradient/conic.rs
@@ -17,7 +17,7 @@ use crate::scene_building::IsVisible;
 use crate::frame_builder::FrameBuildingState;
 use crate::intern::{Internable, InternDebug, Handle as InternHandle};
 use crate::internal_types::LayoutPrimitiveInfo;
-use crate::prim_store::{BrushSegment, GradientTileRange, VECS_PER_SEGMENT};
+use crate::prim_store::{BrushSegment, GradientTileRange};
 use crate::prim_store::{PrimitiveInstanceKind, PrimitiveOpacity, FloatKey};
 use crate::prim_store::{PrimKeyCommonData, PrimTemplateCommonData, PrimitiveStore};
 use crate::prim_store::{NinePatchDescriptor, PointKey, SizeKey, InternablePrimitive};
@@ -261,23 +261,27 @@ impl ConicGradientTemplate {
         &mut self,
         frame_state: &mut FrameBuildingState,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT);
-        // write_prim_gpu_blocks
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one([
-            self.stretch_size.width,
-            self.stretch_size.height,
-            0.0,
-            0.0,
-        ]);
-        // write_segment_gpu_blocks
-        for segment in &self.brush_segments {
-            // has to match VECS_PER_SEGMENT
-            writer.push_one(segment.local_rect);
-            writer.push_one(segment.extra_data);
+        if let Some(mut request) =
+            frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) {
+            // write_prim_gpu_blocks
+            request.push(PremultipliedColorF::WHITE);
+            request.push(PremultipliedColorF::WHITE);
+            request.push([
+                self.stretch_size.width,
+                self.stretch_size.height,
+                0.0,
+                0.0,
+            ]);
+
+            // write_segment_gpu_blocks
+            for segment in &self.brush_segments {
+                // has to match VECS_PER_SEGMENT
+                request.write_segment(
+                    segment.local_rect,
+                    segment.extra_data,
+                );
+            }
         }
-        self.common.gpu_buffer_address = writer.finish();
 
         let cache_key = ConicGradientCacheKey {
             size: self.task_size,
@@ -297,10 +301,11 @@ impl ConicGradientTemplate {
             }),
             false,
             RenderTaskParent::Surface,
+            frame_state.gpu_cache,
             &mut frame_state.frame_gpu_data.f32,
             frame_state.rg_builder,
             &mut frame_state.surface_builder,
-            &mut |rg_builder, gpu_buffer_builder| {
+            &mut |rg_builder, gpu_buffer_builder, _| {
                 let stops = GradientGpuBlockBuilder::build(
                     false,
                     gpu_buffer_builder,
diff --git a/gfx/wr/webrender/src/prim_store/gradient/linear.rs b/gfx/wr/webrender/src/prim_store/gradient/linear.rs
@@ -19,7 +19,7 @@ use crate::frame_builder::FrameBuildingState;
 use crate::intern::{Internable, InternDebug, Handle as InternHandle};
 use crate::internal_types::LayoutPrimitiveInfo;
 use crate::image_tiling::simplify_repeated_primitive;
-use crate::prim_store::{BrushSegment, GradientTileRange, VECS_PER_SEGMENT};
+use crate::prim_store::{BrushSegment, GradientTileRange};
 use crate::prim_store::{PrimitiveInstanceKind, PrimitiveOpacity};
 use crate::prim_store::{PrimKeyCommonData, PrimTemplateCommonData, PrimitiveStore};
 use crate::prim_store::{NinePatchDescriptor, PointKey, SizeKey, InternablePrimitive};
@@ -494,44 +494,47 @@ impl LinearGradientTemplate {
         &mut self,
         frame_state: &mut FrameBuildingState,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT);
-
-        // Write_prim_gpu_blocks
-        if self.cached {
-            // We are using the image brush.
-            writer.push_one(PremultipliedColorF::WHITE);
-            writer.push_one(PremultipliedColorF::WHITE);
-            writer.push_one([
-                self.stretch_size.width,
-                self.stretch_size.height,
-                0.0,
-                0.0,
-            ]);
-        } else {
-            // We are using the gradient brush.
-            writer.push_one([
-                self.start_point.x,
-                self.start_point.y,
-                self.end_point.x,
-                self.end_point.y,
-            ]);
-            writer.push_one([
-                pack_as_float(self.extend_mode as u32),
-                self.stretch_size.width,
-                self.stretch_size.height,
-                0.0,
-            ]);
-        }
+        if let Some(mut request) = frame_state.gpu_cache.request(
+            &mut self.common.gpu_cache_handle
+        ) {
+
+            // Write_prim_gpu_blocks
+            if self.cached {
+                // We are using the image brush.
+                request.push(PremultipliedColorF::WHITE);
+                request.push(PremultipliedColorF::WHITE);
+                request.push([
+                    self.stretch_size.width,
+                    self.stretch_size.height,
+                    0.0,
+                    0.0,
+                ]);
+            } else {
+                // We are using the gradient brush.
+                request.push([
+                    self.start_point.x,
+                    self.start_point.y,
+                    self.end_point.x,
+                    self.end_point.y,
+                ]);
+                request.push([
+                    pack_as_float(self.extend_mode as u32),
+                    self.stretch_size.width,
+                    self.stretch_size.height,
+                    0.0,
+                ]);
+            }
 
-        // write_segment_gpu_blocks
-        for segment in &self.brush_segments {
-            // has to match VECS_PER_SEGMENT
-            writer.push_one(segment.local_rect);
-            writer.push_one(segment.extra_data);
+            // write_segment_gpu_blocks
+            for segment in &self.brush_segments {
+                // has to match VECS_PER_SEGMENT
+                request.write_segment(
+                    segment.local_rect,
+                    segment.extra_data,
+                );
+            }
         }
 
-        self.common.gpu_buffer_address = writer.finish();
-
         // Tile spacing is always handled by decomposing into separate draw calls so the
         // primitive opacity is equivalent to stops opacity. This might change to being
         // set to non-opaque in the presence of tile spacing if/when tile spacing is handled
@@ -562,10 +565,11 @@ impl LinearGradientTemplate {
                 }),
                 false,
                 RenderTaskParent::Surface,
+                frame_state.gpu_cache,
                 &mut frame_state.frame_gpu_data.f32,
                 frame_state.rg_builder,
                 &mut frame_state.surface_builder,
-                &mut |rg_builder, _| {
+                &mut |rg_builder, _, _| {
                     rg_builder.add().init(RenderTask::new_dynamic(
                         self.task_size,
                         RenderTaskKind::FastLinearGradient(gradient),
@@ -590,10 +594,11 @@ impl LinearGradientTemplate {
                 }),
                 false,
                 RenderTaskParent::Surface,
+                frame_state.gpu_cache,
                 &mut frame_state.frame_gpu_data.f32,
                 frame_state.rg_builder,
                 &mut frame_state.surface_builder,
-                &mut |rg_builder, gpu_buffer_builder| {
+                &mut |rg_builder, gpu_buffer_builder, _| {
                     let stops = Some(GradientGpuBlockBuilder::build(
                         self.reverse_stops,
                         gpu_buffer_builder,
diff --git a/gfx/wr/webrender/src/prim_store/gradient/mod.rs b/gfx/wr/webrender/src/prim_store/gradient/mod.rs
@@ -590,14 +590,14 @@ fn test_struct_sizes() {
     // (b) You made a structure larger. This is not necessarily a problem, but should only
     //     be done with care, and after checking if talos performance regresses badly.
     assert_eq!(mem::size_of::<LinearGradient>(), 72, "LinearGradient size changed");
-    assert_eq!(mem::size_of::<LinearGradientTemplate>(), 136, "LinearGradientTemplate size changed");
+    assert_eq!(mem::size_of::<LinearGradientTemplate>(), 144, "LinearGradientTemplate size changed");
     assert_eq!(mem::size_of::<LinearGradientKey>(), 96, "LinearGradientKey size changed");
 
     assert_eq!(mem::size_of::<RadialGradient>(), 72, "RadialGradient size changed");
-    assert_eq!(mem::size_of::<RadialGradientTemplate>(), 136, "RadialGradientTemplate size changed");
+    assert_eq!(mem::size_of::<RadialGradientTemplate>(), 144, "RadialGradientTemplate size changed");
     assert_eq!(mem::size_of::<RadialGradientKey>(), 96, "RadialGradientKey size changed");
 
     assert_eq!(mem::size_of::<ConicGradient>(), 72, "ConicGradient size changed");
-    assert_eq!(mem::size_of::<ConicGradientTemplate>(), 136, "ConicGradientTemplate size changed");
+    assert_eq!(mem::size_of::<ConicGradientTemplate>(), 144, "ConicGradientTemplate size changed");
     assert_eq!(mem::size_of::<ConicGradientKey>(), 96, "ConicGradientKey size changed");
 }
diff --git a/gfx/wr/webrender/src/prim_store/gradient/radial.rs b/gfx/wr/webrender/src/prim_store/gradient/radial.rs
@@ -17,7 +17,7 @@ use crate::scene_building::IsVisible;
 use crate::frame_builder::FrameBuildingState;
 use crate::intern::{Internable, InternDebug, Handle as InternHandle};
 use crate::internal_types::LayoutPrimitiveInfo;
-use crate::prim_store::{BrushSegment, GradientTileRange, InternablePrimitive, VECS_PER_SEGMENT};
+use crate::prim_store::{BrushSegment, GradientTileRange, InternablePrimitive};
 use crate::prim_store::{PrimitiveInstanceKind, PrimitiveOpacity};
 use crate::prim_store::{PrimKeyCommonData, PrimTemplateCommonData, PrimitiveStore};
 use crate::prim_store::{NinePatchDescriptor, PointKey, SizeKey, FloatKey};
@@ -228,24 +228,27 @@ impl RadialGradientTemplate {
         &mut self,
         frame_state: &mut FrameBuildingState,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3 + self.brush_segments.len() * VECS_PER_SEGMENT);
-
-        // write_prim_gpu_blocks
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one([
-            self.stretch_size.width,
-            self.stretch_size.height,
-            0.0,
-            0.0,
-        ]);
-        // write_segment_gpu_blocks
-        for segment in &self.brush_segments {
-            // has to match VECS_PER_SEGMENT
-            writer.push_one(segment.local_rect);
-            writer.push_one(segment.extra_data);
+        if let Some(mut request) =
+            frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) {
+            // write_prim_gpu_blocks
+            request.push(PremultipliedColorF::WHITE);
+            request.push(PremultipliedColorF::WHITE);
+            request.push([
+                self.stretch_size.width,
+                self.stretch_size.height,
+                0.0,
+                0.0,
+            ]);
+
+            // write_segment_gpu_blocks
+            for segment in &self.brush_segments {
+                // has to match VECS_PER_SEGMENT
+                request.write_segment(
+                    segment.local_rect,
+                    segment.extra_data,
+                );
+            }
         }
-        self.common.gpu_buffer_address = writer.finish();
 
         let task_size = self.task_size;
         let cache_key = RadialGradientCacheKey {
@@ -266,10 +269,11 @@ impl RadialGradientTemplate {
             }),
             false,
             RenderTaskParent::Surface,
+            frame_state.gpu_cache,
             &mut frame_state.frame_gpu_data.f32,
             frame_state.rg_builder,
             &mut frame_state.surface_builder,
-            &mut |rg_builder, gpu_buffer_builder| {
+            &mut |rg_builder, gpu_buffer_builder, _| {
                 let stops = GradientGpuBlockBuilder::build(
                     false,
                     gpu_buffer_builder,
diff --git a/gfx/wr/webrender/src/prim_store/image.rs b/gfx/wr/webrender/src/prim_store/image.rs
@@ -10,9 +10,9 @@ use api::{
 use api::units::*;
 use euclid::point2;
 use crate::composite::CompositorSurfaceKind;
-use crate::renderer::{GpuBufferBuilderF, GpuBufferWriterF};
 use crate::scene_building::{CreateShadow, IsVisible};
 use crate::frame_builder::{FrameBuildingContext, FrameBuildingState};
+use crate::gpu_cache::{GpuCache, GpuDataRequest};
 use crate::intern::{Internable, InternDebug, Handle as InternHandle};
 use crate::internal_types::LayoutPrimitiveInfo;
 use crate::prim_store::{
@@ -192,7 +192,7 @@ impl ImageData {
 
                 let mut size = frame_state.resource_cache.request_image(
                     request,
-                    &mut frame_state.frame_gpu_data.f32,
+                    frame_state.gpu_cache,
                 );
 
                 let mut task_id = frame_state.rg_builder.add().init(
@@ -274,10 +274,11 @@ impl ImageData {
                         }),
                         descriptor.is_opaque(),
                         RenderTaskParent::Surface,
+                        frame_state.gpu_cache,
                         &mut frame_state.frame_gpu_data.f32,
                         frame_state.rg_builder,
                         &mut frame_state.surface_builder,
-                        &mut |rg_builder, _| {
+                        &mut |rg_builder, _, _| {
                             // Create a task to blit from the texture cache to
                             // a normal transient render task surface.
                             // TODO: figure out if/when we can do a blit instead.
@@ -356,7 +357,7 @@ impl ImageData {
                         let request = request.with_tile(tile.offset);
                         let size = frame_state.resource_cache.request_image(
                             request,
-                            &mut frame_state.frame_gpu_data.f32,
+                            frame_state.gpu_cache,
                         );
 
                         let task_id = frame_state.rg_builder.add().init(
@@ -389,19 +390,19 @@ impl ImageData {
             );
         }
 
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3);
-        self.write_prim_gpu_blocks(&image_instance.adjustment, &mut writer);
-        common.gpu_buffer_address = writer.finish();
+        if let Some(mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) {
+            self.write_prim_gpu_blocks(&image_instance.adjustment, &mut request);
+        }
     }
 
-    pub fn write_prim_gpu_blocks(&self, adjustment: &AdjustedImageSource, writer: &mut GpuBufferWriterF) {
+    pub fn write_prim_gpu_blocks(&self, adjustment: &AdjustedImageSource, request: &mut GpuDataRequest) {
         let stretch_size = adjustment.map_stretch_size(self.stretch_size);
         // Images are drawn as a white color, modulated by the total
         // opacity coming from any collapsed property bindings.
         // Size has to match `VECS_PER_SPECIFIC_BRUSH` from `brush_image.glsl` exactly.
-        writer.push_one(self.color.premultiplied());
-        writer.push_one(PremultipliedColorF::WHITE);
-        writer.push_one([
+        request.push(self.color.premultiplied());
+        request.push(PremultipliedColorF::WHITE);
+        request.push([
             stretch_size.width + self.tile_spacing.width,
             stretch_size.height + self.tile_spacing.height,
             0.0,
@@ -672,7 +673,7 @@ impl YuvImageData {
 
             let size = frame_state.resource_cache.request_image(
                 request,
-                &mut frame_state.frame_gpu_data.f32,
+                frame_state.gpu_cache,
             );
 
             let task_id = frame_state.rg_builder.add().init(
@@ -686,18 +687,18 @@ impl YuvImageData {
             self.src_yuv[channel] = Some(task_id);
         }
 
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(1);
-        self.write_prim_gpu_blocks(&mut writer);
-        common.gpu_buffer_address = writer.finish();
+        if let Some(mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) {
+            self.write_prim_gpu_blocks(&mut request);
+        };
 
-    // YUV images never have transparency
+        // YUV images never have transparency
         common.opacity = PrimitiveOpacity::opaque();
     }
 
     pub fn request_resources(
         &mut self,
         resource_cache: &mut ResourceCache,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
     ) {
         let channel_num = self.format.get_plane_num();
         debug_assert!(channel_num <= 3);
@@ -708,14 +709,14 @@ impl YuvImageData {
                     rendering: self.image_rendering,
                     tile: None,
                 },
-                gpu_buffer,
+                gpu_cache,
             );
         }
     }
 
-    pub fn write_prim_gpu_blocks(&self, writer: &mut GpuBufferWriterF) {
+    pub fn write_prim_gpu_blocks(&self, request: &mut GpuDataRequest) {
         let ranged_color_space = self.color_space.with_range(self.color_range);
-        writer.push_one([
+        request.push([
             pack_as_float(self.color_depth.bit_depth()),
             pack_as_float(ranged_color_space as u32),
             pack_as_float(self.format as u32),
@@ -784,9 +785,9 @@ fn test_struct_sizes() {
     // (b) You made a structure larger. This is not necessarily a problem, but should only
     //     be done with care, and after checking if talos performance regresses badly.
     assert_eq!(mem::size_of::<Image>(), 32, "Image size changed");
-    assert_eq!(mem::size_of::<ImageTemplate>(), 68, "ImageTemplate size changed");
+    assert_eq!(mem::size_of::<ImageTemplate>(), 72, "ImageTemplate size changed");
     assert_eq!(mem::size_of::<ImageKey>(), 52, "ImageKey size changed");
     assert_eq!(mem::size_of::<YuvImage>(), 32, "YuvImage size changed");
-    assert_eq!(mem::size_of::<YuvImageTemplate>(), 80, "YuvImageTemplate size changed");
+    assert_eq!(mem::size_of::<YuvImageTemplate>(), 84, "YuvImageTemplate size changed");
     assert_eq!(mem::size_of::<YuvImageKey>(), 52, "YuvImageKey size changed");
 }
diff --git a/gfx/wr/webrender/src/prim_store/line_dec.rs b/gfx/wr/webrender/src/prim_store/line_dec.rs
@@ -7,9 +7,9 @@ use api::{
     LineOrientation, LineStyle, PremultipliedColorF, Shadow,
 };
 use api::units::*;
-use crate::renderer::GpuBufferWriterF;
 use crate::scene_building::{CreateShadow, IsVisible};
 use crate::frame_builder::FrameBuildingState;
+use crate::gpu_cache::GpuDataRequest;
 use crate::intern;
 use crate::internal_types::LayoutPrimitiveInfo;
 use crate::prim_store::{
@@ -78,20 +78,20 @@ impl LineDecorationData {
         common: &mut PrimTemplateCommonData,
         frame_state: &mut FrameBuildingState,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(3);
-        self.write_prim_gpu_blocks(&mut writer);
-        common.gpu_buffer_address = writer.finish();
+        if let Some(ref mut request) = frame_state.gpu_cache.request(&mut common.gpu_cache_handle) {
+            self.write_prim_gpu_blocks(request);
+        }
     }
 
     fn write_prim_gpu_blocks(
         &self,
-        writer: &mut GpuBufferWriterF
+        request: &mut GpuDataRequest
     ) {
         match self.cache_key.as_ref() {
             Some(cache_key) => {
-                writer.push_one(self.color.premultiplied());
-                writer.push_one(PremultipliedColorF::WHITE);
-                writer.push_one([
+                request.push(self.color.premultiplied());
+                request.push(PremultipliedColorF::WHITE);
+                request.push([
                     cache_key.size.width.to_f32_px(),
                     cache_key.size.height.to_f32_px(),
                     0.0,
@@ -99,7 +99,7 @@ impl LineDecorationData {
                 ]);
             }
             None => {
-                writer.push_one(self.color.premultiplied());
+                request.push(self.color.premultiplied());
             }
         }
     }
@@ -251,6 +251,6 @@ fn test_struct_sizes() {
     // (b) You made a structure larger. This is not necessarily a problem, but should only
     //     be done with care, and after checking if talos performance regresses badly.
     assert_eq!(mem::size_of::<LineDecoration>(), 20, "LineDecoration size changed");
-    assert_eq!(mem::size_of::<LineDecorationTemplate>(), 56, "LineDecorationTemplate size changed");
+    assert_eq!(mem::size_of::<LineDecorationTemplate>(), 60, "LineDecorationTemplate size changed");
     assert_eq!(mem::size_of::<LineDecorationKey>(), 40, "LineDecorationKey size changed");
 }
diff --git a/gfx/wr/webrender/src/prim_store/mod.rs b/gfx/wr/webrender/src/prim_store/mod.rs
@@ -13,7 +13,6 @@ use crate::composite::CompositorSurfaceKind;
 use crate::clip::ClipLeafId;
 use crate::pattern::{Pattern, PatternBuilder, PatternBuilderContext, PatternBuilderState};
 use crate::quad::QuadTileClassifier;
-use crate::renderer::{GpuBufferAddress, GpuBufferWriterF};
 use crate::segment::EdgeAaSegmentMask;
 use crate::border::BorderSegmentCacheKey;
 use crate::debug_item::{DebugItem, DebugMessage};
@@ -21,6 +20,7 @@ use crate::debug_colors;
 use crate::scene_building::{CreateShadow, IsVisible};
 use crate::frame_builder::FrameBuildingState;
 use glyph_rasterizer::GlyphKey;
+use crate::gpu_cache::{GpuCacheAddress, GpuCacheHandle, GpuDataRequest};
 use crate::gpu_types::{BrushFlags, QuadSegment};
 use crate::intern;
 use crate::picture::PicturePrimitive;
@@ -90,7 +90,7 @@ impl PrimitiveOpacity {
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct DeferredResolve {
-    pub address: GpuBufferAddress,
+    pub address: GpuCacheAddress,
     pub image_properties: ImageProperties,
     pub rendering: ImageRendering,
     pub is_composited: bool,
@@ -488,16 +488,16 @@ impl PrimitiveTemplateKind {
     /// Write any GPU blocks for the primitive template to the given request object.
     pub fn write_prim_gpu_blocks(
         &self,
-        writer: &mut GpuBufferWriterF,
+        request: &mut GpuDataRequest,
         scene_properties: &SceneProperties,
     ) {
         match *self {
             PrimitiveTemplateKind::Clear => {
                 // Opaque black with operator dest out
-                writer.push_one(PremultipliedColorF::BLACK);
+                request.push(PremultipliedColorF::BLACK);
             }
             PrimitiveTemplateKind::Rectangle { ref color, .. } => {
-                writer.push_one(scene_properties.resolve_color(color).premultiplied())
+                request.push(scene_properties.resolve_color(color).premultiplied())
             }
         }
     }
@@ -530,12 +530,11 @@ pub struct PrimTemplateCommonData {
     pub may_need_repetition: bool,
     pub prim_rect: LayoutRect,
     pub opacity: PrimitiveOpacity,
-    /// Address of the per-primitive data in the GPU cache.
-    ///
-    /// TODO: This is only valid during the current frame and must
-    /// be overwritten each frame. We should move this out of the
-    /// common data to avoid accidental reuse.
-    pub gpu_buffer_address: GpuBufferAddress,
+    /// The GPU cache handle for a primitive template. Since this structure
+    /// is retained across display lists by interning, this GPU cache handle
+    /// also remains valid, which reduces the number of updates to the GPU
+    /// cache when a new display list is processed.
+    pub gpu_cache_handle: GpuCacheHandle,
     /// Specifies the edges that are *allowed* to have anti-aliasing.
     /// In other words EdgeAaSegmentFlags::all() does not necessarily mean all edges will
     /// be anti-aliased, only that they could be.
@@ -550,7 +549,7 @@ impl PrimTemplateCommonData {
             flags: common.flags,
             may_need_repetition: true,
             prim_rect: common.prim_rect.into(),
-            gpu_buffer_address: GpuBufferAddress::INVALID,
+            gpu_cache_handle: GpuCacheHandle::new(),
             opacity: PrimitiveOpacity::translucent(),
             edge_aa_mask: EdgeAaSegmentMask::all(),
         }
@@ -640,9 +639,9 @@ impl PrimitiveTemplate {
         frame_state: &mut FrameBuildingState,
         scene_properties: &SceneProperties,
     ) {
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(1);
-        self.kind.write_prim_gpu_blocks(&mut writer, scene_properties);
-        self.common.gpu_buffer_address = writer.finish();
+        if let Some(mut request) = frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) {
+            self.kind.write_prim_gpu_blocks(&mut request, scene_properties);
+        }
 
         self.opacity = match self.kind {
             PrimitiveTemplateKind::Clear => {
@@ -713,7 +712,7 @@ pub struct VisibleMaskImageTile {
 #[derive(Debug)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 pub struct VisibleGradientTile {
-    pub address: GpuBufferAddress,
+    pub handle: GpuCacheHandle,
     pub local_rect: LayoutRect,
     pub local_clip_rect: LayoutRect,
 }
@@ -1203,7 +1202,7 @@ impl PrimitiveInstance {
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[derive(Debug)]
 pub struct SegmentedInstance {
-    pub gpu_data: GpuBufferAddress,
+    pub gpu_cache_handle: GpuCacheHandle,
     pub segments_range: SegmentsRange,
 }
 
@@ -1556,7 +1555,7 @@ fn test_struct_sizes() {
     //     be done with care, and after checking if talos performance regresses badly.
     assert_eq!(mem::size_of::<PrimitiveInstance>(), 88, "PrimitiveInstance size changed");
     assert_eq!(mem::size_of::<PrimitiveInstanceKind>(), 24, "PrimitiveInstanceKind size changed");
-    assert_eq!(mem::size_of::<PrimitiveTemplate>(), 52, "PrimitiveTemplate size changed");
+    assert_eq!(mem::size_of::<PrimitiveTemplate>(), 56, "PrimitiveTemplate size changed");
     assert_eq!(mem::size_of::<PrimitiveTemplateKind>(), 28, "PrimitiveTemplateKind size changed");
     assert_eq!(mem::size_of::<PrimitiveKey>(), 36, "PrimitiveKey size changed");
     assert_eq!(mem::size_of::<PrimitiveKeyKind>(), 16, "PrimitiveKeyKind size changed");
diff --git a/gfx/wr/webrender/src/prim_store/text_run.rs b/gfx/wr/webrender/src/prim_store/text_run.rs
@@ -8,12 +8,13 @@ use api::units::*;
 use crate::scene_building::{CreateShadow, IsVisible};
 use crate::frame_builder::FrameBuildingState;
 use glyph_rasterizer::{FontInstance, FontTransform, GlyphKey, FONT_SIZE_LIMIT};
+use crate::gpu_cache::GpuCache;
 use crate::intern;
 use crate::internal_types::LayoutPrimitiveInfo;
 use crate::picture::SurfaceInfo;
 use crate::prim_store::{PrimitiveOpacity,  PrimitiveScratchBuffer};
 use crate::prim_store::{PrimitiveStore, PrimKeyCommonData, PrimTemplateCommonData};
-use crate::renderer::{GpuBufferBuilderF, MAX_VERTEX_TEXTURE_WIDTH};
+use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH;
 use crate::resource_cache::ResourceCache;
 use crate::util::MatrixHelpers;
 use crate::prim_store::{InternablePrimitive, PrimitiveInstanceKind, LayoutPointAu};
@@ -135,32 +136,32 @@ impl TextRunTemplate {
         &mut self,
         frame_state: &mut FrameBuildingState,
     ) {
-        // Corresponds to `fetch_glyph` in the shaders.
-        let num_blocks = (self.glyphs.len() + 1) / 2 + 1;
-        assert!(num_blocks <= MAX_VERTEX_TEXTURE_WIDTH);
-        let mut writer = frame_state.frame_gpu_data.f32.write_blocks(num_blocks);
-        writer.push_one(ColorF::from(self.font.color).premultiplied());
-
-        let mut gpu_block = [0.0; 4];
-        for (i, src) in self.glyphs.iter().enumerate() {
-            // Two glyphs are packed per GPU block.
-            if (i & 1) == 0 {
-                gpu_block[0] = src.point.x;
-                gpu_block[1] = src.point.y;
-            } else {
-                gpu_block[2] = src.point.x;
-                gpu_block[3] = src.point.y;
-                writer.push_one(gpu_block);
+        // corresponds to `fetch_glyph` in the shaders
+        if let Some(mut request) = frame_state.gpu_cache.request(&mut self.common.gpu_cache_handle) {
+            request.push(ColorF::from(self.font.color).premultiplied());
+
+            let mut gpu_block = [0.0; 4];
+            for (i, src) in self.glyphs.iter().enumerate() {
+                // Two glyphs are packed per GPU block.
+
+                if (i & 1) == 0 {
+                    gpu_block[0] = src.point.x;
+                    gpu_block[1] = src.point.y;
+                } else {
+                    gpu_block[2] = src.point.x;
+                    gpu_block[3] = src.point.y;
+                    request.push(gpu_block);
+                }
             }
-        }
 
-        // Ensure the last block is added in the case
-        // of an odd number of glyphs.
-        if (self.glyphs.len() & 1) != 0 {
-            writer.push_one(gpu_block);
-        }
+            // Ensure the last block is added in the case
+            // of an odd number of glyphs.
+            if (self.glyphs.len() & 1) != 0 {
+                request.push(gpu_block);
+            }
 
-        self.common.gpu_buffer_address = writer.finish();
+            assert!(request.current_used_block_num() <= MAX_VERTEX_TEXTURE_WIDTH);
+        }
     }
 }
 
@@ -465,7 +466,7 @@ impl TextRunPrimitive {
         allow_subpixel: bool,
         low_quality_pinch_zoom: bool,
         resource_cache: &mut ResourceCache,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         spatial_tree: &SpatialTree,
         scratch: &mut PrimitiveScratchBuffer,
     ) {
@@ -506,7 +507,7 @@ impl TextRunPrimitive {
         resource_cache.request_glyphs(
             self.used_font.clone(),
             &scratch.glyph_keys[self.glyph_keys_range],
-            gpu_buffer,
+            gpu_cache,
         );
     }
 }
@@ -523,7 +524,7 @@ fn test_struct_sizes() {
     // (b) You made a structure larger. This is not necessarily a problem, but should only
     //     be done with care, and after checking if talos performance regresses badly.
     assert_eq!(mem::size_of::<TextRun>(), 88, "TextRun size changed");
-    assert_eq!(mem::size_of::<TextRunTemplate>(), 88, "TextRunTemplate size changed");
+    assert_eq!(mem::size_of::<TextRunTemplate>(), 96, "TextRunTemplate size changed");
     assert_eq!(mem::size_of::<TextRunKey>(), 104, "TextRunKey size changed");
     assert_eq!(mem::size_of::<TextRunPrimitive>(), 80, "TextRunPrimitive size changed");
 }
diff --git a/gfx/wr/webrender/src/profiler.rs b/gfx/wr/webrender/src/profiler.rs
@@ -27,7 +27,7 @@ use crate::renderer::DebugRenderer;
 use crate::device::query::GpuTimer;
 use euclid::{Point2D, Rect, Size2D, vec2, default};
 use crate::internal_types::FastHashMap;
-use crate::renderer::{FullFrameStats, init::wr_has_been_initialized};
+use crate::renderer::{FullFrameStats, MAX_VERTEX_TEXTURE_WIDTH, init::wr_has_been_initialized};
 use api::units::DeviceIntSize;
 use std::collections::vec_deque::VecDeque;
 use std::fmt::{Write, Debug};
@@ -150,134 +150,144 @@ pub const UPLOAD_NUM_COPY_BATCHES: usize = 23;
 pub const TOTAL_UPLOAD_TIME: usize = 24;
 pub const CREATE_CACHE_TEXTURE_TIME: usize = 25;
 pub const DELETE_CACHE_TEXTURE_TIME: usize = 26;
+pub const GPU_CACHE_UPLOAD_TIME: usize = 27;
 
-pub const RASTERIZED_BLOBS: usize = 27;
-pub const RASTERIZED_BLOB_TILES: usize = 28;
-pub const RASTERIZED_BLOBS_PX: usize = 29;
-pub const BLOB_RASTERIZATION_TIME: usize = 30;
+pub const RASTERIZED_BLOBS: usize = 28;
+pub const RASTERIZED_BLOB_TILES: usize = 29;
+pub const RASTERIZED_BLOBS_PX: usize = 30;
+pub const BLOB_RASTERIZATION_TIME: usize = 31;
 
-pub const RASTERIZED_GLYPHS: usize = 31;
-pub const GLYPH_RESOLVE_TIME: usize = 32;
+pub const RASTERIZED_GLYPHS: usize = 32;
+pub const GLYPH_RESOLVE_TIME: usize = 33;
 
-pub const DRAW_CALLS: usize = 33;
-pub const VERTICES: usize = 34;
-pub const PRIMITIVES: usize = 35;
-pub const VISIBLE_PRIMITIVES: usize = 36;
+pub const DRAW_CALLS: usize = 34;
+pub const VERTICES: usize = 35;
+pub const PRIMITIVES: usize = 36;
+pub const VISIBLE_PRIMITIVES: usize = 37;
 
-pub const USED_TARGETS: usize = 37;
-pub const CREATED_TARGETS: usize = 38;
-pub const PICTURE_CACHE_SLICES: usize = 39;
+pub const USED_TARGETS: usize = 38;
+pub const CREATED_TARGETS: usize = 39;
+pub const PICTURE_CACHE_SLICES: usize = 40;
 
-pub const COLOR_PASSES: usize = 40;
-pub const ALPHA_PASSES: usize = 41;
-pub const PICTURE_TILES: usize = 42;
-pub const RENDERED_PICTURE_TILES: usize = 43;
+pub const COLOR_PASSES: usize = 41;
+pub const ALPHA_PASSES: usize = 42;
+pub const PICTURE_TILES: usize = 43;
+pub const RENDERED_PICTURE_TILES: usize = 44;
 
-pub const FONT_TEMPLATES: usize = 44;
-pub const FONT_TEMPLATES_MEM: usize = 45;
-pub const IMAGE_TEMPLATES: usize = 46;
-pub const IMAGE_TEMPLATES_MEM: usize = 47;
+pub const FONT_TEMPLATES: usize = 45;
+pub const FONT_TEMPLATES_MEM: usize = 46;
+pub const IMAGE_TEMPLATES: usize = 47;
+pub const IMAGE_TEMPLATES_MEM: usize = 48;
+
+pub const GPU_CACHE_ROWS_TOTAL: usize = 49;
+pub const GPU_CACHE_ROWS_UPDATED: usize = 50;
+pub const GPU_CACHE_BLOCKS_TOTAL: usize = 51;
+pub const GPU_CACHE_BLOCKS_UPDATED: usize = 52;
+pub const GPU_CACHE_BLOCKS_SAVED: usize = 53;
 
 // Atlas items represents the area occupied by items in the cache textures.
 // The actual texture memory allocated is ATLAS_TEXTURES_MEM.
-pub const ATLAS_ITEMS_MEM: usize = 48;
-pub const ATLAS_A8_PIXELS: usize = 49;
-pub const ATLAS_A8_TEXTURES: usize = 50;
-pub const ATLAS_A16_PIXELS: usize = 51;
-pub const ATLAS_A16_TEXTURES: usize = 52;
-pub const ATLAS_RGBA8_LINEAR_PIXELS: usize = 53;
-pub const ATLAS_RGBA8_LINEAR_TEXTURES: usize = 54;
-pub const ATLAS_RGBA8_NEAREST_PIXELS: usize = 55;
-pub const ATLAS_RGBA8_NEAREST_TEXTURES: usize = 56;
-pub const ATLAS_RGBA8_GLYPHS_PIXELS: usize = 57;
-pub const ATLAS_RGBA8_GLYPHS_TEXTURES: usize = 58;
-pub const ATLAS_A8_GLYPHS_PIXELS: usize = 59;
-pub const ATLAS_A8_GLYPHS_TEXTURES: usize = 60;
-pub const ATLAS_COLOR8_LINEAR_PRESSURE: usize = 61;
-pub const ATLAS_COLOR8_NEAREST_PRESSURE: usize = 62;
-pub const ATLAS_COLOR8_GLYPHS_PRESSURE: usize = 63;
-pub const ATLAS_ALPHA8_PRESSURE: usize = 64;
-pub const ATLAS_ALPHA8_GLYPHS_PRESSURE: usize = 65;
-pub const ATLAS_ALPHA16_PRESSURE: usize = 66;
-pub const ATLAS_STANDALONE_PRESSURE: usize = 67;
-
-pub const TEXTURE_CACHE_EVICTION_COUNT: usize = 68;
-pub const TEXTURE_CACHE_YOUNGEST_EVICTION: usize = 69;
-pub const EXTERNAL_IMAGE_BYTES: usize = 70;
-pub const ATLAS_TEXTURES_MEM: usize = 71;
-pub const STANDALONE_TEXTURES_MEM: usize = 72;
-pub const PICTURE_TILES_MEM: usize = 73;
-pub const RENDER_TARGET_MEM: usize = 74;
-
-pub const ALPHA_TARGETS_SAMPLERS: usize = 75;
-pub const TRANSPARENT_PASS_SAMPLERS: usize = 76;
-pub const OPAQUE_PASS_SAMPLERS: usize = 77;
-pub const TOTAL_SAMPLERS: usize = 78;
-
-pub const INTERNED_PRIMITIVES: usize = 79;
-pub const INTERNED_CLIPS: usize = 80;
-pub const INTERNED_TEXT_RUNS: usize = 81;
-pub const INTERNED_NORMAL_BORDERS: usize = 82;
-pub const INTERNED_IMAGE_BORDERS: usize = 83;
-pub const INTERNED_IMAGES: usize = 84;
-pub const INTERNED_YUV_IMAGES: usize = 85;
-pub const INTERNED_LINE_DECORATIONS: usize = 86;
-pub const INTERNED_LINEAR_GRADIENTS: usize = 87;
-pub const INTERNED_RADIAL_GRADIENTS: usize = 88;
-pub const INTERNED_CONIC_GRADIENTS: usize = 89;
-pub const INTERNED_PICTURES: usize = 90;
-pub const INTERNED_FILTER_DATA: usize = 91;
-pub const INTERNED_BACKDROP_CAPTURES: usize = 92;
-pub const INTERNED_BACKDROP_RENDERS: usize = 93;
-pub const INTERNED_POLYGONS: usize = 94;
-pub const INTERNED_BOX_SHADOWS: usize = 95;
-pub const DEPTH_TARGETS_MEM: usize = 96;
-
-pub const SHADER_BUILD_TIME: usize = 97;
-
-pub const RENDER_REASON_FIRST: usize = 98;
-pub const RENDER_REASON_SCENE: usize = 99;
-pub const RENDER_REASON_ANIMATED_PROPERTY: usize = 100;
-pub const RENDER_REASON_RESOURCE_UPDATE: usize = 101;
-pub const RENDER_REASON_ASYNC_IMAGE: usize = 102;
-pub const RENDER_REASON_CLEAR_RESOURCES: usize = 103;
-pub const RENDER_REASON_APZ: usize = 104;
-pub const RENDER_REASON_RESIZE: usize = 105;
-pub const RENDER_REASON_WIDGET: usize = 106;
-pub const RENDER_REASON_TEXTURE_CACHE_FLUSH: usize = 107;
-pub const RENDER_REASON_SNAPSHOT: usize = 108;
-pub const RENDER_REASON_POST_RESOURCE_UPDATE_HOOKS: usize = 109;
-pub const RENDER_REASON_CONFIG_CHANGE: usize = 110;
-pub const RENDER_REASON_CONTENT_SYNC: usize = 111;
-pub const RENDER_REASON_FLUSH: usize = 112;
-pub const RENDER_REASON_TESTING: usize = 113;
-pub const RENDER_REASON_OTHER: usize = 114;
-pub const RENDER_REASON_VSYNC: usize = 115;
-
-pub const TEXTURES_CREATED: usize = 116;
-pub const TEXTURES_DELETED: usize = 117;
-
-pub const SLOW_FRAME_CPU_COUNT: usize = 118;
-pub const SLOW_FRAME_GPU_COUNT: usize = 119;
-pub const SLOW_FRAME_BUILD_COUNT: usize = 120;
-pub const SLOW_UPLOAD_COUNT: usize = 121;
-pub const SLOW_RENDER_COUNT: usize = 122;
-pub const SLOW_DRAW_CALLS_COUNT: usize = 123;
-pub const SLOW_TARGETS_COUNT: usize = 124;
-pub const SLOW_BLOB_COUNT: usize = 125;
-pub const SLOW_SCROLL_AFTER_SCENE_COUNT: usize = 126;
-
-pub const GPU_BUFFER_MEM: usize = 127;
-pub const GPU_TOTAL_MEM: usize = 128;
-
-pub const FRAME_SEND_TIME: usize = 129;
-pub const UPDATE_DOCUMENT_TIME: usize = 130;
-
-pub const COMPOSITOR_SURFACE_UNDERLAYS: usize = 131;
-pub const COMPOSITOR_SURFACE_OVERLAYS: usize = 132;
-pub const COMPOSITOR_SURFACE_BLITS: usize = 133;
-
-pub const NUM_PROFILER_EVENTS: usize = 134;
+pub const ATLAS_ITEMS_MEM: usize = 54;
+pub const ATLAS_A8_PIXELS: usize = 55;
+pub const ATLAS_A8_TEXTURES: usize = 56;
+pub const ATLAS_A16_PIXELS: usize = 57;
+pub const ATLAS_A16_TEXTURES: usize = 58;
+pub const ATLAS_RGBA8_LINEAR_PIXELS: usize = 59;
+pub const ATLAS_RGBA8_LINEAR_TEXTURES: usize = 60;
+pub const ATLAS_RGBA8_NEAREST_PIXELS: usize = 61;
+pub const ATLAS_RGBA8_NEAREST_TEXTURES: usize = 62;
+pub const ATLAS_RGBA8_GLYPHS_PIXELS: usize = 63;
+pub const ATLAS_RGBA8_GLYPHS_TEXTURES: usize = 64;
+pub const ATLAS_A8_GLYPHS_PIXELS: usize = 65;
+pub const ATLAS_A8_GLYPHS_TEXTURES: usize = 66;
+pub const ATLAS_COLOR8_LINEAR_PRESSURE: usize = 67;
+pub const ATLAS_COLOR8_NEAREST_PRESSURE: usize = 68;
+pub const ATLAS_COLOR8_GLYPHS_PRESSURE: usize = 69;
+pub const ATLAS_ALPHA8_PRESSURE: usize = 70;
+pub const ATLAS_ALPHA8_GLYPHS_PRESSURE: usize = 71;
+pub const ATLAS_ALPHA16_PRESSURE: usize = 72;
+pub const ATLAS_STANDALONE_PRESSURE: usize = 73;
+
+pub const TEXTURE_CACHE_EVICTION_COUNT: usize = 74;
+pub const TEXTURE_CACHE_YOUNGEST_EVICTION: usize = 75;
+pub const EXTERNAL_IMAGE_BYTES: usize = 76;
+pub const ATLAS_TEXTURES_MEM: usize = 77;
+pub const STANDALONE_TEXTURES_MEM: usize = 78;
+pub const PICTURE_TILES_MEM: usize = 79;
+pub const RENDER_TARGET_MEM: usize = 80;
+
+pub const ALPHA_TARGETS_SAMPLERS: usize = 81;
+pub const TRANSPARENT_PASS_SAMPLERS: usize = 82;
+pub const OPAQUE_PASS_SAMPLERS: usize = 83;
+pub const TOTAL_SAMPLERS: usize = 84;
+
+pub const INTERNED_PRIMITIVES: usize = 85;
+pub const INTERNED_CLIPS: usize = 86;
+pub const INTERNED_TEXT_RUNS: usize = 87;
+pub const INTERNED_NORMAL_BORDERS: usize = 88;
+pub const INTERNED_IMAGE_BORDERS: usize = 89;
+pub const INTERNED_IMAGES: usize = 90;
+pub const INTERNED_YUV_IMAGES: usize = 91;
+pub const INTERNED_LINE_DECORATIONS: usize = 92;
+pub const INTERNED_LINEAR_GRADIENTS: usize = 93;
+pub const INTERNED_RADIAL_GRADIENTS: usize = 94;
+pub const INTERNED_CONIC_GRADIENTS: usize = 95;
+pub const INTERNED_PICTURES: usize = 96;
+pub const INTERNED_FILTER_DATA: usize = 97;
+pub const INTERNED_BACKDROP_CAPTURES: usize = 98;
+pub const INTERNED_BACKDROP_RENDERS: usize = 99;
+pub const INTERNED_POLYGONS: usize = 100;
+pub const INTERNED_BOX_SHADOWS: usize = 101;
+pub const DEPTH_TARGETS_MEM: usize = 102;
+
+pub const SHADER_BUILD_TIME: usize = 103;
+
+// NOTE: RENDER_REASON_FIRST and RENDER_REASON_SCENE both use index 104, so
+// RENDER_REASON_FIRST does not occupy a counter slot of its own. Presumably
+// it names the start of the render-reason range — confirm against its users.
+pub const RENDER_REASON_FIRST: usize = 104;
+pub const RENDER_REASON_SCENE: usize = 104;
+pub const RENDER_REASON_ANIMATED_PROPERTY: usize = 105;
+pub const RENDER_REASON_RESOURCE_UPDATE: usize = 106;
+pub const RENDER_REASON_ASYNC_IMAGE: usize = 107;
+pub const RENDER_REASON_CLEAR_RESOURCES: usize = 108;
+pub const RENDER_REASON_APZ: usize = 109;
+pub const RENDER_REASON_RESIZE: usize = 110;
+pub const RENDER_REASON_WIDGET: usize = 111;
+pub const RENDER_REASON_TEXTURE_CACHE_FLUSH: usize = 112;
+pub const RENDER_REASON_SNAPSHOT: usize = 113;
+pub const RENDER_REASON_POST_RESOURCE_UPDATE_HOOKS: usize = 114;
+pub const RENDER_REASON_CONFIG_CHANGE: usize = 115;
+pub const RENDER_REASON_CONTENT_SYNC: usize = 116;
+pub const RENDER_REASON_FLUSH: usize = 117;
+pub const RENDER_REASON_TESTING: usize = 118;
+pub const RENDER_REASON_OTHER: usize = 119;
+pub const RENDER_REASON_VSYNC: usize = 120;
+
+pub const TEXTURES_CREATED: usize = 121;
+pub const TEXTURES_DELETED: usize = 122;
+
+pub const SLOW_FRAME_CPU_COUNT: usize = 123;
+pub const SLOW_FRAME_GPU_COUNT: usize = 124;
+pub const SLOW_FRAME_BUILD_COUNT: usize = 125;
+pub const SLOW_UPLOAD_COUNT: usize = 126;
+pub const SLOW_RENDER_COUNT: usize = 127;
+pub const SLOW_DRAW_CALLS_COUNT: usize = 128;
+pub const SLOW_TARGETS_COUNT: usize = 129;
+pub const SLOW_BLOB_COUNT: usize = 130;
+pub const SLOW_SCROLL_AFTER_SCENE_COUNT: usize = 131;
+
+pub const GPU_CACHE_MEM: usize = 132;
+pub const GPU_BUFFER_MEM: usize = 133;
+pub const GPU_TOTAL_MEM: usize = 134;
+
+pub const GPU_CACHE_PREPARE_TIME: usize = 135;
+
+pub const FRAME_SEND_TIME: usize = 136;
+pub const UPDATE_DOCUMENT_TIME: usize = 137;
+
+pub const COMPOSITOR_SURFACE_UNDERLAYS: usize = 138;
+pub const COMPOSITOR_SURFACE_OVERLAYS: usize = 139;
+pub const COMPOSITOR_SURFACE_BLITS: usize = 140;
+
+pub const NUM_PROFILER_EVENTS: usize = 141;
 
 pub struct Profiler {
     counters: Vec<Counter>,
@@ -366,6 +376,7 @@ impl Profiler {
             float("Texture cache upload", "ms", TOTAL_UPLOAD_TIME, expected(0.0..5.0)),
             float("Cache texture creation", "ms", CREATE_CACHE_TEXTURE_TIME, expected(0.0..2.0)),
             float("Cache texture deletion", "ms", DELETE_CACHE_TEXTURE_TIME, expected(0.0..1.0)),
+            float("GPU cache upload", "ms", GPU_CACHE_UPLOAD_TIME, expected(0.0..2.0)),
 
             int("Rasterized blobs", "", RASTERIZED_BLOBS, expected(0..15)),
             int("Rasterized blob tiles", "", RASTERIZED_BLOB_TILES, expected(0..15)),
@@ -394,6 +405,12 @@ impl Profiler {
             int("Image templates", "", IMAGE_TEMPLATES, expected(0..100)),
             float("Image templates mem", "MB", IMAGE_TEMPLATES_MEM, expected(0.0..50.0)),
 
+            int("GPU cache rows total", "", GPU_CACHE_ROWS_TOTAL, expected(1..50)),
+            int("GPU cache rows updated", "", GPU_CACHE_ROWS_UPDATED, expected(0..25)),
+            int("GPU blocks total", "", GPU_CACHE_BLOCKS_TOTAL, expected(1..65_000)),
+            int("GPU blocks updated", "", GPU_CACHE_BLOCKS_UPDATED, expected(0..1000)),
+            int("GPU blocks saved", "", GPU_CACHE_BLOCKS_SAVED, expected(0..50_000)),
+
             float("Atlas items mem", "MB", ATLAS_ITEMS_MEM, expected(0.0..100.0)),
             int("Atlas A8 pixels", "px", ATLAS_A8_PIXELS, expected(0..1_000_000)),
             int("Atlas A8 textures", "", ATLAS_A8_TEXTURES, expected(0..2)),
@@ -449,7 +466,6 @@ impl Profiler {
             float("Depth targets mem", "MB", DEPTH_TARGETS_MEM, Expected::none()),
             float("Shader build time", "ms", SHADER_BUILD_TIME, Expected::none()),
             // We use the expected range to highlight render reasons that are happening.
-            float("Reason First", "", RENDER_REASON_FIRST, expected(0.0..0.01)),
             float("Reason scene", "", RENDER_REASON_SCENE, expected(0.0..0.01)),
             float("Reason animated property", "", RENDER_REASON_ANIMATED_PROPERTY, expected(0.0..0.01)),
             float("Reason resource update", "", RENDER_REASON_RESOURCE_UPDATE, expected(0.0..0.01)),
@@ -481,9 +497,11 @@ impl Profiler {
             int("Slow: blobs", "%", SLOW_BLOB_COUNT, Expected::none()),
             int("Slow: after scene", "%", SLOW_SCROLL_AFTER_SCENE_COUNT, Expected::none()),
 
+            float("GPU cache mem", "MB", GPU_CACHE_MEM, Expected::none()),
             float("GPU buffer mem", "MB", GPU_BUFFER_MEM, Expected::none()),
             float("GPU total mem", "MB", GPU_TOTAL_MEM, Expected::none()),
 
+            float("GPU cache preapre", "ms", GPU_CACHE_PREPARE_TIME, Expected::none()),
             float("Frame send", "ms", FRAME_SEND_TIME, Expected::none()),
             float("Update document", "ms", UPDATE_DOCUMENT_TIME, Expected::none()),
 
@@ -689,6 +707,7 @@ impl Profiler {
             RENDER_TARGET_MEM,
             DEPTH_TARGETS_MEM,
             ATLAS_ITEMS_MEM,
+            GPU_CACHE_MEM,
             GPU_BUFFER_MEM,
         ] {
             if let Some(val) = self.counters[counter].get() {
@@ -786,6 +805,10 @@ impl Profiler {
                     flush_counters(&mut counters, selection);
                     selection.push(Item::GpuTimeQueries);
                 }
+                "GPU cache bars" => {
+                    flush_counters(&mut counters, selection);
+                    selection.push(Item::GpuCacheBars);
+                }
                 "Paint phase graph" => {
                     flush_counters(&mut counters, selection);
                     selection.push(Item::PaintPhaseGraph);
@@ -834,6 +857,10 @@ impl Profiler {
         &self.counters
     }
 
+    /// Returns the value reported by the counter stored at index `id` in
+    /// `self.counters`.
+    ///
+    /// `id` is used as a direct index, so it is presumably one of the counter
+    /// index constants (e.g. `GPU_CACHE_BLOCKS_UPDATED`); an out-of-range
+    /// value is not handled here.
+    pub fn get(&self, id: usize) -> Option<f64> {
+        self.counters[id].get()
+    }
+
     fn draw_counters(
         counters: &[Counter],
         selected: &[usize],
@@ -1072,6 +1099,102 @@ impl Profiler {
         }
     }
 
+    /// Draws a text label followed by a fixed-width (300.0) horizontal bar
+    /// split into coloured segments, and returns the rectangle covering both
+    /// the label and the bar.
+    ///
+    /// `counters` holds `(color, value)` pairs. Each value is an absolute
+    /// position along the bar rather than a segment length: a segment runs
+    /// from where the previous entry stopped to its own value, so callers
+    /// pass running totals in ascending order. The last entry's value defines
+    /// the full width of the bar.
+    ///
+    /// The label is placed at an offset of (8, 24) from `x`, `y`.
+    ///
+    /// If `counters` is empty or its last value is zero, the scale falls back
+    /// to zero so every segment collapses to zero width. This avoids a panic
+    /// on the empty slice and a division by zero that would otherwise hand
+    /// infinite/NaN coordinates to the debug renderer.
+    fn draw_bar(
+        label: &str,
+        label_color: ColorU,
+        counters: &[(ColorU, usize)],
+        x: f32, y: f32,
+        debug_renderer: &mut DebugRenderer,
+    ) -> default::Rect<f32> {
+        let x = x + 8.0;
+        let y = y + 24.0;
+        let text_rect = debug_renderer.add_text(
+            x, y,
+            label,
+            label_color,
+            None,
+        );
+
+        // The bar starts a little to the right of the label.
+        let x_base = text_rect.max_x() + 10.0;
+        let width = 300.0;
+        // The final entry is the full-scale value of the bar.
+        let total_value = counters.last().map_or(0, |&(_, value)| value);
+        let scale = if total_value > 0 {
+            width / total_value as f32
+        } else {
+            0.0
+        };
+        let mut x_current = x_base;
+
+        for &(color, counter) in counters {
+            // Values are cumulative, so each stop is measured from the bar
+            // origin and the segment fills the gap since the previous stop.
+            let x_stop = x_base + counter as f32 * scale;
+            debug_renderer.add_quad(
+                x_current,
+                text_rect.origin.y,
+                x_stop,
+                text_rect.max_y(),
+                color,
+                color,
+            );
+            x_current = x_stop;
+        }
+
+        // Grow the label rectangle so it also spans the gap and the bar.
+        let mut total_rect = text_rect;
+        total_rect.size.width += width + 10.0;
+
+        total_rect
+    }
+
+    /// Draws two labelled bars summarising GPU cache counters, followed by a
+    /// background quad around them, and returns the inflated rectangle that
+    /// encloses both bars.
+    ///
+    /// The first bar shows updated rows against allocated rows. The second
+    /// shows, cumulatively, updated blocks, requested blocks (updated + saved),
+    /// allocated blocks, and the total capacity
+    /// (`allocated rows * MAX_VERTEX_TEXTURE_WIDTH`). Counters without a value
+    /// are read as zero.
+    fn draw_gpu_cache_bars(&self, x: f32, mut y: f32, text_buffer: &mut String, debug_renderer: &mut DebugRenderer) -> default::Rect<f32> {
+        // Missing counter values count as zero.
+        let read = |id: usize| self.get(id).unwrap_or(0.0) as usize;
+
+        let rows_updated = read(GPU_CACHE_ROWS_UPDATED);
+        let rows_allocated = read(GPU_CACHE_ROWS_TOTAL);
+        let blocks_updated = read(GPU_CACHE_BLOCKS_UPDATED);
+        let blocks_saved = read(GPU_CACHE_BLOCKS_SAVED);
+        let blocks_allocated = read(GPU_CACHE_BLOCKS_TOTAL);
+        let blocks_requested = blocks_updated + blocks_saved;
+        let blocks_capacity = rows_allocated * MAX_VERTEX_TEXTURE_WIDTH;
+
+        // Segment and label colours.
+        let red = ColorU::new(0xFF, 0, 0, 0xFF);
+        let green = ColorU::new(0, 0xFF, 0, 0xFF);
+        let blue = ColorU::new(0, 0, 0xFF, 0xFF);
+        let black = ColorU::new(0, 0, 0, 0xFF);
+        let white = ColorU::new(0xFF, 0xFF, 0xFF, 0xFF);
+        let yellow = ColorU::new(0xFF, 0xFF, 0, 0xFF);
+
+        // Rows bar: its label carries the allocated row count.
+        set_text!(text_buffer, "GPU cache rows ({}):", rows_allocated);
+        let rows_segments = [(red, rows_updated), (blue, rows_allocated)];
+        let rows_rect = Profiler::draw_bar(
+            text_buffer,
+            white,
+            &rows_segments,
+            x, y,
+            debug_renderer,
+        );
+
+        // Blocks bar goes directly underneath the rows bar.
+        y = rows_rect.max_y();
+        let blocks_segments = [
+            (red, blocks_updated),
+            (green, blocks_requested),
+            (blue, blocks_allocated),
+            (black, blocks_capacity),
+        ];
+        let blocks_rect = Profiler::draw_bar(
+            "GPU cache blocks",
+            yellow,
+            &blocks_segments,
+            x, y,
+            debug_renderer,
+        );
+
+        // Background quad around both bars, with a 10.0 margin on each side.
+        let bounds = rows_rect.union(&blocks_rect).inflate(10.0, 10.0);
+        let left = bounds.origin.x;
+        let top = bounds.origin.y;
+        debug_renderer.add_quad(
+            left,
+            top,
+            left + bounds.size.width,
+            top + bounds.size.height,
+            ColorF::new(0.1, 0.1, 0.1, 0.8).into(),
+            ColorF::new(0.2, 0.2, 0.2, 0.8).into(),
+        );
+
+        bounds
+    }
+
     // Draws a frame graph for a given frame collection.
     fn draw_frame_graph(
         frame_collection: &ProfilerFrameCollection,
@@ -1237,6 +1360,9 @@ impl Profiler {
                 Item::GpuTimeQueries => {
                     Profiler::draw_frame_graph(&self.gpu_frames, x, y, debug_renderer)
                 }
+                Item::GpuCacheBars => {
+                    self.draw_gpu_cache_bars(x, y, &mut text_buffer, debug_renderer)
+                }
                 Item::PaintPhaseGraph => {
                     Profiler::draw_frame_graph(&self.frame_stats, x, y, debug_renderer)
                 }
@@ -1947,6 +2073,7 @@ pub struct CpuFrameTimings {
     pub frame_building_other: f64,
     pub frame_send: f64,
     pub uploads: f64,
+    pub gpu_cache: f64,
     pub draw_calls: f64,
     pub unknown: f64,
 }
@@ -1962,9 +2089,10 @@ impl CpuFrameTimings {
         let frame_send = counters[FRAME_SEND_TIME].get().unwrap_or(0.0);
         let renderer = counters[RENDERER_TIME].get().unwrap_or(0.0);
         let uploads = counters[TEXTURE_CACHE_UPDATE_TIME].get().unwrap_or(0.0);
+        let gpu_cache = counters[GPU_CACHE_PREPARE_TIME].get().unwrap_or(0.0);
         let frame_build = visibility + prepare + glyph_resolve + batching;
         let update_document = counters[UPDATE_DOCUMENT_TIME].get().unwrap_or(0.0) - frame_build;
-        let draw_calls = renderer - uploads;
+        let draw_calls = renderer - uploads - gpu_cache;
         let unknown = (total - (api_send + update_document + frame_build + frame_send + renderer)).max(0.0);
         let frame_building_other = (counters[FRAME_BUILDING_TIME].get().unwrap_or(0.0) - frame_build).max(0.0);
 
@@ -1979,6 +2107,7 @@ impl CpuFrameTimings {
             frame_building_other,
             frame_send,
             uploads,
+            gpu_cache,
             draw_calls,
             unknown,
         }
@@ -2010,9 +2139,10 @@ impl CpuFrameTimings {
                 sample(self.frame_send, "08. frame send", ColorF { r: 1.0, g: 0.8, b: 0.8, a: 1.0 }),
                 // Renderer
                 sample(self.uploads, "09. texture uploads", ColorF { r: 0.8, g: 0.0, b: 0.3, a: 1.0 }),
-                sample(self.draw_calls, "10. draw calls", ColorF { r: 1.0, g: 0.5, b: 0.0, a: 1.0 }),
+                sample(self.gpu_cache, "10. gpu cache update", ColorF { r: 0.5, g: 0.0, b: 0.4, a: 1.0 }),
+                sample(self.draw_calls, "11. draw calls", ColorF { r: 1.0, g: 0.5, b: 0.0, a: 1.0 }),
                 // Unaccounted time
-                sample(self.unknown, "11. unknown", ColorF { r: 0.3, g: 0.3, b: 0.3, a: 1.0 }),
+                sample(self.unknown, "12. unknown", ColorF { r: 0.3, g: 0.3, b: 0.3, a: 1.0 }),
             ],
         }
     }
@@ -2037,6 +2167,7 @@ enum Item {
     ChangeIndicator(usize),
     Fps,
     GpuTimeQueries,
+    GpuCacheBars,
     PaintPhaseGraph,
     SlowScrollFrames,
     Text(String),
diff --git a/gfx/wr/webrender/src/quad.rs b/gfx/wr/webrender/src/quad.rs
@@ -1219,8 +1219,8 @@ pub fn add_to_batch<F>(
 
     let mut instance = QuadInstance {
         dst_task_address,
-        prim_address_i: prim_address_i.as_int(),
-        prim_address_f: prim_address_f.as_int(),
+        prim_address_i,
+        prim_address_f,
         edge_flags: edge_flags_bits,
         quad_flags: quad_flags.bits(),
         part_index: PartIndex::All as u8,
diff --git a/gfx/wr/webrender/src/render_api.rs b/gfx/wr/webrender/src/render_api.rs
@@ -973,6 +973,8 @@ pub enum DebugCommand {
     EnableNativeCompositor(bool),
     /// Sets the maximum amount of existing batches to visit before creating a new one.
     SetBatchingLookback(u32),
+    /// Invalidate GPU cache, forcing the update from the CPU mirror.
+    InvalidateGpuCache,
     /// Causes the scene builder to pause for a given amount of milliseconds each time it
     /// processes a transaction.
     SimulateLongSceneBuild(u32),
@@ -1488,6 +1490,8 @@ pub struct MemoryReport {
     // CPU Memory.
     //
     pub clip_stores: usize,
+    pub gpu_cache_metadata: usize,
+    pub gpu_cache_cpu_mirror: usize,
     pub hit_testers: usize,
     pub fonts: usize,
     pub weak_fonts: usize,
@@ -1504,6 +1508,7 @@ pub struct MemoryReport {
     //
     // GPU memory.
     //
+    pub gpu_cache_textures: usize,
     pub vertex_data_textures: usize,
     pub render_target_textures: usize,
     pub picture_tile_textures: usize,
diff --git a/gfx/wr/webrender/src/render_backend.rs b/gfx/wr/webrender/src/render_backend.rs
@@ -30,6 +30,7 @@ use crate::capture::CaptureConfig;
 use crate::composite::{CompositorKind, CompositeDescriptor};
 use crate::frame_builder::{FrameBuilder, FrameBuilderConfig, FrameScratchBuffer};
 use glyph_rasterizer::FontInstance;
+use crate::gpu_cache::GpuCache;
 use crate::hit_test::{HitTest, HitTester, SharedHitTester};
 use crate::intern::DataStore;
 #[cfg(any(feature = "capture", feature = "replay"))]
@@ -511,6 +512,7 @@ impl Document {
     fn build_frame(
         &mut self,
         resource_cache: &mut ResourceCache,
+        gpu_cache: &mut GpuCache,
         debug_flags: DebugFlags,
         tile_caches: &mut FastHashMap<SliceId, Box<TileCacheInstance>>,
         frame_stats: Option<FullFrameStats>,
@@ -531,6 +533,7 @@ impl Document {
                 &mut self.scene,
                 present,
                 resource_cache,
+                gpu_cache,
                 &mut self.rg_builder,
                 self.stamp,
                 self.view.scene.device_rect.min,
@@ -584,6 +587,7 @@ impl Document {
         &mut self,
         mut txn: OffscreenBuiltScene,
         resource_cache: &mut ResourceCache,
+        gpu_cache: &mut GpuCache,
         chunk_pool: Arc<ChunkPool>,
         debug_flags: DebugFlags,
     ) -> RenderedDocument {
@@ -609,6 +613,7 @@ impl Document {
             &mut txn.scene,
             present,
             resource_cache,
+            gpu_cache,
             &mut self.rg_builder,
             self.stamp, // TODO(nical)
             self.view.scene.device_rect.min,
@@ -773,6 +778,7 @@ pub struct RenderBackend {
     result_tx: Sender<ResultMsg>,
     scene_tx: Sender<SceneBuilderRequest>,
 
+    gpu_cache: GpuCache,
     resource_cache: ResourceCache,
     chunk_pool: Arc<ChunkPool>,
 
@@ -824,6 +830,7 @@ impl RenderBackend {
             result_tx,
             scene_tx,
             resource_cache,
+            gpu_cache: GpuCache::new(),
             chunk_pool,
             frame_config,
             default_compositor_kind : frame_config.compositor_kind,
@@ -922,6 +929,7 @@ impl RenderBackend {
         result_tx: Option<Sender<SceneSwapResult>>,
         frame_counter: &mut u32,
     ) -> bool {
+        self.prepare_for_frames();
         self.maybe_force_nop_documents(
             frame_counter,
             |document_id| txns.iter().any(|txn| txn.document_id == document_id));
@@ -1010,10 +1018,14 @@ impl RenderBackend {
                     let rendered_document = doc.process_offscreen_scene(
                         offscreen_scene,
                         &mut self.resource_cache,
+                        &mut self.gpu_cache,
                         self.chunk_pool.clone(),
                         self.debug_flags,
                     );
 
+                    let msg = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates());
+                    self.result_tx.send(msg).unwrap();
+
                     let pending_update = self.resource_cache.pending_updates();
 
                     let msg = ResultMsg::PublishDocument(
@@ -1100,6 +1112,8 @@ impl RenderBackend {
                 // recently used resources.
                 self.resource_cache.clear(ClearCache::all());
 
+                self.gpu_cache.clear();
+
                 for (_, doc) in &mut self.documents {
                     doc.scratch.memory_pressure();
                     for tile_cache in self.tile_caches.values_mut() {
@@ -1135,6 +1149,8 @@ impl RenderBackend {
                         return RenderBackendStatus::Continue;
                     }
                     DebugCommand::GenerateFrame => {
+                        self.prepare_for_frames();
+
                         let documents: Vec<DocumentId> = self.documents.keys()
                             .cloned()
                             .collect();
@@ -1166,6 +1182,7 @@ impl RenderBackend {
                                 doc.scene.config.force_invalidation = invalidation_config;
                             }
                         }
+                        self.bookkeep_after_frames();
 
                         return RenderBackendStatus::Continue;
                     }
@@ -1265,6 +1282,7 @@ impl RenderBackend {
                     }
                     DebugCommand::SetFlags(flags) => {
                         self.resource_cache.set_debug_flags(flags);
+                        self.gpu_cache.set_debug_flags(flags);
 
                         let force_invalidation = flags.contains(DebugFlags::FORCE_PICTURE_INVALIDATION);
                         if self.frame_config.force_invalidation != force_invalidation {
@@ -1275,6 +1293,19 @@ impl RenderBackend {
                             self.update_frame_builder_config();
                         }
 
+                        // If we're toggling on the GPU cache debug display, we
+                        // need to blow away the cache. This is because we only
+                        // send allocation/free notifications to the renderer
+                        // thread when the debug display is enabled, and thus
+                        // enabling it when the cache is partially populated will
+                        // give the renderer an incomplete view of the world.
+                        // And since we might as well drop all the debugging state
+                        // from the renderer when we disable the debug display,
+                        // we just clear the cache on toggle.
+                        let changed = self.debug_flags ^ flags;
+                        if changed.contains(DebugFlags::GPU_CACHE_DBG) {
+                            self.gpu_cache.clear();
+                        }
                         self.debug_flags = flags;
 
                         ResultMsg::DebugCommand(option)
@@ -1318,6 +1349,7 @@ impl RenderBackend {
                     result_tx,
                     frame_counter,
                 );
+                self.bookkeep_after_frames();
             },
             #[cfg(feature = "capture")]
             SceneBuilderResult::CapturedTransactions(txns, capture_config, result_tx) => {
@@ -1340,6 +1372,8 @@ impl RenderBackend {
                 if built_frame {
                     self.save_capture_sequence();
                 }
+
+                self.bookkeep_after_frames();
             },
             #[cfg(feature = "capture")]
             SceneBuilderResult::StopCaptureSequence => {
@@ -1405,8 +1439,16 @@ impl RenderBackend {
         );
     }
 
+    /// Forwards to `GpuCache::prepare_for_frames` on the backend's GPU cache.
+    ///
+    /// Callers invoke this before building one or more frames and pair it
+    /// with `bookkeep_after_frames` once the frames have been built.
+    fn prepare_for_frames(&mut self) {
+        self.gpu_cache.prepare_for_frames();
+    }
+
+    /// Forwards to `GpuCache::bookkeep_after_frames` on the backend's GPU
+    /// cache.
+    ///
+    /// Callers invoke this after the frame builds that followed a matching
+    /// `prepare_for_frames` call.
+    fn bookkeep_after_frames(&mut self) {
+        self.gpu_cache.bookkeep_after_frames();
+    }
+
     fn requires_frame_build(&mut self) -> bool {
-        false // TODO(nical)
+        self.gpu_cache.requires_frame_build()
     }
 
     fn prepare_transactions(
@@ -1414,6 +1456,7 @@ impl RenderBackend {
         txns: Vec<Box<TransactionMsg>>,
         frame_counter: &mut u32,
     ) {
+        self.prepare_for_frames();
         self.maybe_force_nop_documents(
             frame_counter,
             |document_id| txns.iter().any(|txn| txn.document_id == document_id));
@@ -1446,6 +1489,7 @@ impl RenderBackend {
             #[cfg(feature = "capture")]
             self.save_capture_sequence();
         }
+        self.bookkeep_after_frames();
     }
 
     /// In certain cases, resources shared by multiple documents have to run
@@ -1599,6 +1643,7 @@ impl RenderBackend {
 
                 let rendered_document = doc.build_frame(
                     &mut self.resource_cache,
+                    &mut self.gpu_cache,
                     self.debug_flags,
                     &mut self.tile_caches,
                     frame_stats,
@@ -1610,6 +1655,9 @@ impl RenderBackend {
                 debug!("generated frame for document {:?} with {} passes",
                     document_id, rendered_document.frame.passes.len());
 
+                let msg = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates());
+                self.result_tx.send(msg).unwrap();
+
                 Telemetry::stop_and_accumulate_framebuild_time(timer_id);
 
                 let pending_update = self.resource_cache.pending_updates();
@@ -1730,6 +1778,7 @@ impl RenderBackend {
         let mut report = Box::new(MemoryReport::default());
         let ops = self.size_of_ops.as_mut().unwrap();
         let op = ops.size_of_op;
+        report.gpu_cache_metadata = self.gpu_cache.size_of(ops);
         for doc in self.documents.values() {
             report.clip_stores += doc.scene.clip_store.size_of(ops);
             report.hit_testers += match &doc.hit_tester {
@@ -1795,6 +1844,10 @@ impl RenderBackend {
         }
         let config = CaptureConfig::new(root, bits);
 
+        if config.bits.contains(CaptureBits::FRAME) {
+            self.prepare_for_frames();
+        }
+
         for (&id, doc) in &mut self.documents {
             debug!("\tdocument {:?}", id);
             if config.bits.contains(CaptureBits::FRAME) {
@@ -1802,6 +1855,7 @@ impl RenderBackend {
                 let force_invalidation = std::mem::replace(&mut doc.scene.config.force_invalidation, true);
                 let rendered_document = doc.build_frame(
                     &mut self.resource_cache,
+                    &mut self.gpu_cache,
                     self.debug_flags,
                     &mut self.tile_caches,
                     None,
@@ -1812,6 +1866,11 @@ impl RenderBackend {
 
                 doc.scene.config.force_invalidation = force_invalidation;
 
+                // After we rendered the frames, there are pending updates to both
+                // GPU cache and resources. Instead of serializing them, we are going to make sure
+                // they are applied on the `Renderer` side.
+                let msg_update_gpu_cache = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates());
+                self.result_tx.send(msg_update_gpu_cache).unwrap();
                 //TODO: write down doc's pipeline info?
                 // it has `pipeline_epoch_map`,
                 // which may capture necessary details for some cases.
@@ -1869,6 +1928,7 @@ impl RenderBackend {
             // report it here if we do. If we don't, it will simply crash in
             // Renderer::render_impl and give us less information about the source.
             assert!(!self.requires_frame_build(), "Caches were cleared during a capture.");
+            self.bookkeep_after_frames();
         }
 
         debug!("\tscene builder");
@@ -1902,6 +1962,8 @@ impl RenderBackend {
             info!("\tresource cache");
             let caches = self.resource_cache.save_caches(&config.root);
             config.serialize_for_resource(&caches, "resource_cache");
+            info!("\tgpu cache");
+            config.serialize_for_resource(&self.gpu_cache, "gpu_cache");
         }
 
         DebugOutput::SaveCapture(config, deferred)
@@ -1975,6 +2037,11 @@ impl RenderBackend {
                 DebugOutput::LoadCapture(config.clone(), plain_externals)
             );
             self.result_tx.send(msg_load).unwrap();
+
+            self.gpu_cache = match config.deserialize_for_resource::<GpuCache, _>("gpu_cache") {
+                Some(gpu_cache) => gpu_cache,
+                None => GpuCache::new(),
+            };
         }
 
         self.frame_config = backend.frame_config;
@@ -2059,6 +2126,9 @@ impl RenderBackend {
                 Some(frame) => {
                     info!("\tloaded a built frame with {} passes", frame.passes.len());
 
+                    let msg_update = ResultMsg::UpdateGpuCache(self.gpu_cache.extract_updates());
+                    self.result_tx.send(msg_update).unwrap();
+
                     self.frame_publish_id.advance();
                     let msg_publish = ResultMsg::PublishDocument(
                         self.frame_publish_id,
diff --git a/gfx/wr/webrender/src/render_target.rs b/gfx/wr/webrender/src/render_target.rs
@@ -13,6 +13,7 @@ use crate::segment::EdgeAaSegmentMask;
 use crate::spatial_tree::SpatialTree;
 use crate::clip::{ClipStore, ClipItemKind};
 use crate::frame_builder::FrameGlobalResources;
+use crate::gpu_cache::{GpuCache, GpuCacheAddress};
 use crate::gpu_types::{BorderInstance, SvgFilterInstance, SVGFEFilterInstance, BlurDirection, BlurInstance, PrimitiveHeaders, ScalingInstance};
 use crate::gpu_types::{TransformPalette, ZBufferIdGenerator, MaskInstance, ClipSpace, BlurEdgeMode};
 use crate::gpu_types::{ZBufferId, QuadSegment, PrimitiveInstanceData, TransformPaletteId};
@@ -107,6 +108,7 @@ impl RenderTargetList {
     pub fn build(
         &mut self,
         ctx: &mut RenderTargetContext,
+        gpu_cache: &mut GpuCache,
         render_tasks: &RenderTaskGraph,
         prim_headers: &mut PrimitiveHeaders,
         transforms: &mut TransformPalette,
@@ -122,6 +124,7 @@ impl RenderTargetList {
         for target in &mut self.targets {
             target.build(
                 ctx,
+                gpu_cache,
                 render_tasks,
                 prim_headers,
                 transforms,
@@ -253,6 +256,7 @@ impl RenderTarget {
     pub fn build(
         &mut self,
         ctx: &mut RenderTargetContext,
+        gpu_cache: &mut GpuCache,
         render_tasks: &RenderTaskGraph,
         prim_headers: &mut PrimitiveHeaders,
         transforms: &mut TransformPalette,
@@ -309,6 +313,7 @@ impl RenderTarget {
                             cmd,
                             spatial_node_index,
                             ctx,
+                            gpu_cache,
                             render_tasks,
                             prim_headers,
                             transforms,
@@ -349,6 +354,7 @@ impl RenderTarget {
         &mut self,
         task_id: RenderTaskId,
         ctx: &RenderTargetContext,
+        gpu_cache: &mut GpuCache,
         gpu_buffer_builder: &mut GpuBufferBuilder,
         render_tasks: &RenderTaskGraph,
         clip_store: &ClipStore,
@@ -438,7 +444,7 @@ impl RenderTarget {
                     task_id,
                     task.children.get(0).cloned(),
                     task.children.get(1).cloned(),
-                    task_info.extra_gpu_data,
+                    task_info.extra_gpu_cache_handle.map(|handle| gpu_cache.get_address(&handle)),
                     &ctx.frame_memory,
                 )
             }
@@ -450,7 +456,7 @@ impl RenderTarget {
                     task,
                     task.children.get(0).cloned(),
                     task.children.get(1).cloned(),
-                    task_info.extra_gpu_data,
+                    task_info.extra_gpu_cache_handle.map(|handle| gpu_cache.get_address(&handle)),
                     &ctx.frame_memory,
                 )
             }
@@ -465,6 +471,7 @@ impl RenderTarget {
                     task_info.clip_node_range,
                     task_info.root_spatial_node_index,
                     render_tasks,
+                    gpu_cache,
                     clip_store,
                     transforms,
                     task_info.actual_rect,
@@ -675,7 +682,7 @@ fn add_svg_filter_instances(
     task_id: RenderTaskId,
     input_1_task: Option<RenderTaskId>,
     input_2_task: Option<RenderTaskId>,
-    extra_data_address: Option<GpuBufferAddress>,
+    extra_data_address: Option<GpuCacheAddress>,
     memory: &FrameMemory,
 ) {
     let mut textures = BatchTextures::empty();
@@ -746,7 +753,7 @@ fn add_svg_filter_instances(
         input_count,
         generic_int,
         padding: 0,
-        extra_data_address: extra_data_address.unwrap_or(GpuBufferAddress::INVALID).as_int(),
+        extra_data_address: extra_data_address.unwrap_or(GpuCacheAddress::INVALID),
     };
 
     for (ref mut batch_textures, ref mut batch) in instances.iter_mut() {
@@ -779,7 +786,7 @@ fn add_svg_filter_node_instances(
     target_task: &RenderTask,
     input_1_task: Option<RenderTaskId>,
     input_2_task: Option<RenderTaskId>,
-    extra_data_address: Option<GpuBufferAddress>,
+    extra_data_address: Option<GpuCacheAddress>,
     memory: &FrameMemory,
 ) {
     let node = &task_info.node;
@@ -801,7 +808,7 @@ fn add_svg_filter_node_instances(
         input_2_task_address: RenderTaskId::INVALID.into(),
         kind: 0,
         input_count: node.inputs.len() as u16,
-        extra_data_address: extra_data_address.unwrap_or(GpuBufferAddress::INVALID).as_int(),
+        extra_data_address: extra_data_address.unwrap_or(GpuCacheAddress::INVALID),
     };
 
     // Must match FILTER_* in cs_svg_filter_node.glsl
diff --git a/gfx/wr/webrender/src/render_task.rs b/gfx/wr/webrender/src/render_task.rs
@@ -15,6 +15,7 @@ use crate::profiler::{add_text_marker};
 use crate::spatial_tree::SpatialNodeIndex;
 use crate::filterdata::SFilterData;
 use crate::frame_builder::FrameBuilderConfig;
+use crate::gpu_cache::{GpuCache, GpuCacheAddress, GpuCacheHandle};
 use crate::gpu_types::{BorderInstance, ImageSource, UvRectKind, TransformPaletteId, BlurEdgeMode};
 use crate::internal_types::{CacheTextureId, FastHashMap, FilterGraphNode, FilterGraphOp, FilterGraphPictureReference, SVGFE_CONVOLVE_VALUES_LIMIT, TextureSource, Swizzle};
 use crate::picture::{ResolvedSurfaceTexture, MAX_SURFACE_SIZE};
@@ -25,7 +26,7 @@ use crate::prim_store::gradient::{
 };
 use crate::resource_cache::{ResourceCache, ImageRequest};
 use std::{usize, f32, i32, u32};
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferBuilderF};
+use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF};
 use crate::render_backend::DataStores;
 use crate::render_target::{ResolveOp, RenderTargetKind};
 use crate::render_task_graph::{PassId, RenderTaskId, RenderTaskGraphBuilder};
@@ -345,7 +346,7 @@ pub enum SvgFilterInfo {
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct SvgFilterTask {
     pub info: SvgFilterInfo,
-    pub extra_gpu_data: Option<GpuBufferAddress>,
+    pub extra_gpu_cache_handle: Option<GpuCacheHandle>,
 }
 
 #[derive(Debug)]
@@ -355,7 +356,7 @@ pub struct SVGFEFilterTask {
     pub node: FilterGraphNode,
     pub op: FilterGraphOp,
     pub content_origin: DevicePoint,
-    pub extra_gpu_data: Option<GpuBufferAddress>,
+    pub extra_gpu_cache_handle: Option<GpuCacheHandle>,
 }
 
 #[cfg_attr(feature = "capture", derive(Serialize))]
@@ -627,6 +628,7 @@ impl RenderTaskKind {
         clip_node_range: ClipNodeRange,
         root_spatial_node_index: SpatialNodeIndex,
         clip_store: &mut ClipStore,
+        gpu_cache: &mut GpuCache,
         gpu_buffer_builder: &mut GpuBufferBuilderF,
         resource_cache: &mut ResourceCache,
         rg_builder: &mut RenderTaskGraphBuilder,
@@ -684,10 +686,11 @@ impl RenderTaskKind {
                         }),
                         false,
                         RenderTaskParent::RenderTask(clip_task_id),
+                        gpu_cache,
                         gpu_buffer_builder,
                         rg_builder,
                         surface_builder,
-                        &mut |rg_builder, _| {
+                        &mut |rg_builder, _, _| {
                             let clip_data = ClipData::rounded_rect(
                                 source.minimal_shadow_rect.size(),
                                 &source.shadow_radius,
@@ -847,32 +850,38 @@ impl RenderTaskKind {
 
     pub fn write_gpu_blocks(
         &mut self,
-        gpu_buffer: &mut GpuBufferBuilder,
+        gpu_cache: &mut GpuCache,
     ) {
         match self {
             RenderTaskKind::SvgFilter(ref mut filter_task) => {
                 match filter_task.info {
                     SvgFilterInfo::ColorMatrix(ref matrix) => {
-                        let mut writer = gpu_buffer.f32.write_blocks(5);
-                        for i in 0..5 {
-                            writer.push_one([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]);
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            for i in 0..5 {
+                                request.push([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]);
+                            }
                         }
-                        filter_task.extra_gpu_data = Some(writer.finish());
                     }
                     SvgFilterInfo::DropShadow(color) |
                     SvgFilterInfo::Flood(color) => {
-                        let mut writer = gpu_buffer.f32.write_blocks(1);
-                        writer.push_one(color.to_array());
-                        filter_task.extra_gpu_data = Some(writer.finish());
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            request.push(color.to_array());
+                        }
                     }
                     SvgFilterInfo::ComponentTransfer(ref data) => {
-                        filter_task.extra_gpu_data = Some(data.write_gpu_blocks(&mut gpu_buffer.f32));
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(request) = gpu_cache.request(handle) {
+                            data.update(request);
+                        }
                     }
                     SvgFilterInfo::Composite(ref operator) => {
                         if let CompositeOperator::Arithmetic(k_vals) = operator {
-                            let mut writer = gpu_buffer.f32.write_blocks(1);
-                            writer.push_one(*k_vals);
-                            filter_task.extra_gpu_data = Some(writer.finish());
+                            let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                            if let Some(mut request) = gpu_cache.request(handle) {
+                                request.push(*k_vals);
+                            }
                         }
                     }
                     _ => {},
@@ -896,19 +905,21 @@ impl RenderTaskKind {
                     FilterGraphOp::SVGFEBlendSaturation => {}
                     FilterGraphOp::SVGFEBlendColor => {}
                     FilterGraphOp::SVGFEBlendLuminosity => {}
-                    FilterGraphOp::SVGFEColorMatrix { values: matrix } => {
-                        let mut writer = gpu_buffer.f32.write_blocks(5);
-                        for i in 0..5 {
-                            writer.push_one([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]);
+                    FilterGraphOp::SVGFEColorMatrix{values: matrix} => {
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            for i in 0..5 {
+                                request.push([matrix[i*4], matrix[i*4+1], matrix[i*4+2], matrix[i*4+3]]);
+                            }
                         }
-                        filter_task.extra_gpu_data = Some(writer.finish());
                     }
                     FilterGraphOp::SVGFEComponentTransfer => unreachable!(),
                     FilterGraphOp::SVGFEComponentTransferInterned{..} => {}
                     FilterGraphOp::SVGFECompositeArithmetic{k1, k2, k3, k4} => {
-                        let mut writer = gpu_buffer.f32.write_blocks(1);
-                        writer.push_one([k1, k2, k3, k4]);
-                        filter_task.extra_gpu_data = Some(writer.finish());
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            request.push([k1, k2, k3, k4]);
+                        }
                     }
                     FilterGraphOp::SVGFECompositeATop => {}
                     FilterGraphOp::SVGFECompositeIn => {}
@@ -919,40 +930,44 @@ impl RenderTaskKind {
                     FilterGraphOp::SVGFEConvolveMatrixEdgeModeDuplicate{order_x, order_y, kernel, divisor, bias, target_x, target_y, kernel_unit_length_x, kernel_unit_length_y, preserve_alpha} |
                     FilterGraphOp::SVGFEConvolveMatrixEdgeModeNone{order_x, order_y, kernel, divisor, bias, target_x, target_y, kernel_unit_length_x, kernel_unit_length_y, preserve_alpha} |
                     FilterGraphOp::SVGFEConvolveMatrixEdgeModeWrap{order_x, order_y, kernel, divisor, bias, target_x, target_y, kernel_unit_length_x, kernel_unit_length_y, preserve_alpha} => {
-                        let mut writer = gpu_buffer.f32.write_blocks(8);
-                        assert!(SVGFE_CONVOLVE_VALUES_LIMIT == 25);
-                        writer.push_one([-target_x as f32, -target_y as f32, order_x as f32, order_y as f32]);
-                        writer.push_one([kernel_unit_length_x as f32, kernel_unit_length_y as f32, 1.0 / divisor, bias]);
-                        writer.push_one([kernel[0], kernel[1], kernel[2], kernel[3]]);
-                        writer.push_one([kernel[4], kernel[5], kernel[6], kernel[7]]);
-                        writer.push_one([kernel[8], kernel[9], kernel[10], kernel[11]]);
-                        writer.push_one([kernel[12], kernel[13], kernel[14], kernel[15]]);
-                        writer.push_one([kernel[16], kernel[17], kernel[18], kernel[19]]);
-                        writer.push_one([kernel[20], 0.0, 0.0, preserve_alpha as f32]);
-                        filter_task.extra_gpu_data = Some(writer.finish());
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            request.push([-target_x as f32, -target_y as f32, order_x as f32, order_y as f32]);
+                            request.push([kernel_unit_length_x as f32, kernel_unit_length_y as f32, 1.0 / divisor, bias]);
+                            assert!(SVGFE_CONVOLVE_VALUES_LIMIT == 25);
+                            request.push([kernel[0], kernel[1], kernel[2], kernel[3]]);
+                            request.push([kernel[4], kernel[5], kernel[6], kernel[7]]);
+                            request.push([kernel[8], kernel[9], kernel[10], kernel[11]]);
+                            request.push([kernel[12], kernel[13], kernel[14], kernel[15]]);
+                            request.push([kernel[16], kernel[17], kernel[18], kernel[19]]);
+                            request.push([kernel[20], 0.0, 0.0, preserve_alpha as f32]);
+                        }
                     }
                     FilterGraphOp::SVGFEDiffuseLightingDistant{..} => {}
                     FilterGraphOp::SVGFEDiffuseLightingPoint{..} => {}
                     FilterGraphOp::SVGFEDiffuseLightingSpot{..} => {}
                     FilterGraphOp::SVGFEDisplacementMap{scale, x_channel_selector, y_channel_selector} => {
-                        let mut writer = gpu_buffer.f32.write_blocks(1);
-                        writer.push_one([x_channel_selector as f32, y_channel_selector as f32, scale, 0.0]);
-                        filter_task.extra_gpu_data = Some(writer.finish());
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            request.push([x_channel_selector as f32, y_channel_selector as f32, scale, 0.0]);
+                        }
+                    }
+                    FilterGraphOp::SVGFEDropShadow{color, ..} |
+                    FilterGraphOp::SVGFEFlood{color} => {
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            request.push(color.to_array());
+                        }
                     }
-                    FilterGraphOp::SVGFEDropShadow { color, .. } |
-                    FilterGraphOp::SVGFEFlood { color } => {
-                        let mut writer = gpu_buffer.f32.write_blocks(1);
-                        writer.push_one(color.to_array());
-                        filter_task.extra_gpu_data = Some(writer.finish());
-                     }
                     FilterGraphOp::SVGFEGaussianBlur{..} => {}
                     FilterGraphOp::SVGFEIdentity => {}
-                    FilterGraphOp::SVGFEImage {..} => {}
-                    FilterGraphOp::SVGFEMorphologyDilate { radius_x, radius_y } |
-                    FilterGraphOp::SVGFEMorphologyErode { radius_x, radius_y } => {
-                        let mut writer = gpu_buffer.f32.write_blocks(1);
-                        writer.push_one([radius_x, radius_y, 0.0, 0.0]);
-                        filter_task.extra_gpu_data = Some(writer.finish());
+                    FilterGraphOp::SVGFEImage{..} => {}
+                    FilterGraphOp::SVGFEMorphologyDilate{radius_x, radius_y} |
+                    FilterGraphOp::SVGFEMorphologyErode{radius_x, radius_y} => {
+                        let handle = filter_task.extra_gpu_cache_handle.get_or_insert_with(GpuCacheHandle::new);
+                        if let Some(mut request) = gpu_cache.request(handle) {
+                            request.push([radius_x, radius_y, 0.0, 0.0]);
+                        }
                     }
                     FilterGraphOp::SVGFEOpacity{..} => {}
                     FilterGraphOp::SVGFESourceAlpha => {}
@@ -1038,7 +1053,7 @@ pub struct RenderTask {
     ///
     /// Will be set to None if the render task is cached, in which case the texture cache
     /// manages the handle.
-    pub uv_rect_handle: GpuBufferAddress,
+    pub uv_rect_handle: GpuCacheHandle,
     pub cache_handle: Option<RenderTaskCacheEntryHandle>,
     uv_rect_kind: UvRectKind,
 }
@@ -1056,7 +1071,7 @@ impl RenderTask {
             kind,
             free_after: PassId::MAX,
             render_on: PassId::MIN,
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             uv_rect_kind: UvRectKind::Rect,
             cache_handle: None,
             sub_pass: None,
@@ -1101,7 +1116,7 @@ impl RenderTask {
             }),
             free_after: PassId::MAX,
             render_on: PassId::MIN,
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             uv_rect_kind: UvRectKind::Rect,
             cache_handle: None,
             sub_pass: None,
@@ -1120,7 +1135,7 @@ impl RenderTask {
             kind: RenderTaskKind::Test(target),
             free_after: PassId::MAX,
             render_on: PassId::MIN,
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             uv_rect_kind: UvRectKind::Rect,
             cache_handle: None,
             sub_pass: None,
@@ -1620,7 +1635,7 @@ impl RenderTask {
         let task_id = rg_builder.add().init(RenderTask::new_dynamic(
             target_size,
             RenderTaskKind::SvgFilter(SvgFilterTask {
-                extra_gpu_data: None,
+                extra_gpu_cache_handle: None,
                 info,
             }),
         ).with_uv_rect_kind(uv_rect_kind));
@@ -1650,7 +1665,7 @@ impl RenderTask {
     pub fn new_svg_filter_graph(
         filter_nodes: &[(FilterGraphNode, FilterGraphOp)],
         rg_builder: &mut RenderTaskGraphBuilder,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         data_stores: &mut DataStores,
         _uv_rect_kind: UvRectKind,
         original_task_id: RenderTaskId,
@@ -2352,7 +2367,7 @@ impl RenderTask {
                                 },
                                 op: FilterGraphOp::SVGFEIdentity,
                                 content_origin: DevicePoint::zero(),
-                                extra_gpu_data: None,
+                                extra_gpu_cache_handle: None,
                             }
                         ),
                     ).with_uv_rect_kind(UvRectKind::Rect));
@@ -2396,7 +2411,7 @@ impl RenderTask {
                                 },
                                 op: FilterGraphOp::SVGFEIdentity,
                                 content_origin: node_task_rect.min,
-                                extra_gpu_data: None,
+                                extra_gpu_cache_handle: None,
                             }
                         ),
                     ).with_uv_rect_kind(node_uv_rect_kind));
@@ -2482,7 +2497,7 @@ impl RenderTask {
                                 },
                                 op: FilterGraphOp::SVGFEIdentity,
                                 content_origin: node_task_rect.min,
-                                extra_gpu_data: None,
+                                extra_gpu_cache_handle: None,
                             }
                         ),
                     ).with_uv_rect_kind(UvRectKind::Rect));
@@ -2537,7 +2552,7 @@ impl RenderTask {
                                     std_deviation_x: 0.0, std_deviation_y: 0.0,
                                 },
                                 content_origin: node_task_rect.min,
-                                extra_gpu_data: None,
+                                extra_gpu_cache_handle: None,
                             }
                         ),
                     ).with_uv_rect_kind(node_uv_rect_kind));
@@ -2575,7 +2590,7 @@ impl RenderTask {
                                 },
                                 op: op.clone(),
                                 content_origin: source_subregion.min.cast_unit(),
-                                extra_gpu_data: None,
+                                extra_gpu_cache_handle: None,
                             }
                         ),
                     ).with_uv_rect_kind(node_uv_rect_kind));
@@ -2586,13 +2601,13 @@ impl RenderTask {
                     // FIXME: Doing this in prepare_interned_prim_for_render
                     // doesn't seem to be enough, where should it be done?
                     let filter_data = &mut data_stores.filter_data[handle];
-                    filter_data.write_gpu_blocks(gpu_buffer);
-                    // ComponentTransfer has a gpu buffer address that we need to
+                    filter_data.update(gpu_cache);
+                    // ComponentTransfer has a gpu_cache_handle that we need to
                     // pass along
                     task_id = rg_builder.add().init(RenderTask::new_dynamic(
                         node_task_size,
                         RenderTaskKind::SVGFENode(
-                            SVGFEFilterTask {
+                            SVGFEFilterTask{
                                 node: FilterGraphNode{
                                     kept_by_optimizer: true,
                                     linear: node.linear,
@@ -2602,7 +2617,7 @@ impl RenderTask {
                                 },
                                 op: op.clone(),
                                 content_origin: node_task_rect.min,
-                                extra_gpu_data: Some(filter_data.gpu_buffer_address),
+                                extra_gpu_cache_handle: Some(filter_data.gpu_cache_handle),
                             }
                         ),
                     ).with_uv_rect_kind(node_uv_rect_kind));
@@ -2634,7 +2649,7 @@ impl RenderTask {
                                 },
                                 op: op.clone(),
                                 content_origin: node_task_rect.min,
-                                extra_gpu_data: None,
+                                extra_gpu_cache_handle: None,
                             }
                         ),
                     ).with_uv_rect_kind(node_uv_rect_kind));
@@ -2675,8 +2690,8 @@ impl RenderTask {
         self.uv_rect_kind
     }
 
-    pub fn get_texture_address(&self) -> GpuBufferAddress {
-        self.uv_rect_handle
+    pub fn get_texture_address(&self, gpu_cache: &GpuCache) -> GpuCacheAddress {
+        gpu_cache.get_address(&self.uv_rect_handle)
     }
 
     pub fn get_target_texture(&self) -> CacheTextureId {
@@ -2758,11 +2773,11 @@ impl RenderTask {
     pub fn write_gpu_blocks(
         &mut self,
         target_rect: DeviceIntRect,
-        gpu_buffer: &mut GpuBufferBuilder,
+        gpu_cache: &mut GpuCache,
     ) {
         profile_scope!("write_gpu_blocks");
 
-        self.kind.write_gpu_blocks(gpu_buffer);
+        self.kind.write_gpu_blocks(gpu_cache);
 
         if self.cache_handle.is_some() {
             // The uv rect handle of cached render tasks is requested and set by the
@@ -2770,16 +2785,17 @@ impl RenderTask {
             return;
         }
 
-        let p0 = target_rect.min.to_f32();
-        let p1 = target_rect.max.to_f32();
-        let image_source = ImageSource {
-            p0,
-            p1,
-            user_data: [0.0; 4],
-            uv_rect_kind: self.uv_rect_kind,
-        };
-
-        self.uv_rect_handle = image_source.write_gpu_blocks(&mut gpu_buffer.f32);
+        if let Some(mut request) = gpu_cache.request(&mut self.uv_rect_handle) {
+            let p0 = target_rect.min.to_f32();
+            let p1 = target_rect.max.to_f32();
+            let image_source = ImageSource {
+                p0,
+                p1,
+                user_data: [0.0; 4],
+                uv_rect_kind: self.uv_rect_kind,
+            };
+            image_source.write_gpu_blocks(&mut request);
+        }
     }
 
     /// Called by the render task cache.
diff --git a/gfx/wr/webrender/src/render_task_cache.rs b/gfx/wr/webrender/src/render_task_cache.rs
@@ -9,6 +9,7 @@ use crate::border::BorderSegmentCacheKey;
 use crate::box_shadow::BoxShadowCacheKey;
 use crate::device::TextureFilter;
 use crate::freelist::{FreeList, FreeListHandle, WeakFreeListHandle};
+use crate::gpu_cache::GpuCache;
 use crate::internal_types::FastHashMap;
 use crate::prim_store::image::ImageCacheKey;
 use crate::prim_store::gradient::{
@@ -164,7 +165,7 @@ impl RenderTaskCache {
         size: DeviceIntSize,
         render_task: &mut RenderTask,
         entry: &mut RenderTaskCacheEntry,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         texture_cache: &mut TextureCache,
     ) {
         // Find out what size to alloc in the texture cache.
@@ -198,7 +199,7 @@ impl RenderTaskCache {
             None,
             entry.user_data.unwrap_or([0.0; 4]),
             DirtyRect::All,
-            gpu_buffer,
+            gpu_cache,
             None,
             render_task.uv_rect_kind(),
             Eviction::Auto,
@@ -229,20 +230,22 @@ impl RenderTaskCache {
         texture_cache: &mut TextureCache,
         is_opaque: bool,
         parent: RenderTaskParent,
+        gpu_cache: &mut GpuCache,
         gpu_buffer_builder: &mut GpuBufferBuilderF,
         rg_builder: &mut RenderTaskGraphBuilder,
         surface_builder: &mut SurfaceBuilder,
-        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId,
+        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId,
     ) -> RenderTaskId {
         // If this render task cache is being drawn this frame, ensure we hook up the
         // render task for it as a dependency of any render task that uses this as
         // an input source.
         let (task_id, rendered_this_frame) = match key {
-            None => (f(rg_builder, gpu_buffer_builder), true),
+            None => (f(rg_builder, gpu_buffer_builder, gpu_cache), true),
             Some(key) => self.request_render_task_impl(
                 key,
                 is_opaque,
                 texture_cache,
+                gpu_cache,
                 gpu_buffer_builder,
                 rg_builder,
                 f
@@ -281,9 +284,10 @@ impl RenderTaskCache {
         key: RenderTaskCacheKey,
         is_opaque: bool,
         texture_cache: &mut TextureCache,
+        gpu_cache: &mut GpuCache,
         gpu_buffer_builder: &mut GpuBufferBuilderF,
         rg_builder: &mut RenderTaskGraphBuilder,
-        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId,
+        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId,
     ) -> (RenderTaskId, bool) {
         let frame_id = self.frame_id;
         let size = key.size;
@@ -305,10 +309,10 @@ impl RenderTaskCache {
         cache_entry.frame_id = self.frame_id;
 
         // Check if this texture cache handle is valid.
-        if texture_cache.request(&cache_entry.handle, gpu_buffer_builder) {
+        if texture_cache.request(&cache_entry.handle, gpu_cache) {
             // Invoke user closure to get render task chain
             // to draw this into the texture cache.
-            let render_task_id = f(rg_builder, gpu_buffer_builder);
+            let render_task_id = f(rg_builder, gpu_buffer_builder, gpu_cache);
 
             cache_entry.user_data = None;
             cache_entry.is_opaque = is_opaque;
@@ -324,7 +328,7 @@ impl RenderTaskCache {
                 task_size,
                 render_task,
                 cache_entry,
-                gpu_buffer_builder,
+                gpu_cache,
                 texture_cache,
             );
         }
diff --git a/gfx/wr/webrender/src/render_task_graph.rs b/gfx/wr/webrender/src/render_task_graph.rs
@@ -9,13 +9,12 @@
 
 use api::units::*;
 use api::ImageFormat;
+use crate::gpu_cache::{GpuCache, GpuCacheAddress};
 use crate::internal_types::{TextureSource, CacheTextureId, FastHashMap, FastHashSet, FrameId};
 use crate::internal_types::size_of_frame_vec;
 use crate::render_task::{StaticRenderTaskSurface, RenderTaskLocation, RenderTask};
 use crate::render_target::RenderTargetKind;
 use crate::render_task::{RenderTaskData, RenderTaskKind};
-use crate::renderer::GpuBufferAddress;
-use crate::renderer::GpuBufferBuilder;
 use crate::resource_cache::ResourceCache;
 use crate::texture_pack::GuillotineAllocator;
 use crate::prim_store::DeferredResolve;
@@ -281,7 +280,7 @@ impl RenderTaskGraphBuilder {
     pub fn end_frame(
         &mut self,
         resource_cache: &mut ResourceCache,
-        gpu_buffers: &mut GpuBufferBuilder,
+        gpu_cache: &mut GpuCache,
         deferred_resolves: &mut FrameVec<DeferredResolve>,
         max_shared_surface_size: i32,
         memory: &FrameMemory,
@@ -631,7 +630,7 @@ impl RenderTaskGraphBuilder {
                 Some(resolve_image(
                     info.request,
                     resource_cache,
-                    &mut gpu_buffers.f32,
+                    gpu_cache,
                     deferred_resolves,
                     info.is_composited,
                 ))
@@ -661,7 +660,7 @@ impl RenderTaskGraphBuilder {
 
             task.write_gpu_blocks(
                 target_rect,
-                gpu_buffers,
+                gpu_cache,
             );
 
             graph.task_data.push(
@@ -724,14 +723,16 @@ impl RenderTaskGraph {
     pub fn resolve_location(
         &self,
         task_id: impl Into<Option<RenderTaskId>>,
-    ) -> Option<(GpuBufferAddress, TextureSource)> {
-        self.resolve_impl(task_id.into()?)
+        gpu_cache: &GpuCache,
+    ) -> Option<(GpuCacheAddress, TextureSource)> {
+        self.resolve_impl(task_id.into()?, gpu_cache)
     }
 
     fn resolve_impl(
         &self,
         task_id: RenderTaskId,
-    ) -> Option<(GpuBufferAddress, TextureSource)> {
+        gpu_cache: &GpuCache,
+    ) -> Option<(GpuCacheAddress, TextureSource)> {
         let task = &self[task_id];
         let texture_source = task.get_texture_source();
 
@@ -739,7 +740,7 @@ impl RenderTaskGraph {
             return None;
         }
 
-        let uv_address = task.get_texture_address();
+        let uv_address = task.get_texture_address(gpu_cache);
 
         Some((uv_address, texture_source))
     }
@@ -1094,20 +1095,19 @@ impl RenderTaskGraphBuilder {
         total_surface_count: usize,
         unique_surfaces: &[(i32, i32, ImageFormat)],
     ) {
-        use crate::{internal_types::FrameStamp, renderer::{GpuBufferBuilderF, GpuBufferBuilderI}};
+        use crate::internal_types::FrameStamp;
         use api::{DocumentId, IdNamespace};
 
         let mut rc = ResourceCache::new_for_testing();
+        let mut gc =  GpuCache::new();
 
         let mut frame_stamp = FrameStamp::first(DocumentId::new(IdNamespace(1), 1));
         frame_stamp.advance();
+        gc.prepare_for_frames();
+        gc.begin_frame(frame_stamp);
 
         let frame_memory = FrameMemory::fallback();
-        let mut gpu_buffers = GpuBufferBuilder {
-            f32: GpuBufferBuilderF::new(&frame_memory),
-            i32: GpuBufferBuilderI::new(&frame_memory),
-        };
-        let g = self.end_frame(&mut rc, &mut gpu_buffers, &mut frame_memory.new_vec(), 2048, &frame_memory);
+        let g = self.end_frame(&mut rc, &mut gc, &mut frame_memory.new_vec(), 2048, &frame_memory);
         g.print();
 
         assert_eq!(g.passes.len(), pass_count);
diff --git a/gfx/wr/webrender/src/renderer/gpu_buffer.rs b/gfx/wr/webrender/src/renderer/gpu_buffer.rs
@@ -11,8 +11,6 @@
 
  */
 
-use std::i32;
-
 use crate::gpu_types::UvRectKind;
 use crate::internal_types::{FrameMemory, FrameVec};
 use crate::renderer::MAX_VERTEX_TEXTURE_WIDTH;
@@ -72,44 +70,24 @@ pub struct GpuBufferBlockI {
     data: [i32; 4],
 }
 
-// TODO(gw): Temporarily encode GPU Cache addresses as a single int.
-//           In the future, we can change the PrimitiveInstanceData struct
-//           to use 2x u16 for the vertex attribute instead of an i32.
-#[repr(transparent)]
 #[derive(Copy, Debug, Clone, MallocSizeOf, Eq, PartialEq)]
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
-pub struct GpuBufferAddress(u32);
+pub struct GpuBufferAddress {
+    pub u: u16,
+    pub v: u16,
+}
 
 impl GpuBufferAddress {
-    pub fn new(u: u16, v: u16) -> Self {
-        GpuBufferAddress(
-            v as u32 * MAX_VERTEX_TEXTURE_WIDTH as u32 + u as u32
-        )
-    }
-
-    pub fn as_u32(self) -> u32 {
-        self.0
-    }
-
-    pub fn from_u32(val: u32) -> Self {
-        GpuBufferAddress(val)
-    }
-
     #[allow(dead_code)]
     pub fn as_int(self) -> i32 {
-        self.0 as i32
+        // TODO(gw): Temporarily encode GPU Cache addresses as a single int.
+        //           In the future, we can change the PrimitiveInstanceData struct
+        //           to use 2x u16 for the vertex attribute instead of an i32.
+        self.v as i32 * MAX_VERTEX_TEXTURE_WIDTH as i32 + self.u as i32
     }
 
-    #[allow(dead_code)]
-    pub fn uv(self) -> (u16, u16) {
-        (
-            (self.0 as usize % MAX_VERTEX_TEXTURE_WIDTH) as u16,
-            (self.0 as usize / MAX_VERTEX_TEXTURE_WIDTH) as u16,
-        )
-    }
-
-    pub const INVALID: GpuBufferAddress = GpuBufferAddress(u32::MAX - 1);
+    pub const INVALID: GpuBufferAddress = GpuBufferAddress { u: !0, v: !0 };
 }
 
 impl GpuBufferBlockF {
@@ -238,7 +216,7 @@ pub struct GpuBufferWriter<'a, T> {
     buffer: &'a mut FrameVec<T>,
     deferred: &'a mut Vec<DeferredBlock>,
     index: usize,
-    max_block_count: usize,
+    block_count: usize,
 }
 
 impl<'a, T> GpuBufferWriter<'a, T> where T: Texel {
@@ -246,13 +224,13 @@ impl<'a, T> GpuBufferWriter<'a, T> where T: Texel {
         buffer: &'a mut FrameVec<T>,
         deferred: &'a mut Vec<DeferredBlock>,
         index: usize,
-        max_block_count: usize,
+        block_count: usize,
     ) -> Self {
         GpuBufferWriter {
             buffer,
             deferred,
             index,
-            max_block_count,
+            block_count,
         }
     }
 
@@ -280,15 +258,18 @@ impl<'a, T> GpuBufferWriter<'a, T> where T: Texel {
 
     /// Close this writer, returning the GPU address of this set of block(s).
     pub fn finish(self) -> GpuBufferAddress {
-        assert!(self.buffer.len() <= self.index + self.max_block_count);
+        assert_eq!(self.buffer.len(), self.index + self.block_count);
 
-        GpuBufferAddress(self.index as u32)
+        GpuBufferAddress {
+            u: (self.index % MAX_VERTEX_TEXTURE_WIDTH) as u16,
+            v: (self.index / MAX_VERTEX_TEXTURE_WIDTH) as u16,
+        }
     }
 }
 
 impl<'a, T> Drop for GpuBufferWriter<'a, T> {
     fn drop(&mut self) {
-        assert!(self.buffer.len() <= self.index + self.max_block_count, "Attempt to write too many GpuBuffer blocks");
+        assert_eq!(self.buffer.len(), self.index + self.block_count, "Claimed block_count was not written");
     }
 }
 
@@ -326,17 +307,20 @@ impl<T> GpuBufferBuilderImpl<T> where T: Texel + std::convert::From<DeviceIntRec
 
         self.data.extend_from_slice(blocks);
 
-        GpuBufferAddress(index as u32)
+        GpuBufferAddress {
+            u: (index % MAX_VERTEX_TEXTURE_WIDTH) as u16,
+            v: (index / MAX_VERTEX_TEXTURE_WIDTH) as u16,
+        }
     }
 
     /// Begin writing a specific number of blocks
     pub fn write_blocks(
         &mut self,
-        max_block_count: usize,
+        block_count: usize,
     ) -> GpuBufferWriter<T> {
-        assert!(max_block_count <= MAX_VERTEX_TEXTURE_WIDTH);
+        assert!(block_count <= MAX_VERTEX_TEXTURE_WIDTH);
 
-        if (self.data.len() % MAX_VERTEX_TEXTURE_WIDTH) + max_block_count > MAX_VERTEX_TEXTURE_WIDTH {
+        if (self.data.len() % MAX_VERTEX_TEXTURE_WIDTH) + block_count > MAX_VERTEX_TEXTURE_WIDTH {
             while self.data.len() % MAX_VERTEX_TEXTURE_WIDTH != 0 {
                 self.data.push(T::default());
             }
@@ -348,23 +332,10 @@ impl<T> GpuBufferBuilderImpl<T> where T: Texel + std::convert::From<DeviceIntRec
             &mut self.data,
             &mut self.deferred,
             index,
-            max_block_count,
+            block_count,
         )
     }
 
-    // Reserve space in the gpu buffer for data that will be written by the
-    // renderer.
-    pub fn reserve_renderer_deferred_blocks(&mut self, block_count: usize) -> GpuBufferAddress {
-        let index = self.data.len();
-
-        self.data.reserve(block_count);
-        for _ in 0 ..block_count {
-            self.data.push(Default::default());
-        }
-
-        GpuBufferAddress(index as u32)
-    }
-
     pub fn finalize(
         mut self,
         render_tasks: &RenderTaskGraph,
diff --git a/gfx/wr/webrender/src/renderer/gpu_cache.rs b/gfx/wr/webrender/src/renderer/gpu_cache.rs
@@ -0,0 +1,541 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use std::{cmp, mem};
+use api::units::*;
+use malloc_size_of::MallocSizeOfOps;
+use crate::{
+    device::{CustomVAO, Device, DrawTarget, Program, ReadTarget, Texture, TextureFilter, UploadPBOPool, VBO},
+    gpu_cache::{GpuBlockData, GpuCacheUpdate, GpuCacheUpdateList},
+    internal_types::{FrameId, RenderTargetInfo, Swizzle},
+    prim_store::DeferredResolve,
+    profiler,
+    render_api::MemoryReport,
+};
+
+/// Enabling this toggle would force the GPU cache scattered texture to
+/// be resized every frame, which enables GPU debuggers to see if this
+/// is performed correctly.
+const GPU_CACHE_RESIZE_TEST: bool = false;
+
+/// Tracks the state of each row in the GPU cache texture.
+struct CacheRow {
+    /// Mirrored block data on CPU for this row. We store a copy of
+    /// the data on the CPU side to improve upload batching.
+    cpu_blocks: Box<[GpuBlockData; super::MAX_VERTEX_TEXTURE_WIDTH]>,
+    /// The first offset in this row that is dirty.
+    min_dirty: u16,
+    /// One past the last offset in this row that is dirty (exclusive end).
+    max_dirty: u16,
+}
+
+impl CacheRow {
+    fn new() -> Self {
+        CacheRow {
+            cpu_blocks: Box::new([GpuBlockData::EMPTY; super::MAX_VERTEX_TEXTURE_WIDTH]),
+            min_dirty: super::MAX_VERTEX_TEXTURE_WIDTH as _,
+            max_dirty: 0,
+        }
+    }
+
+    fn is_dirty(&self) -> bool {
+        return self.min_dirty < self.max_dirty;
+    }
+
+    fn clear_dirty(&mut self) {
+        self.min_dirty = super::MAX_VERTEX_TEXTURE_WIDTH as _;
+        self.max_dirty = 0;
+    }
+
+    fn add_dirty(&mut self, block_offset: usize, block_count: usize) {
+        self.min_dirty = self.min_dirty.min(block_offset as _);
+        self.max_dirty = self.max_dirty.max((block_offset + block_count) as _);
+    }
+
+    fn dirty_blocks(&self) -> &[GpuBlockData] {
+        return &self.cpu_blocks[self.min_dirty as usize .. self.max_dirty as usize];
+    }
+}
+
+/// The bus over which CPU and GPU versions of the GPU cache
+/// get synchronized.
+enum GpuCacheBus {
+    /// PBO-based updates. These currently operate at row granularity
+    /// and are therefore subject to fragmentation issues.
+    PixelBuffer {
+        /// Per-row data.
+        rows: Vec<CacheRow>,
+    },
+    /// Shader-based scattering updates. Currently rendered by a set
+    /// of points into the GPU texture, each carrying a `GpuBlockData`.
+    Scatter {
+        /// Special program to run the scattered update.
+        program: Program,
+        /// VAO containing the source vertex buffers.
+        vao: CustomVAO,
+        /// VBO for positional data, supplied as normalized `u16`.
+        buf_position: VBO<[u16; 2]>,
+        /// VBO for gpu block data.
+        buf_value: VBO<GpuBlockData>,
+        /// Currently stored block count.
+        count: usize,
+    },
+}
+
+/// The device-specific representation of the cache texture in gpu_cache.rs
+pub struct GpuCacheTexture {
+    texture: Option<Texture>,
+    bus: GpuCacheBus,
+}
+
+impl GpuCacheTexture {
+    /// Ensures that we have an appropriately-sized texture.
+    fn ensure_texture(&mut self, device: &mut Device, height: i32) {
+        // If we already have a texture that works, we're done.
+        if self.texture.as_ref().map_or(false, |t| t.get_dimensions().height >= height) {
+            if GPU_CACHE_RESIZE_TEST {
+                // Special debug mode - resize the texture even though it's fine.
+            } else {
+                return;
+            }
+        }
+
+        // Take the old texture, if any.
+        let blit_source = self.texture.take();
+
+        // Create the new texture.
+        assert!(height >= 2, "Height is too small for ANGLE");
+        let new_size = DeviceIntSize::new(super::MAX_VERTEX_TEXTURE_WIDTH as _, height);
+        // GpuCacheBus::Scatter always requires the texture to be a render target. For
+        // GpuCacheBus::PixelBuffer, we only create the texture with a render target if
+        // RGBAF32 render targets are actually supported, and only if glCopyImageSubData
+        // is not. glCopyImageSubData does not require a render target to copy the texture
+        // data, and if neither RGBAF32 render targets nor glCopyImageSubData is supported,
+        // we simply re-upload the entire contents rather than copying upon resize.
+        let supports_copy_image_sub_data = device.get_capabilities().supports_copy_image_sub_data;
+        let supports_color_buffer_float = device.get_capabilities().supports_color_buffer_float;
+        let rt_info = if matches!(self.bus, GpuCacheBus::PixelBuffer { .. })
+            && (supports_copy_image_sub_data || !supports_color_buffer_float)
+        {
+            None
+        } else {
+            Some(RenderTargetInfo { has_depth: false })
+        };
+        let mut texture = device.create_texture(
+            api::ImageBufferKind::Texture2D,
+            api::ImageFormat::RGBAF32,
+            new_size.width,
+            new_size.height,
+            TextureFilter::Nearest,
+            rt_info,
+        );
+
+        // Copy the contents of the previous texture, if applicable.
+        if let Some(blit_source) = blit_source {
+            if !supports_copy_image_sub_data && !supports_color_buffer_float {
+                // Cannot copy texture, so must re-upload everything.
+                match self.bus {
+                    GpuCacheBus::PixelBuffer { ref mut rows } => {
+                        for row in rows {
+                            row.add_dirty(0, super::MAX_VERTEX_TEXTURE_WIDTH);
+                        }
+                    }
+                    GpuCacheBus::Scatter { .. } => {
+                        panic!("Texture must be copyable to use scatter GPU cache bus method");
+                    }
+                }
+            } else {
+                device.copy_entire_texture(&mut texture, &blit_source);
+            }
+            device.delete_texture(blit_source);
+        }
+
+        self.texture = Some(texture);
+    }
+
+    pub fn new(device: &mut Device, use_scatter: bool) -> Result<Self, super::RendererError> {
+        use super::desc::GPU_CACHE_UPDATE;
+
+        let bus = if use_scatter {
+            assert!(
+                device.get_capabilities().supports_color_buffer_float,
+                "GpuCache scatter method requires EXT_color_buffer_float",
+            );
+            let program = device.create_program_linked(
+                "gpu_cache_update",
+                &[],
+                &GPU_CACHE_UPDATE,
+            )?;
+            let buf_position = device.create_vbo();
+            let buf_value = device.create_vbo();
+            // Note: the vertex attributes have to be supplied in the same order
+            // as for program creation, but each is assigned to a different stream.
+            let vao = device.create_custom_vao(&[
+                buf_position.stream_with(&GPU_CACHE_UPDATE.vertex_attributes[0..1]),
+                buf_value   .stream_with(&GPU_CACHE_UPDATE.vertex_attributes[1..2]),
+            ]);
+            GpuCacheBus::Scatter {
+                program,
+                vao,
+                buf_position,
+                buf_value,
+                count: 0,
+            }
+        } else {
+            GpuCacheBus::PixelBuffer {
+                rows: Vec::new(),
+            }
+        };
+
+        Ok(GpuCacheTexture {
+            texture: None,
+            bus,
+        })
+    }
+
+    pub fn deinit(mut self, device: &mut Device) {
+        if let Some(t) = self.texture.take() {
+            device.delete_texture(t);
+        }
+        if let GpuCacheBus::Scatter { program, vao, buf_position, buf_value, .. } = self.bus {
+            device.delete_program(program);
+            device.delete_custom_vao(vao);
+            device.delete_vbo(buf_position);
+            device.delete_vbo(buf_value);
+        }
+    }
+
+    pub fn get_height(&self) -> i32 {
+        self.texture.as_ref().map_or(0, |t| t.get_dimensions().height)
+    }
+
+    #[cfg(feature = "capture")]
+    pub fn get_texture(&self) -> &Texture {
+        self.texture.as_ref().unwrap()
+    }
+
+    fn prepare_for_updates(
+        &mut self,
+        device: &mut Device,
+        total_block_count: usize,
+        max_height: i32,
+    ) {
+        self.ensure_texture(device, max_height);
+        match self.bus {
+            GpuCacheBus::PixelBuffer { .. } => {},
+            GpuCacheBus::Scatter {
+                ref mut buf_position,
+                ref mut buf_value,
+                ref mut count,
+                ..
+            } => {
+                *count = 0;
+                if total_block_count > buf_value.allocated_count() {
+                    device.allocate_vbo(buf_position, total_block_count, super::ONE_TIME_USAGE_HINT);
+                    device.allocate_vbo(buf_value,    total_block_count, super::ONE_TIME_USAGE_HINT);
+                }
+            }
+        }
+    }
+
+    pub fn invalidate(&mut self) {
+        match self.bus {
+            GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
+                info!("Invalidating GPU caches");
+                for row in rows {
+                    row.add_dirty(0, super::MAX_VERTEX_TEXTURE_WIDTH);
+                }
+            }
+            GpuCacheBus::Scatter { .. } => {
+                warn!("Unable to invalidate scattered GPU cache");
+            }
+        }
+    }
+
+    fn update(&mut self, device: &mut Device, updates: &GpuCacheUpdateList) {
+        match self.bus {
+            GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
+                for update in &updates.updates {
+                    match *update {
+                        GpuCacheUpdate::Copy {
+                            block_index,
+                            block_count,
+                            address,
+                        } => {
+                            let row = address.v as usize;
+
+                            // Ensure that the CPU-side shadow copy of the GPU cache data has enough
+                            // rows to apply this patch.
+                            while rows.len() <= row {
+                                // Add a new row.
+                                rows.push(CacheRow::new());
+                            }
+
+                            // Copy the blocks from the patch array in the shadow CPU copy.
+                            let block_offset = address.u as usize;
+                            let data = &mut rows[row].cpu_blocks;
+                            for i in 0 .. block_count {
+                                data[block_offset + i] = updates.blocks[block_index + i];
+                            }
+
+                            // This row is dirty (needs to be updated in GPU texture).
+                            rows[row].add_dirty(block_offset, block_count);
+                        }
+                    }
+                }
+            }
+            GpuCacheBus::Scatter {
+                ref buf_position,
+                ref buf_value,
+                ref mut count,
+                ..
+            } => {
+                // TODO: re-use this heap allocation.
+                // Unused positions will be left as 0xFFFF, which translates to
+                // (1.0, 1.0) in the vertex output position and gets culled out.
+                let mut position_data = vec![[!0u16; 2]; updates.blocks.len()];
+                let size = self.texture.as_ref().unwrap().get_dimensions().to_usize();
+
+                for update in &updates.updates {
+                    match *update {
+                        GpuCacheUpdate::Copy {
+                            block_index,
+                            block_count,
+                            address,
+                        } => {
+                            // Convert the absolute texel position into a normalized u16 coordinate.
+                            let y = ((2*address.v as usize + 1) << 15) / size.height;
+                            for i in 0 .. block_count {
+                                let x = ((2*address.u as usize + 2*i + 1) << 15) / size.width;
+                                position_data[block_index + i] = [x as _, y as _];
+                            }
+                        }
+                    }
+                }
+
+                device.fill_vbo(buf_value, &updates.blocks, *count);
+                device.fill_vbo(buf_position, &position_data, *count);
+                *count += position_data.len();
+            }
+        }
+    }
+
+    fn flush(&mut self, device: &mut Device, pbo_pool: &mut UploadPBOPool) -> usize {
+        let texture = self.texture.as_ref().unwrap();
+        match self.bus {
+            GpuCacheBus::PixelBuffer { ref mut rows } => {
+                let rows_dirty = rows
+                    .iter()
+                    .filter(|row| row.is_dirty())
+                    .count();
+                if rows_dirty == 0 {
+                    return 0
+                }
+
+                let mut uploader = device.upload_texture(pbo_pool);
+
+                for (row_index, row) in rows.iter_mut().enumerate() {
+                    if !row.is_dirty() {
+                        continue;
+                    }
+
+                    let blocks = row.dirty_blocks();
+                    let rect = DeviceIntRect::from_origin_and_size(
+                        DeviceIntPoint::new(row.min_dirty as i32, row_index as i32),
+                        DeviceIntSize::new(blocks.len() as i32, 1),
+                    );
+
+                    uploader.upload(device, texture, rect, None, None, blocks.as_ptr(), blocks.len());
+
+                    row.clear_dirty();
+                }
+
+                uploader.flush(device);
+
+                rows_dirty
+            }
+            GpuCacheBus::Scatter { ref program, ref vao, count, .. } => {
+                device.disable_depth();
+                device.set_blend(false);
+                device.bind_program(program);
+                device.bind_custom_vao(vao);
+                device.bind_draw_target(
+                    DrawTarget::from_texture(
+                        texture,
+                        false,
+                    ),
+                );
+                device.draw_nonindexed_points(0, count as _);
+                0
+            }
+        }
+    }
+
+    #[cfg(feature = "replay")]
+    pub fn remove_texture(&mut self, device: &mut Device) {
+        if let Some(t) = self.texture.take() {
+            device.delete_texture(t);
+        }
+    }
+
+    #[cfg(feature = "replay")]
+    pub fn load_from_data(&mut self, texture: Texture, data: Vec<u8>) {
+        assert!(self.texture.is_none());
+        match self.bus {
+            GpuCacheBus::PixelBuffer { ref mut rows, .. } => {
+                let dim = texture.get_dimensions();
+                let blocks = unsafe {
+                    std::slice::from_raw_parts(
+                        data.as_ptr() as *const GpuBlockData,
+                        data.len() / mem::size_of::<GpuBlockData>(),
+                    )
+                };
+                // fill up the CPU cache from the contents we just loaded
+                rows.clear();
+                rows.extend((0 .. dim.height).map(|_| CacheRow::new()));
+                let chunks = blocks.chunks(super::MAX_VERTEX_TEXTURE_WIDTH);
+                debug_assert_eq!(chunks.len(), rows.len());
+                for (row, chunk) in rows.iter_mut().zip(chunks) {
+                    row.cpu_blocks.copy_from_slice(chunk);
+                }
+            }
+            GpuCacheBus::Scatter { .. } => {}
+        }
+        self.texture = Some(texture);
+    }
+
+    pub fn report_memory_to(&self, report: &mut MemoryReport, size_op_funs: &MallocSizeOfOps) {
+        if let GpuCacheBus::PixelBuffer{ref rows, ..} = self.bus {
+            for row in rows.iter() {
+                report.gpu_cache_cpu_mirror += unsafe { (size_op_funs.size_of_op)(row.cpu_blocks.as_ptr() as *const _) };
+            }
+        }
+
+        // GPU cache GPU memory.
+        report.gpu_cache_textures +=
+            self.texture.as_ref().map_or(0, |t| t.size_in_bytes());
+    }
+
+    pub fn gpu_size_in_bytes(&self) -> usize {
+        match &self.texture {
+            Some(tex) => tex.size_in_bytes(),
+            None => 0,
+        }
+    }
+}
+
+impl super::Renderer {
+    pub fn update_gpu_cache(&mut self) {
+        let _gm = self.gpu_profiler.start_marker("gpu cache update");
+
+        // For an artificial stress test of GPU cache resizing,
+        // always pass an extra update list with at least one block in it.
+        let gpu_cache_height = self.gpu_cache_texture.get_height();
+        if gpu_cache_height != 0 && GPU_CACHE_RESIZE_TEST {
+            self.pending_gpu_cache_updates.push(GpuCacheUpdateList {
+                frame_id: FrameId::INVALID,
+                clear: false,
+                height: gpu_cache_height,
+                blocks: vec![[1f32; 4].into()],
+                updates: Vec::new(),
+                debug_commands: Vec::new(),
+            });
+        }
+
+        let (updated_blocks, max_requested_height) = self
+            .pending_gpu_cache_updates
+            .iter()
+            .fold((0, gpu_cache_height), |(count, height), list| {
+                (count + list.blocks.len(), cmp::max(height, list.height))
+            });
+
+        if max_requested_height > self.get_max_texture_size() && !self.gpu_cache_overflow {
+            self.gpu_cache_overflow = true;
+            self.renderer_errors.push(super::RendererError::MaxTextureSize);
+        }
+
+        // Note: if we decide to switch to scatter-style GPU cache updates
+        // permanently, this code could be made nicer with a `BufferUploader`-style
+        // helper, similar to how the `TextureUploader` API is used.
+        self.gpu_cache_texture.prepare_for_updates(
+            &mut self.device,
+            updated_blocks,
+            max_requested_height,
+        );
+
+        for update_list in self.pending_gpu_cache_updates.drain(..) {
+            assert!(update_list.height <= max_requested_height);
+            if update_list.frame_id > self.gpu_cache_frame_id {
+                self.gpu_cache_frame_id = update_list.frame_id
+            }
+            self.gpu_cache_texture
+                .update(&mut self.device, &update_list);
+        }
+
+        self.profile.start_time(profiler::GPU_CACHE_UPLOAD_TIME);
+        let updated_rows = self.gpu_cache_texture.flush(
+            &mut self.device,
+            &mut self.texture_upload_pbo_pool
+        );
+        self.gpu_cache_upload_time += self.profile.end_time(profiler::GPU_CACHE_UPLOAD_TIME);
+
+        self.profile.set(profiler::GPU_CACHE_ROWS_UPDATED, updated_rows);
+        self.profile.set(profiler::GPU_CACHE_BLOCKS_UPDATED, updated_blocks);
+    }
+
+    pub fn prepare_gpu_cache(
+        &mut self,
+        deferred_resolves: &[DeferredResolve],
+    ) -> Result<(), super::RendererError> {
+        self.profile.start_time(profiler::GPU_CACHE_PREPARE_TIME);
+
+        if self.pending_gpu_cache_clear {
+            let use_scatter =
+                matches!(self.gpu_cache_texture.bus, GpuCacheBus::Scatter { .. });
+            let new_cache = match GpuCacheTexture::new(&mut self.device, use_scatter) {
+                Ok(cache) => cache,
+                Err(err) => {
+                    self.profile.end_time(profiler::GPU_CACHE_PREPARE_TIME);
+                    return Err(err);
+                }
+            };
+            let old_cache = mem::replace(&mut self.gpu_cache_texture, new_cache);
+            old_cache.deinit(&mut self.device);
+            self.pending_gpu_cache_clear = false;
+        }
+
+        let deferred_update_list = self.update_deferred_resolves(deferred_resolves);
+        self.pending_gpu_cache_updates.extend(deferred_update_list);
+
+        self.update_gpu_cache();
+
+        // Note: the texture might have changed during the `update`,
+        // so we need to bind it here.
+        self.device.bind_texture(
+            super::TextureSampler::GpuCache,
+            self.gpu_cache_texture.texture.as_ref().unwrap(),
+            Swizzle::default(),
+        );
+
+        self.profile.end_time(profiler::GPU_CACHE_PREPARE_TIME);
+
+        Ok(())
+    }
+
+    pub fn read_gpu_cache(&mut self) -> (DeviceIntSize, Vec<u8>) {
+        let texture = self.gpu_cache_texture.texture.as_ref().unwrap();
+        let size = device_size_as_framebuffer_size(texture.get_dimensions());
+        let mut texels = vec![0; (size.width * size.height * 16) as usize];
+        self.device.begin_frame();
+        self.device.bind_read_target(ReadTarget::from_texture(texture));
+        self.device.read_pixels_into(
+            size.into(),
+            api::ImageFormat::RGBAF32,
+            &mut texels,
+        );
+        self.device.reset_read_target();
+        self.device.end_frame();
+        (texture.get_dimensions(), texels)
+    }
+}
diff --git a/gfx/wr/webrender/src/renderer/init.rs b/gfx/wr/webrender/src/renderer/init.rs
@@ -19,7 +19,7 @@ use crate::frame_builder::FrameBuilderConfig;
 use crate::glyph_cache::GlyphCache;
 use glyph_rasterizer::{GlyphRasterThread, GlyphRasterizer, SharedFontResources};
 use crate::gpu_types::PrimitiveInstanceData;
-use crate::internal_types::{FastHashMap, FastHashSet};
+use crate::internal_types::{FastHashMap, FastHashSet, FrameId};
 use crate::picture;
 use crate::profiler::{self, Profiler, TransactionProfile};
 use crate::device::query::{GpuProfiler, GpuDebugMethod};
@@ -29,7 +29,7 @@ use crate::scene_builder_thread::{SceneBuilderThread, SceneBuilderThreadChannels
 use crate::texture_cache::{TextureCache, TextureCacheConfig};
 use crate::picture_textures::PictureTextures;
 use crate::renderer::{
-    debug, vertex, gl,
+    debug, gpu_cache, vertex, gl,
     Renderer, DebugOverlayState, BufferDamageTracker, PipelineInfo, TextureResolver,
     RendererError, ShaderPrecacheFlags, VERTEX_DATA_TEXTURE_COUNT,
     upload::UploadTexturePool,
@@ -514,8 +514,25 @@ pub fn create_webrender_instance(
         vertex_data_textures.push(vertex::VertexDataTextures::new());
     }
 
+    // On some (mostly older, integrated) GPUs, the normal GPU texture cache update path
+    // doesn't work well when running on ANGLE, causing CPU stalls inside D3D and/or the
+    // GPU driver. See https://bugzilla.mozilla.org/show_bug.cgi?id=1576637 for much
+    // more detail. To reduce the number of code paths we have active that require testing,
+    // we will enable the GPU cache scatter update path on all devices running with ANGLE.
+    // We want a better solution long-term, but for now this is a significant performance
+    // improvement on HD4600 era GPUs, and shouldn't hurt performance in a noticeable
+    // way on other systems running under ANGLE.
     let is_software = device.get_capabilities().renderer_name.starts_with("Software");
 
+    // On other GL platforms, like macOS or Android, creating many PBOs is very inefficient.
+    // This is what happens in GPU cache updates in PBO path. Instead, we switch everything
+    // except software GL to use the GPU scattered updates.
+    let supports_scatter = device.get_capabilities().supports_color_buffer_float;
+    let gpu_cache_texture = gpu_cache::GpuCacheTexture::new(
+        &mut device,
+        supports_scatter && !is_software,
+    )?;
+
     device.end_frame();
 
     let backend_notifier = notifier.clone();
@@ -763,6 +780,8 @@ pub fn create_webrender_instance(
         pending_texture_updates: Vec::new(),
         pending_texture_cache_updates: false,
         pending_native_surface_updates: Vec::new(),
+        pending_gpu_cache_updates: Vec::new(),
+        pending_gpu_cache_clear: false,
         pending_shader_updates: Vec::new(),
         shaders,
         debug: debug::LazyInitializedDebugRenderer::new(),
@@ -770,6 +789,7 @@ pub fn create_webrender_instance(
         profile: TransactionProfile::new(),
         frame_counter: 0,
         resource_upload_time: 0.0,
+        gpu_cache_upload_time: 0.0,
         profiler: Profiler::new(),
         max_recorded_profiles: options.max_recorded_profiles,
         clear_color: options.clear_color,
@@ -788,6 +808,10 @@ pub fn create_webrender_instance(
         size_of_ops: make_size_of_ops(),
         cpu_profiles: VecDeque::new(),
         gpu_profiles: VecDeque::new(),
+        gpu_cache_texture,
+        gpu_cache_debug_chunks: Vec::new(),
+        gpu_cache_frame_id: FrameId::INVALID,
+        gpu_cache_overflow: false,
         texture_upload_pbo_pool,
         staging_texture_pool,
         texture_resolver,
diff --git a/gfx/wr/webrender/src/renderer/mod.rs b/gfx/wr/webrender/src/renderer/mod.rs
@@ -70,9 +70,11 @@ use crate::device::FBOId;
 use crate::debug_item::DebugItem;
 use crate::frame_builder::Frame;
 use glyph_rasterizer::GlyphFormat;
+use crate::gpu_cache::{GpuCacheUpdate, GpuCacheUpdateList};
+use crate::gpu_cache::{GpuCacheDebugChunk, GpuCacheDebugCmd};
 use crate::gpu_types::{ScalingInstance, SvgFilterInstance, SVGFEFilterInstance, CopyInstance, PrimitiveInstanceData};
 use crate::gpu_types::{BlurInstance, ClearInstance, CompositeInstance, ZBufferId};
-use crate::internal_types::{TextureSource, TextureSourceExternal, FrameVec};
+use crate::internal_types::{TextureSource, TextureSourceExternal, FrameId, FrameVec};
 #[cfg(any(feature = "capture", feature = "replay"))]
 use crate::internal_types::DebugOutput;
 use crate::internal_types::{CacheTextureId, FastHashMap, FastHashSet, RenderedDocument, ResultMsg};
@@ -120,6 +122,7 @@ use std::collections::hash_map::Entry;
 
 mod debug;
 mod gpu_buffer;
+mod gpu_cache;
 mod shade;
 mod vertex;
 mod upload;
@@ -129,7 +132,7 @@ pub use debug::DebugRenderer;
 pub use shade::{PendingShadersToPrecache, Shaders, SharedShaders};
 pub use vertex::{desc, VertexArrayKind, MAX_VERTEX_TEXTURE_WIDTH};
 pub use gpu_buffer::{GpuBuffer, GpuBufferF, GpuBufferBuilderF, GpuBufferI, GpuBufferBuilderI};
-pub use gpu_buffer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferWriterF, GpuBufferBlockF};
+pub use gpu_buffer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferWriterF};
 
 /// The size of the array of each type of vertex data texture that
 /// is round-robin-ed each frame during bind_frame_data. Doing this
@@ -387,6 +390,7 @@ pub(crate) enum TextureSampler {
     Color0,
     Color1,
     Color2,
+    GpuCache,
     TransformPalette,
     RenderTasks,
     Dither,
@@ -416,14 +420,15 @@ impl Into<TextureSlot> for TextureSampler {
             TextureSampler::Color0 => TextureSlot(0),
             TextureSampler::Color1 => TextureSlot(1),
             TextureSampler::Color2 => TextureSlot(2),
-            TextureSampler::TransformPalette => TextureSlot(3),
-            TextureSampler::RenderTasks => TextureSlot(4),
-            TextureSampler::Dither => TextureSlot(5),
-            TextureSampler::PrimitiveHeadersF => TextureSlot(6),
-            TextureSampler::PrimitiveHeadersI => TextureSlot(7),
-            TextureSampler::ClipMask => TextureSlot(8),
-            TextureSampler::GpuBufferF => TextureSlot(9),
-            TextureSampler::GpuBufferI => TextureSlot(10),
+            TextureSampler::GpuCache => TextureSlot(3),
+            TextureSampler::TransformPalette => TextureSlot(4),
+            TextureSampler::RenderTasks => TextureSlot(5),
+            TextureSampler::Dither => TextureSlot(6),
+            TextureSampler::PrimitiveHeadersF => TextureSlot(7),
+            TextureSampler::PrimitiveHeadersI => TextureSlot(8),
+            TextureSampler::ClipMask => TextureSlot(9),
+            TextureSampler::GpuBufferF => TextureSlot(10),
+            TextureSampler::GpuBufferI => TextureSlot(11),
         }
     }
 }
@@ -820,6 +825,8 @@ pub struct Renderer {
     /// True if there are any TextureCacheUpdate pending.
     pending_texture_cache_updates: bool,
     pending_native_surface_updates: Vec<NativeSurfaceOperation>,
+    pending_gpu_cache_updates: Vec<GpuCacheUpdateList>,
+    pending_gpu_cache_clear: bool,
     pending_shader_updates: Vec<PathBuf>,
     active_documents: FastHashMap<DocumentId, RenderedDocument>,
 
@@ -838,6 +845,7 @@ pub struct Renderer {
     profile: TransactionProfile,
     frame_counter: u64,
     resource_upload_time: f64,
+    gpu_cache_upload_time: f64,
     profiler: Profiler,
     #[cfg(feature = "debugger")]
     debugger: Debugger,
@@ -847,9 +855,18 @@ pub struct Renderer {
     pub gpu_profiler: GpuProfiler,
     vaos: vertex::RendererVAOs,
 
+    gpu_cache_texture: gpu_cache::GpuCacheTexture,
     vertex_data_textures: Vec<vertex::VertexDataTextures>,
     current_vertex_data_textures: usize,
 
+    /// When the GPU cache debugger is enabled, we keep track of the live blocks
+    /// in the GPU cache so that we can use them for the debug display. This
+    /// member stores those live blocks, indexed by row.
+    gpu_cache_debug_chunks: Vec<Vec<GpuCacheDebugChunk>>,
+
+    gpu_cache_frame_id: FrameId,
+    gpu_cache_overflow: bool,
+
     pipeline_info: PipelineInfo,
 
     // Manages and resolves source textures IDs to real texture IDs.
@@ -1102,6 +1119,32 @@ impl Renderer {
                     self.pending_native_surface_updates.extend(resource_update_list.native_surface_updates);
                     self.documents_seen.insert(document_id);
                 }
+                ResultMsg::UpdateGpuCache(mut list) => {
+                    if list.clear {
+                        self.pending_gpu_cache_clear = true;
+                    }
+                    if list.clear {
+                        self.gpu_cache_debug_chunks = Vec::new();
+                    }
+                    for cmd in mem::replace(&mut list.debug_commands, Vec::new()) {
+                        match cmd {
+                            GpuCacheDebugCmd::Alloc(chunk) => {
+                                let row = chunk.address.v as usize;
+                                if row >= self.gpu_cache_debug_chunks.len() {
+                                    self.gpu_cache_debug_chunks.resize(row + 1, Vec::new());
+                                }
+                                self.gpu_cache_debug_chunks[row].push(chunk);
+                            },
+                            GpuCacheDebugCmd::Free(address) => {
+                                let chunks = &mut self.gpu_cache_debug_chunks[address.v as usize];
+                                let pos = chunks.iter()
+                                    .position(|x| x.address == address).unwrap();
+                                chunks.remove(pos);
+                            },
+                        }
+                    }
+                    self.pending_gpu_cache_updates.push(list);
+                }
                 ResultMsg::UpdateResources {
                     resource_updates,
                     memory_pressure,
@@ -1326,6 +1369,9 @@ impl Renderer {
             | DebugCommand::SimulateLongSceneBuild(_)
             | DebugCommand::EnableNativeCompositor(_)
             | DebugCommand::SetBatchingLookback(_) => {}
+            DebugCommand::InvalidateGpuCache => {
+                self.gpu_cache_texture.invalidate();
+            }
             DebugCommand::SetFlags(flags) => {
                 self.set_debug_flags(flags);
             }
@@ -1461,6 +1507,7 @@ impl Renderer {
             DebugFlags::RENDER_TARGET_DBG |
             DebugFlags::TEXTURE_CACHE_DBG |
             DebugFlags::EPOCHS |
+            DebugFlags::GPU_CACHE_DBG |
             DebugFlags::PICTURE_CACHING_DBG |
             DebugFlags::PICTURE_BORDERS |
             DebugFlags::ZOOM_DBG |
@@ -1701,30 +1748,39 @@ impl Renderer {
                     "Cleared texture cache without sending new document frame.");
         }
 
-        self.update_deferred_resolves(&frame.deferred_resolves, &mut frame.gpu_buffer_f);
+        match self.prepare_gpu_cache(&frame.deferred_resolves) {
+            Ok(..) => {
+                assert!(frame.gpu_cache_frame_id <= self.gpu_cache_frame_id,
+                    "Received frame depends on a later GPU cache epoch ({:?}) than one we received last via `UpdateGpuCache` ({:?})",
+                    frame.gpu_cache_frame_id, self.gpu_cache_frame_id);
 
-        self.draw_frame(
-            frame,
-            device_size,
-            buffer_age,
-            &mut results,
-        );
+                self.draw_frame(
+                    frame,
+                    device_size,
+                    buffer_age,
+                    &mut results,
+                );
 
-        // TODO(nical): do this automatically by selecting counters in the wr profiler
-        // Profile marker for the number of invalidated picture cache
-        if thread_is_being_profiled() {
-            let duration = Duration::new(0,0);
-            if let Some(n) = self.profile.get(profiler::RENDERED_PICTURE_TILES) {
-                let message = (n as usize).to_string();
-                add_text_marker("NumPictureCacheInvalidated", &message, duration);
-            }
-        }
+                // TODO(nical): do this automatically by selecting counters in the wr profiler
+                // Profile marker for the number of invalidated picture cache
+                if thread_is_being_profiled() {
+                    let duration = Duration::new(0,0);
+                    if let Some(n) = self.profile.get(profiler::RENDERED_PICTURE_TILES) {
+                        let message = (n as usize).to_string();
+                        add_text_marker("NumPictureCacheInvalidated", &message, duration);
+                    }
+                }
 
-        if device_size.is_some() {
-            self.draw_frame_debug_items(&frame.debug_items);
-        }
+                if device_size.is_some() {
+                    self.draw_frame_debug_items(&frame.debug_items);
+                }
 
-        self.profile.merge(profile);
+                self.profile.merge(profile);
+            }
+            Err(e) => {
+                self.renderer_errors.push(e);
+            }
+        }
 
         self.unlock_external_images(&frame.deferred_resolves);
 
@@ -1745,6 +1801,7 @@ impl Renderer {
             self.bind_debug_overlay(device_size).map(|draw_target| {
                 self.draw_render_target_debug(&draw_target);
                 self.draw_texture_cache_debug(&draw_target);
+                self.draw_gpu_cache_debug(device_size);
                 self.draw_zoom_debug(device_size);
                 self.draw_epoch_debug();
                 self.draw_window_visibility_debug();
@@ -1792,6 +1849,8 @@ impl Renderer {
         self.frame_counter += 1;
         results.stats.resource_upload_time = self.resource_upload_time;
         self.resource_upload_time = 0.0;
+        results.stats.gpu_cache_upload_time = self.gpu_cache_upload_time;
+        self.gpu_cache_upload_time = 0.0;
 
         if let Some(stats) = active_doc.frame_stats.take() {
           // Copy the full frame stats to RendererStats
@@ -4886,23 +4945,28 @@ impl Renderer {
         }
     }
 
-    fn update_deferred_resolves(
-        &mut self,
-        deferred_resolves: &[DeferredResolve],
-        gpu_buffer: &mut GpuBufferF,
-    ) {
+    fn update_deferred_resolves(&mut self, deferred_resolves: &[DeferredResolve]) -> Option<GpuCacheUpdateList> {
         // The first thing we do is run through any pending deferred
         // resolves, and use a callback to get the UV rect for this
         // custom item. Then we patch the resource_rects structure
         // here before it's uploaded to the GPU.
         if deferred_resolves.is_empty() {
-            return;
+            return None;
         }
 
         let handler = self.external_image_handler
             .as_mut()
             .expect("Found external image, but no handler set!");
 
+        let mut list = GpuCacheUpdateList {
+            frame_id: FrameId::INVALID,
+            clear: false,
+            height: self.gpu_cache_texture.get_height(),
+            blocks: Vec::new(),
+            updates: Vec::new(),
+            debug_commands: Vec::new(),
+        };
+
         for (i, deferred_resolve) in deferred_resolves.iter().enumerate() {
             self.gpu_profiler.place_marker("deferred resolve");
             let props = &deferred_resolve.image_properties;
@@ -4955,11 +5019,16 @@ impl Renderer {
                 .external_images
                 .insert(DeferredResolveIndex(i as u32), texture);
 
-            let addr = deferred_resolve.address;
-            let index = addr.as_u32() as usize;
-            gpu_buffer.data[index] = image.uv.to_array().into();
-            gpu_buffer.data[index + 1] = [0f32; 4].into();
+            list.updates.push(GpuCacheUpdate::Copy {
+                block_index: list.blocks.len(),
+                block_count: BLOCKS_PER_UV_RECT,
+                address: deferred_resolve.address,
+            });
+            list.blocks.push(image.uv.into());
+            list.blocks.push([0f32; 4].into());
         }
+
+        Some(list)
     }
 
     fn unlock_external_images(
@@ -5252,6 +5321,10 @@ impl Renderer {
         let gpu_buffer_mb = (gpu_buffer_bytes_f + gpu_buffer_bytes_i) as f32 * bytes_to_mb;
         self.profile.set(profiler::GPU_BUFFER_MEM, gpu_buffer_mb);
 
+        let gpu_cache_bytes = self.gpu_cache_texture.gpu_size_in_bytes();
+        let gpu_cache_mb = gpu_cache_bytes as f32 * bytes_to_mb;
+        self.profile.set(profiler::GPU_CACHE_MEM, gpu_cache_mb);
+
         // Determine the present mode and dirty rects, if device_size
         // is Some(..). If it's None, no composite will occur and only
         // picture cache and texture cache targets will be updated.
@@ -5959,6 +6032,42 @@ impl Renderer {
         }
     }
 
+    /// Draws the GPU cache debug overlay: a translucent grey backdrop with one
+    /// red strip per chunk recorded in `gpu_cache_debug_chunks`.
+    ///
+    /// Returns early unless `DebugFlags::GPU_CACHE_DBG` is set and a debug renderer exists.
+    fn draw_gpu_cache_debug(&mut self, device_size: DeviceIntSize) {
+        if !self.debug_flags.contains(DebugFlags::GPU_CACHE_DBG) {
+            return;
+        }
+
+        let debug_renderer = match self.debug.get_mut(&mut self.device) {
+            Some(render) => render,
+            None => return,
+        };
+
+        let (x_off, y_off) = (30f32, 30f32);
+        // Leave a `y_off` margin above and below. Clamp at zero: on a window
+        // shorter than both margins the difference is negative, and casting
+        // a negative i32 to usize would wrap around to an enormous height.
+        let height = self.gpu_cache_texture.get_height()
+            .min(device_size.height - (y_off as i32) * 2)
+            .max(0) as usize;
+        let (x_max, y_max) = (x_off + MAX_VERTEX_TEXTURE_WIDTH as f32, y_off + height as f32);
+        let backdrop = ColorU::new(80, 80, 80, 80);
+        debug_renderer.add_quad(x_off, y_off, x_max, y_max, backdrop, backdrop);
+
+        // Only rows that fall inside the backdrop are drawn.
+        let upper = self.gpu_cache_debug_chunks.len().min(height);
+        let color = ColorU::new(250, 0, 0, 200);
+        for chunk in self.gpu_cache_debug_chunks[0..upper].iter().flatten() {
+            // Each chunk is a strip one row tall and `chunk.size` wide.
+            let x = x_off + chunk.address.u as f32;
+            let y = y_off + chunk.address.v as f32;
+            debug_renderer.add_quad(x, y, x + chunk.size as f32, y + 1.0, color, color);
+        }
+    }
+
     /// Pass-through to `Device::read_pixels_into`, used by Gecko's WR bindings.
     pub fn read_pixels_into(&mut self, rect: FramebufferIntRect, format: ImageFormat, output: &mut [u8]) {
         self.device.read_pixels_into(rect, format, output);
@@ -5986,6 +6095,7 @@ impl Renderer {
             }
             compositor.deinit(&mut self.device);
         }
+        self.gpu_cache_texture.deinit(&mut self.device);
         if let Some(dither_matrix_texture) = self.dither_matrix_texture {
             self.device.delete_texture(dither_matrix_texture);
         }
@@ -6026,6 +6136,9 @@ impl Renderer {
     pub fn report_memory(&self, swgl: *mut c_void) -> MemoryReport {
         let mut report = MemoryReport::default();
 
+        // GPU cache CPU memory.
+        self.gpu_cache_texture.report_memory_to(&mut report, self.size_of_ops.as_ref().unwrap());
+
         self.staging_texture_pool.report_memory_to(&mut report, self.size_of_ops.as_ref().unwrap());
 
         // Render task CPU memory.
@@ -6142,6 +6255,7 @@ pub struct RendererStats {
     pub color_target_count: usize,
     pub texture_upload_mb: f64,
     pub resource_upload_time: f64,
+    pub gpu_cache_upload_time: f64,
     pub gecko_display_list_time: f64,
     pub wr_display_list_time: f64,
     pub scene_build_time: f64,
@@ -6201,6 +6315,8 @@ struct PlainTexture {
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 struct PlainRenderer {
     device_size: Option<DeviceIntSize>,
+    gpu_cache: PlainTexture,
+    gpu_cache_frame_id: FrameId,
     textures: FastHashMap<CacheTextureId, PlainTexture>,
 }
 
@@ -6433,8 +6549,15 @@ impl Renderer {
                 fs::create_dir(&path_textures).unwrap();
             }
 
+            info!("saving GPU cache");
+            self.update_gpu_cache(); // flush pending updates
             let mut plain_self = PlainRenderer {
                 device_size: self.device_size,
+                gpu_cache: Self::save_texture(
+                    self.gpu_cache_texture.get_texture(),
+                    None, "gpu", &root, &mut self.device,
+                ),
+                gpu_cache_frame_id: self.gpu_cache_frame_id,
                 textures: FastHashMap::default(),
             };
 
@@ -6543,6 +6666,7 @@ impl Renderer {
         }
 
         self.device.begin_frame();
+        self.gpu_cache_texture.remove_texture(&mut self.device);
 
         if let Some(renderer) = config.deserialize_for_resource::<PlainRenderer, _>("renderer") {
             info!("loading cached textures");
@@ -6566,6 +6690,17 @@ impl Renderer {
                     category: texture.category.unwrap_or(TextureCacheCategory::Standalone),
                 });
             }
+
+            info!("loading gpu cache");
+            let (t, gpu_cache_data) = Self::load_texture(
+                ImageBufferKind::Texture2D,
+                &renderer.gpu_cache,
+                Some(RenderTargetInfo { has_depth: false }),
+                &root,
+                &mut self.device,
+            );
+            self.gpu_cache_texture.load_from_data(t, gpu_cache_data);
+            self.gpu_cache_frame_id = renderer.gpu_cache_frame_id;
         } else {
             info!("loading cached textures");
             self.device.begin_frame();
diff --git a/gfx/wr/webrender/src/renderer/shade.rs b/gfx/wr/webrender/src/renderer/shade.rs
@@ -287,6 +287,7 @@ impl LazilyCompiledShader {
                             ("sColor0", TextureSampler::Color0),
                             ("sTransformPalette", TextureSampler::TransformPalette),
                             ("sRenderTasks", TextureSampler::RenderTasks),
+                            ("sGpuCache", TextureSampler::GpuCache),
                             ("sPrimitiveHeadersF", TextureSampler::PrimitiveHeadersF),
                             ("sPrimitiveHeadersI", TextureSampler::PrimitiveHeadersI),
                             ("sGpuBufferF", TextureSampler::GpuBufferF),
@@ -304,6 +305,7 @@ impl LazilyCompiledShader {
                             ("sDither", TextureSampler::Dither),
                             ("sTransformPalette", TextureSampler::TransformPalette),
                             ("sRenderTasks", TextureSampler::RenderTasks),
+                            ("sGpuCache", TextureSampler::GpuCache),
                             ("sPrimitiveHeadersF", TextureSampler::PrimitiveHeadersF),
                             ("sPrimitiveHeadersI", TextureSampler::PrimitiveHeadersI),
                             ("sClipMask", TextureSampler::ClipMask),
diff --git a/gfx/wr/webrender/src/renderer/vertex.rs b/gfx/wr/webrender/src/renderer/vertex.rs
@@ -479,8 +479,8 @@ pub mod desc {
             // specific clip attributes
             VertexAttribute {
                 name: "aClipDataResourceAddress",
-                count: 1,
-                kind: VertexAttributeKind::I32,
+                count: 2,
+                kind: VertexAttributeKind::U16,
             },
             VertexAttribute {
                 name: "aClipSrcRectSize",
@@ -505,6 +505,22 @@ pub mod desc {
         ],
     };
 
+    pub const GPU_CACHE_UPDATE: VertexDescriptor = VertexDescriptor { // NOTE(review): presumably the layout for GPU cache update draws - confirm against its user
+        vertex_attributes: &[
+            VertexAttribute {
+                name: "aPosition",
+                count: 2, // two components of kind U16Norm
+                kind: VertexAttributeKind::U16Norm,
+            },
+            VertexAttribute {
+                name: "aValue",
+                count: 4, // four components of kind F32
+                kind: VertexAttributeKind::F32,
+            },
+        ],
+        instance_attributes: &[], // this descriptor declares no per-instance attributes
+    };
+
     pub const RESOLVE: VertexDescriptor = VertexDescriptor {
         vertex_attributes: &[VertexAttribute {
             name: "aPosition",
@@ -562,8 +578,8 @@ pub mod desc {
             },
             VertexAttribute {
                 name: "aFilterExtraDataAddress",
-                count: 1,
-                kind: VertexAttributeKind::I32,
+                count: 2,
+                kind: VertexAttributeKind::U16,
             },
         ],
     };
@@ -612,8 +628,8 @@ pub mod desc {
             },
             VertexAttribute {
                 name: "aFilterExtraDataAddress",
-                count: 1,
-                kind: VertexAttributeKind::I32,
+                count: 2,
+                kind: VertexAttributeKind::U16,
             },
         ],
     };
diff --git a/gfx/wr/webrender/src/resource_cache.rs b/gfx/wr/webrender/src/resource_cache.rs
@@ -27,6 +27,7 @@ use crate::glyph_cache::{GlyphCache, CachedGlyphInfo};
 use crate::glyph_cache::GlyphCacheEntry;
 use glyph_rasterizer::{GLYPH_FLASHING, FontInstance, GlyphFormat, GlyphKey, GlyphRasterizer, GlyphRasterJob};
 use glyph_rasterizer::{SharedFontResources, BaseFontInstance};
+use crate::gpu_cache::{GpuCache, GpuCacheAddress, GpuCacheHandle};
 use crate::gpu_types::UvRectKind;
 use crate::internal_types::{
     CacheTextureId, FastHashMap, FastHashSet, TextureSource, ResourceUpdateList,
@@ -36,7 +37,7 @@ use crate::profiler::{self, TransactionProfile, bytes_to_mb};
 use crate::render_task_graph::{RenderTaskId, RenderTaskGraphBuilder};
 use crate::render_task_cache::{RenderTaskCache, RenderTaskCacheKey, RenderTaskParent};
 use crate::render_task_cache::{RenderTaskCacheEntry, RenderTaskCacheEntryHandle};
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilder, GpuBufferBuilderF};
+use crate::renderer::GpuBufferBuilderF;
 use crate::surface::SurfaceBuilder;
 use euclid::point2;
 use smallvec::SmallVec;
@@ -63,7 +64,7 @@ static NEXT_NATIVE_SURFACE_ID: AtomicUsize = AtomicUsize::new(0);
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct GlyphFetchResult {
     pub index_in_text_run: i32,
-    pub uv_rect_address: GpuBufferAddress,
+    pub uv_rect_address: GpuCacheAddress,
     pub offset: DevicePoint,
     pub size: DeviceIntSize,
     pub scale: f32,
@@ -83,7 +84,7 @@ pub struct GlyphFetchResult {
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct CacheItem {
     pub texture_id: TextureSource,
-    pub uv_rect_handle: GpuBufferAddress,
+    pub uv_rect_handle: GpuCacheHandle,
     pub uv_rect: DeviceIntRect,
     pub user_data: [f32; 4],
 }
@@ -92,7 +93,7 @@ impl CacheItem {
     pub fn invalid() -> Self {
         CacheItem {
             texture_id: TextureSource::Invalid,
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             uv_rect: DeviceIntRect::zero(),
             user_data: [0.0; 4],
         }
@@ -631,16 +632,18 @@ impl ResourceCache {
         key: Option<RenderTaskCacheKey>,
         is_opaque: bool,
         parent: RenderTaskParent,
+        gpu_cache: &mut GpuCache,
         gpu_buffer_builder: &mut GpuBufferBuilderF,
         rg_builder: &mut RenderTaskGraphBuilder,
         surface_builder: &mut SurfaceBuilder,
-        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId,
+        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId,
     ) -> RenderTaskId {
         self.cached_render_tasks.request_render_task(
             key.clone(),
             &mut self.texture_cache,
             is_opaque,
             parent,
+            gpu_cache,
             gpu_buffer_builder,
             rg_builder,
             surface_builder,
@@ -654,12 +657,13 @@ impl ResourceCache {
         size: DeviceIntSize,
         rg_builder: &mut RenderTaskGraphBuilder,
         gpu_buffer_builder: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         is_opaque: bool,
         adjustment: &AdjustedImageSource,
-        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF) -> RenderTaskId,
+        f: &mut dyn FnMut(&mut RenderTaskGraphBuilder, &mut GpuBufferBuilderF, &mut GpuCache) -> RenderTaskId,
     ) -> RenderTaskId {
 
-        let task_id = f(rg_builder, gpu_buffer_builder);
+        let task_id = f(rg_builder, gpu_buffer_builder, gpu_cache);
 
         let render_task = rg_builder.get_task_mut(task_id);
 
@@ -723,7 +727,7 @@ impl ResourceCache {
             None,
             user_data,
             DirtyRect::All,
-            gpu_buffer_builder,
+            gpu_cache,
             None,
             render_task.uv_rect_kind(),
             Eviction::Manual,
@@ -1097,7 +1101,7 @@ impl ResourceCache {
     pub fn request_image(
         &mut self,
         mut request: ImageRequest,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
     ) -> DeviceIntSize {
         debug_assert_eq!(self.state, State::AddResources);
 
@@ -1198,7 +1202,7 @@ impl ResourceCache {
             ImageResult::Err(_) => panic!("Errors should already have been handled"),
         };
 
-        let needs_upload = self.texture_cache.request(&entry.texture_cache_handle, gpu_buffer);
+        let needs_upload = self.texture_cache.request(&entry.texture_cache_handle, gpu_cache);
 
         if !needs_upload && entry.dirty_rect.is_empty() {
             return size;
@@ -1272,7 +1276,7 @@ impl ResourceCache {
         &mut self,
         mut font: FontInstance,
         glyph_keys: &[GlyphKey],
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
     ) {
         debug_assert_eq!(self.state, State::AddResources);
 
@@ -1286,7 +1290,8 @@ impl ResourceCache {
                 if let Some(entry) = glyph_key_cache.try_get(key) {
                     match entry {
                         GlyphCacheEntry::Cached(ref glyph) => {
-                            if !texture_cache.request(&glyph.texture_cache_handle, gpu_buffer) {
+                            // Skip the glyph if it already has a valid texture cache handle.
+                            if !texture_cache.request(&glyph.texture_cache_handle, gpu_cache) {
                                 return false;
                             }
                             // This case gets hit when we already rasterized the glyph, but the
@@ -1317,6 +1322,7 @@ impl ResourceCache {
         mut font: FontInstance,
         glyph_keys: &[GlyphKey],
         fetch_buffer: &mut Vec<GlyphFetchResult>,
+        gpu_cache: &mut GpuCache,
         mut f: F,
     ) where
         F: FnMut(TextureSource, GlyphFormat, &[GlyphFetchResult]),
@@ -1348,7 +1354,7 @@ impl ResourceCache {
             }
             fetch_buffer.push(GlyphFetchResult {
                 index_in_text_run: loop_index as i32,
-                uv_rect_address: cache_item.uv_rect_handle,
+                uv_rect_address: gpu_cache.get_address(&cache_item.uv_rect_handle),
                 offset: DevicePoint::new(cache_item.user_data[0], cache_item.user_data[1]),
                 size: cache_item.uv_rect.size(),
                 scale: cache_item.user_data[2],
@@ -1463,7 +1469,7 @@ impl ResourceCache {
         })
     }
 
-    pub fn begin_frame(&mut self, stamp: FrameStamp, profile: &mut TransactionProfile) {
+    pub fn begin_frame(&mut self, stamp: FrameStamp, gpu_cache: &mut GpuCache, profile: &mut TransactionProfile) {
         profile_scope!("begin_frame");
         debug_assert_eq!(self.state, State::Idle);
         self.state = State::AddResources;
@@ -1484,12 +1490,12 @@ impl ResourceCache {
         v.clear();
         self.deleted_blob_keys.push_back(v);
 
-        self.texture_cache.run_compaction();
+        self.texture_cache.run_compaction(gpu_cache);
     }
 
     pub fn block_until_all_resources_added(
         &mut self,
-        gpu_buffer: &mut GpuBufferBuilder,
+        gpu_cache: &mut GpuCache,
         profile: &mut TransactionProfile,
     ) {
         profile_scope!("block_until_all_resources_added");
@@ -1511,7 +1517,7 @@ impl ResourceCache {
                     }
                     Ok(glyph) => {
                         let mut texture_cache_handle = TextureCacheHandle::invalid();
-                        texture_cache.request(&texture_cache_handle, &mut gpu_buffer.f32);
+                        texture_cache.request(&texture_cache_handle, gpu_cache);
                         texture_cache.update(
                             &mut texture_cache_handle,
                             ImageDescriptor {
@@ -1525,7 +1531,7 @@ impl ResourceCache {
                             Some(CachedImageData::Raw(Arc::new(glyph.bytes))),
                             [glyph.left, -glyph.top, glyph.scale, 0.0],
                             DirtyRect::All,
-                            &mut gpu_buffer.f32,
+                            gpu_cache,
                             Some(glyph_key_cache.eviction_notice()),
                             UvRectKind::Rect,
                             Eviction::Auto,
@@ -1544,10 +1550,10 @@ impl ResourceCache {
         );
 
         // Apply any updates of new / updated images (incl. blobs) to the texture cache.
-        self.update_texture_cache(gpu_buffer);
+        self.update_texture_cache(gpu_cache);
     }
 
-    fn update_texture_cache(&mut self, gpu_buffer: &mut GpuBufferBuilder) {
+    fn update_texture_cache(&mut self, gpu_cache: &mut GpuCache) {
         profile_scope!("update_texture_cache");
 
         if self.fallback_handle == TextureCacheHandle::invalid() {
@@ -1569,7 +1575,7 @@ impl ResourceCache {
                 Some(CachedImageData::Raw(Arc::new(fallback_color))),
                 [0.0; 4],
                 DirtyRect::All,
-                &mut gpu_buffer.f32,
+                gpu_cache,
                 None,
                 UvRectKind::Rect,
                 Eviction::Manual,
@@ -1687,7 +1693,7 @@ impl ResourceCache {
                     Some(image_data),
                     [0.0; 4],
                     dirty_rect,
-                    &mut gpu_buffer.f32,
+                    gpu_cache,
                     None,
                     UvRectKind::Rect,
                     eviction,
diff --git a/gfx/wr/webrender/src/texture_cache.rs b/gfx/wr/webrender/src/texture_cache.rs
@@ -9,6 +9,7 @@ use api::units::*;
 use api::{DocumentId, IdNamespace};
 use crate::device::{TextureFilter, TextureFormatPair};
 use crate::freelist::{FreeList, FreeListHandle, WeakFreeListHandle};
+use crate::gpu_cache::{GpuCache, GpuCacheHandle};
 use crate::gpu_types::{ImageSource, UvRectKind};
 use crate::internal_types::{
     CacheTextureId, Swizzle, SwizzleSettings, FrameStamp, FrameId,
@@ -17,7 +18,6 @@ use crate::internal_types::{
 };
 use crate::lru_cache::LRUCache;
 use crate::profiler::{self, TransactionProfile};
-use crate::renderer::{GpuBufferAddress, GpuBufferBuilderF};
 use crate::resource_cache::{CacheItem, CachedImageData};
 use crate::texture_pack::{
     AllocatorList, AllocId, AtlasAllocatorList, ShelfAllocator, ShelfAllocatorOptions,
@@ -102,8 +102,8 @@ pub struct CacheEntry {
     //           in the glyph cache eviction code. We could probably remove it
     //           entirely in future (or move to PictureCacheEntry).
     pub last_access: FrameStamp,
-    /// Address of the resource rect in the GPU cache.
-    pub uv_rect_handle: GpuBufferAddress,
+    /// Handle to the resource rect in the GPU cache.
+    pub uv_rect_handle: GpuCacheHandle,
     /// Image format of the data that the entry expects.
     pub input_format: ImageFormat,
     pub filter: TextureFilter,
@@ -143,7 +143,7 @@ impl CacheEntry {
             input_format: params.descriptor.format,
             filter: params.filter,
             swizzle,
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             eviction_notice: None,
             uv_rect_kind: params.uv_rect_kind,
             shader: TargetShader::Default,
@@ -154,15 +154,17 @@ impl CacheEntry {
     // This ensures that the UV rect, and texture layer index
     // are up to date in the GPU cache for vertex shaders
     // to fetch from.
-    fn write_gpu_blocks(&mut self, gpu_buffer: &mut GpuBufferBuilderF) {
-        let origin = self.details.describe();
-        let image_source = ImageSource {
-            p0: origin.to_f32(),
-            p1: (origin + self.size).to_f32(),
-            user_data: self.user_data,
-            uv_rect_kind: self.uv_rect_kind,
-        };
-        self.uv_rect_handle = image_source.write_gpu_blocks(gpu_buffer);
+    fn update_gpu_cache(&mut self, gpu_cache: &mut GpuCache) {
+        if let Some(mut request) = gpu_cache.request(&mut self.uv_rect_handle) {
+            let origin = self.details.describe();
+            let image_source = ImageSource {
+                p0: origin.to_f32(),
+                p1: (origin + self.size).to_f32(),
+                user_data: self.user_data,
+                uv_rect_kind: self.uv_rect_kind,
+            };
+            image_source.write_gpu_blocks(&mut request);
+        }
     }
 
     fn evict(&self) {
@@ -550,9 +552,11 @@ impl TextureCacheConfig {
 /// frame in which they are requested, and may be evicted. The API supports
 /// querying whether an entry is still available.
 ///
-/// The texture cache can be visualized, which is a good way to understand how
-/// it works. Enabling gfx.webrender.debug.texture-cache shows a live view of
-/// its contents in Firefox.
+/// The TextureCache is different from the GpuCache in that the former stores
+/// images, whereas the latter stores data and parameters for use in the shaders.
+/// This means that the texture cache can be visualized, which is a good way to
+/// understand how it works. Enabling gfx.webrender.debug.texture-cache shows a
+/// live view of its contents in Firefox.
 #[cfg_attr(feature = "capture", derive(Serialize))]
 #[cfg_attr(feature = "replay", derive(Deserialize))]
 pub struct TextureCache {
@@ -752,7 +756,7 @@ impl TextureCache {
         self.now = FrameStamp::INVALID;
     }
 
-    pub fn run_compaction(&mut self) {
+    pub fn run_compaction(&mut self, gpu_cache: &mut GpuCache) {
         // Use the same order as BudgetType::VALUES so that we can index self.bytes_allocated
         // with the same index.
         let allocator_lists = [
@@ -803,7 +807,8 @@ impl TextureCache {
                 allocated_size_in_bytes: new_bytes,
             };
 
-            entry.uv_rect_handle = GpuBufferAddress::INVALID;
+            gpu_cache.invalidate(&entry.uv_rect_handle);
+            entry.uv_rect_handle = GpuCacheHandle::new();
 
             let src_rect = DeviceIntRect::from_origin_and_size(change.old_rect.min, entry.size);
             let dst_rect = DeviceIntRect::from_origin_and_size(change.new_rect.min, entry.size);
@@ -832,7 +837,7 @@ impl TextureCache {
     // Returns true if the image needs to be uploaded to the
     // texture cache (either never uploaded, or has been
     // evicted on a previous frame).
-    pub fn request(&mut self, handle: &TextureCacheHandle, gpu_buffer: &mut GpuBufferBuilderF) -> bool {
+    pub fn request(&mut self, handle: &TextureCacheHandle, gpu_cache: &mut GpuCache) -> bool {
         let now = self.now;
         let entry = match handle {
             TextureCacheHandle::Empty => None,
@@ -847,9 +852,9 @@ impl TextureCache {
         };
         entry.map_or(true, |entry| {
             // If an image is requested that is already in the cache,
-            // refresh the GPU buffer data associated with this item.
+            // refresh the GPU cache data associated with this item.
             entry.last_access = now;
-            entry.write_gpu_blocks(gpu_buffer);
+            entry.update_gpu_cache(gpu_cache);
             false
         })
     }
@@ -908,7 +913,7 @@ impl TextureCache {
         data: Option<CachedImageData>,
         user_data: [f32; 4],
         mut dirty_rect: ImageDirtyRect,
-        gpu_buffer: &mut GpuBufferBuilderF,
+        gpu_cache: &mut GpuCache,
         eviction_notice: Option<&EvictionNotice>,
         uv_rect_kind: UvRectKind,
         eviction: Eviction,
@@ -948,8 +953,14 @@ impl TextureCache {
         entry.eviction_notice = eviction_notice.cloned();
         entry.uv_rect_kind = uv_rect_kind;
 
+        // Invalidate the contents of the resource rect in the GPU cache.
+        // This ensures that the update_gpu_cache below will add
+        // the new information to the GPU cache.
+        //TODO: only invalidate if the parameters change?
+        gpu_cache.invalidate(&entry.uv_rect_handle);
+
         // Upload the resource rect and texture array layer.
-        entry.write_gpu_blocks(gpu_buffer);
+        entry.update_gpu_cache(gpu_cache);
 
         // Create an update command, which the render thread processes
         // to upload the new image data into the correct location
@@ -1022,7 +1033,7 @@ impl TextureCache {
     pub fn try_get_cache_location(
         &self,
         handle: &TextureCacheHandle,
-    ) -> Option<(CacheTextureId, DeviceIntRect, Swizzle, GpuBufferAddress, [f32; 4])> {
+    ) -> Option<(CacheTextureId, DeviceIntRect, Swizzle, GpuCacheHandle, [f32; 4])> {
         let entry = self.get_entry_opt(handle)?;
         let origin = entry.details.describe();
         Some((
@@ -1042,7 +1053,7 @@ impl TextureCache {
     pub fn get_cache_location(
         &self,
         handle: &TextureCacheHandle,
-    ) -> (CacheTextureId, DeviceIntRect, Swizzle, GpuBufferAddress, [f32; 4]) {
+    ) -> (CacheTextureId, DeviceIntRect, Swizzle, GpuCacheHandle, [f32; 4]) {
         self.try_get_cache_location(handle).expect("BUG: was dropped from cache or not updated!")
     }
 
@@ -1349,7 +1360,7 @@ impl TextureCache {
                 alloc_id,
                 allocated_size_in_bytes,
             },
-            uv_rect_handle: GpuBufferAddress::INVALID,
+            uv_rect_handle: GpuCacheHandle::new(),
             input_format: params.descriptor.format,
             filter: params.filter,
             swizzle,
@@ -1647,8 +1658,6 @@ impl TextureCacheUpdate {
 
 #[cfg(test)]
 mod test_texture_cache {
-    use crate::renderer::GpuBufferBuilderF;
-
     #[test]
     fn check_allocation_size_balance() {
         // Allocate some glyphs, observe the total allocation size, and free
@@ -1656,15 +1665,14 @@ mod test_texture_cache {
         // original value.
 
         use crate::texture_cache::{TextureCache, TextureCacheHandle, Eviction, TargetShader};
+        use crate::gpu_cache::GpuCache;
         use crate::device::TextureFilter;
         use crate::gpu_types::UvRectKind;
-        use crate::frame_allocator::FrameMemory;
         use api::{ImageDescriptor, ImageDescriptorFlags, ImageFormat, DirtyRect};
         use api::units::*;
         use euclid::size2;
         let mut texture_cache = TextureCache::new_for_testing(2048, ImageFormat::BGRA8);
-        let memory = FrameMemory::fallback();
-        let mut gpu_buffer = GpuBufferBuilderF::new(&memory);
+        let mut gpu_cache = GpuCache::new_for_testing();
 
         let sizes: &[DeviceIntSize] = &[
             size2(23, 27),
@@ -1685,7 +1693,7 @@ mod test_texture_cache {
 
         let handles: Vec<TextureCacheHandle> = sizes.iter().map(|size| {
             let mut texture_cache_handle = TextureCacheHandle::invalid();
-            texture_cache.request(&texture_cache_handle, &mut gpu_buffer);
+            texture_cache.request(&texture_cache_handle, &mut gpu_cache);
             texture_cache.update(
                 &mut texture_cache_handle,
                 ImageDescriptor {
@@ -1699,7 +1707,7 @@ mod test_texture_cache {
                 None,
                 [0.0; 4],
                 DirtyRect::All,
-                &mut gpu_buffer,
+                &mut gpu_cache,
                 None,
                 UvRectKind::Rect,
                 Eviction::Manual,
diff --git a/gfx/wr/webrender/src/visibility.rs b/gfx/wr/webrender/src/visibility.rs
@@ -13,10 +13,10 @@ use std::usize;
 use crate::clip::ClipStore;
 use crate::composite::CompositeState;
 use crate::profiler::TransactionProfile;
-use crate::renderer::GpuBufferBuilder;
 use crate::spatial_tree::{SpatialTree, SpatialNodeIndex};
 use crate::clip::{ClipChainInstance, ClipTree};
 use crate::frame_builder::FrameBuilderConfig;
+use crate::gpu_cache::GpuCache;
 use crate::picture::{PictureCompositeMode, ClusterFlags, SurfaceInfo, TileCacheInstance};
 use crate::picture::{SurfaceIndex, RasterConfig, SubSliceIndex};
 use crate::prim_store::{ClipTaskIndex, PictureIndex, PrimitiveInstanceKind};
@@ -41,7 +41,7 @@ pub struct FrameVisibilityContext<'a> {
 pub struct FrameVisibilityState<'a> {
     pub clip_store: &'a mut ClipStore,
     pub resource_cache: &'a mut ResourceCache,
-    pub frame_gpu_data: &'a mut GpuBufferBuilder,
+    pub gpu_cache: &'a mut GpuCache,
     pub data_stores: &'a mut DataStores,
     pub clip_tree: &'a mut ClipTree,
     pub composite_state: &'a mut CompositeState,
@@ -321,7 +321,7 @@ pub fn update_prim_visibility(
                     &map_local_to_picture,
                     &map_surface_to_vis,
                     &frame_context.spatial_tree,
-                    &mut frame_state.frame_gpu_data.f32,
+                    frame_state.gpu_cache,
                     frame_state.resource_cache,
                     device_pixel_scale,
                     &surface_culling_rect,
@@ -363,7 +363,7 @@ pub fn update_prim_visibility(
                         &store.color_bindings,
                         &frame_state.surface_stack,
                         &mut frame_state.composite_state,
-                        &mut frame_state.frame_gpu_data.f32,
+                        &mut frame_state.gpu_cache,
                         &mut frame_state.scratch.primitive,
                         is_root_tile_cache,
                         frame_state.surfaces,
diff --git a/gfx/wr/webrender_api/src/units.rs b/gfx/wr/webrender_api/src/units.rs
@@ -191,15 +191,6 @@ impl TexelRect {
             uv1: DevicePoint::new(-1.0, -1.0),
         }
     }
-
-    pub fn to_array(&self) -> [f32; 4] {
-        [
-            self.uv0.x,
-            self.uv0.y,
-            self.uv1.x,
-            self.uv1.y,
-        ]
-    }
 }
 
 impl Into<TexelRect> for DeviceIntRect {
diff --git a/gfx/wr/webrender_build/src/shader.rs b/gfx/wr/webrender_build/src/shader.rs
@@ -195,8 +195,8 @@ pub fn build_shader_prefix_string<F: FnMut(&str)>(
     // detect which platform we're targeting
     let is_macos = match std::env::var("CARGO_CFG_TARGET_OS") {
         Ok(os) => os == "macos",
-        // if this is not called from build.rs (e.g. if the optimized shader
-        // pref is disabled) we want to use the runtime value
+        // if this is not called from build.rs (e.g. the gpu_cache_update shader or
+        // if the optimized shader pref is disabled) we want to use the runtime value
         Err(_) => cfg!(target_os = "macos"),
     };
     let is_android = match std::env::var("CARGO_CFG_TARGET_OS") {
diff --git a/gfx/wr/wrench/src/main.rs b/gfx/wr/wrench/src/main.rs
@@ -787,6 +787,7 @@ pub fn main() {
     } else if let Some(subargs) = args.subcommand_matches("png") {
         let surface = match subargs.value_of("surface") {
             Some("screen") | None => png::ReadSurface::Screen,
+            Some("gpu-cache") => png::ReadSurface::GpuCache,
             _ => panic!("Unknown surface argument value")
         };
         let output_path = subargs.value_of("OUTPUT").map(PathBuf::from);
diff --git a/gfx/wr/wrench/src/png.rs b/gfx/wr/wrench/src/png.rs
@@ -14,6 +14,7 @@ use crate::yaml_frame_reader::YamlFrameReader;
 
 pub enum ReadSurface {
     Screen,
+    GpuCache,
 }
 
 pub struct SaveSettings {
@@ -97,6 +98,14 @@ pub fn png(
                 try_crop: true,
             })
         }
+        ReadSurface::GpuCache => {
+            let (size, data) = wrench.renderer
+                .read_gpu_cache();
+            (size, data, SaveSettings {
+                flip_vertical: false,
+                try_crop: false,
+            })
+        }
     };
 
     let out_path = out_path.unwrap_or_else(|| {

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	gfx/layers/ipc/CompositorBridgeParent.cpp	\|	5	+++++
M	gfx/layers/ipc/PCompositorBridge.ipdl	\|	1	+
M	gfx/layers/wr/WebRenderBridgeParent.cpp	\|	1	+
M	gfx/layers/wr/WebRenderMessageUtils.h	\|	3	+++
M	gfx/thebes/gfxPlatform.cpp	\|	9	+++++++--
M	gfx/wr/webrender/res/blend.glsl	\|	14	++++++--------
M	gfx/wr/webrender/res/brush.glsl	\|	10	+++++-----
M	gfx/wr/webrender/res/brush_blend.glsl	\|	2	+-
M	gfx/wr/webrender/res/brush_image.glsl	\|	4	++--
M	gfx/wr/webrender/res/brush_linear_gradient.glsl	\|	2	+-
M	gfx/wr/webrender/res/brush_mix_blend.glsl	\|	2	+-
M	gfx/wr/webrender/res/brush_solid.glsl	\|	2	+-
M	gfx/wr/webrender/res/brush_yuv_image.glsl	\|	4	++--
M	gfx/wr/webrender/res/clip_shared.glsl	\|	2	+-
M	gfx/wr/webrender/res/cs_clip_box_shadow.glsl	\|	8	++++----
M	gfx/wr/webrender/res/cs_conic_gradient.glsl	\|	2	+-
M	gfx/wr/webrender/res/cs_linear_gradient.glsl	\|	2	+-
M	gfx/wr/webrender/res/cs_radial_gradient.glsl	\|	2	+-
M	gfx/wr/webrender/res/cs_svg_filter.glsl	\|	30	+++++++++++++-----------------
M	gfx/wr/webrender/res/cs_svg_filter_node.glsl	\|	34	+++++++++++++++-------------------
M	gfx/wr/webrender/res/gpu_buffer.glsl	\|	46	+++++++++++++---------------------------------
A	gfx/wr/webrender/res/gpu_cache.glsl	\|	137	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	gfx/wr/webrender/res/gradient.glsl	\|	2	+-
D	gfx/wr/webrender/res/image_source.glsl	\|	51	---------------------------------------------------
M	gfx/wr/webrender/res/prim_shared.glsl	\|	2	+-
M	gfx/wr/webrender/res/ps_quad.glsl	\|	4	++--
M	gfx/wr/webrender/res/ps_split_composite.glsl	\|	15	+++++++++------
M	gfx/wr/webrender/res/ps_text_run.glsl	\|	8	++++----
M	gfx/wr/webrender/src/batch.rs	\|	141	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	gfx/wr/webrender/src/clip.rs	\|	12	++++++------
M	gfx/wr/webrender/src/command_buffer.rs	\|	27	++++++++++++++++++---------
M	gfx/wr/webrender/src/composite.rs	\|	20	++++++++++----------
M	gfx/wr/webrender/src/filterdata.rs	\|	35	++++++++++++++++++-----------------
M	gfx/wr/webrender/src/frame_builder.rs	\|	93	+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
A	gfx/wr/webrender/src/gpu_cache.rs	\|	945	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	gfx/wr/webrender/src/gpu_types.rs	\|	47	+++++++++++++++++++----------------------------
M	gfx/wr/webrender/src/image_source.rs	\|	10	+++++-----
M	gfx/wr/webrender/src/internal_types.rs	\|	2	++
M	gfx/wr/webrender/src/lib.rs	\|	1	+
M	gfx/wr/webrender/src/picture.rs	\|	171	++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M	gfx/wr/webrender/src/picture_textures.rs	\|	37	+++++++++++++++++++------------------
M	gfx/wr/webrender/src/prepare.rs	\|	87	++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
M	gfx/wr/webrender/src/prim_store/borders.rs	\|	62	+++++++++++++++++++++++++++++++++++---------------------------
M	gfx/wr/webrender/src/prim_store/gradient/conic.rs	\|	41	+++++++++++++++++++++++------------------
M	gfx/wr/webrender/src/prim_store/gradient/linear.rs	\|	81	++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M	gfx/wr/webrender/src/prim_store/gradient/mod.rs	\|	6	+++---
M	gfx/wr/webrender/src/prim_store/gradient/radial.rs	\|	42	+++++++++++++++++++++++-------------------
M	gfx/wr/webrender/src/prim_store/image.rs	\|	45	+++++++++++++++++++++++----------------------
M	gfx/wr/webrender/src/prim_store/line_dec.rs	\|	20	++++++++++----------
M	gfx/wr/webrender/src/prim_store/mod.rs	\|	35	+++++++++++++++++------------------
M	gfx/wr/webrender/src/prim_store/text_run.rs	\|	55	++++++++++++++++++++++++++++---------------------------
M	gfx/wr/webrender/src/profiler.rs	\|	379	+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M	gfx/wr/webrender/src/quad.rs	\|	4	++--
M	gfx/wr/webrender/src/render_api.rs	\|	5	+++++
M	gfx/wr/webrender/src/render_backend.rs	\|	72	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M	gfx/wr/webrender/src/render_target.rs	\|	19	+++++++++++++------
M	gfx/wr/webrender/src/render_task.rs	\|	176	+++++++++++++++++++++++++++++++++++++++++++------------------------------------
M	gfx/wr/webrender/src/render_task_cache.rs	\|	20	++++++++++++--------
M	gfx/wr/webrender/src/render_task_graph.rs	\|	30	+++++++++++++++---------------
M	gfx/wr/webrender/src/renderer/gpu_buffer.rs	\|	81	++++++++++++++++++++++++++-----------------------------------------------------
A	gfx/wr/webrender/src/renderer/gpu_cache.rs	\|	541	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	gfx/wr/webrender/src/renderer/init.rs	\|	28	++++++++++++++++++++++++++--
M	gfx/wr/webrender/src/renderer/mod.rs	\|	215	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M	gfx/wr/webrender/src/renderer/shade.rs	\|	2	++
M	gfx/wr/webrender/src/renderer/vertex.rs	\|	28	++++++++++++++++++++++------
M	gfx/wr/webrender/src/resource_cache.rs	\|	50	++++++++++++++++++++++++++++----------------------
M	gfx/wr/webrender/src/texture_cache.rs	\|	74	+++++++++++++++++++++++++++++++++++++++++---------------------------------
M	gfx/wr/webrender/src/visibility.rs	\|	8	++++----
M	gfx/wr/webrender_api/src/units.rs	\|	9	---------
M	gfx/wr/webrender_build/src/shader.rs	\|	4	++--
M	gfx/wr/wrench/src/main.rs	\|	1	+
M	gfx/wr/wrench/src/png.rs	\|	9	+++++++++