tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

texture.h (45631B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 namespace glsl {
      6 
// Packed vs. wide views of 8-bit RGBA pixel data: PackedRGBA8 holds 4 pixels
// of 4 bytes each, while WideRGBA8/HalfRGBA8 widen each channel to 16 bits so
// intermediate arithmetic cannot overflow a byte.
using PackedRGBA8 = V16<uint8_t>;
using WideRGBA8 = V16<uint16_t>;
using HalfRGBA8 = V8<uint16_t>;

// Widen 16 packed bytes into 16-bit lanes.
SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); }
     12 
// Portable fallback pack: narrow 16-bit lanes to bytes with unsigned
// saturation (used when no native pack instruction is available).
template <int N>
UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) {
  typedef VectorType<uint8_t, N> packed_type;
  // Generic conversions only mask off the low byte without actually clamping
  // like a real pack. First force the word to all 1s if it overflows, and then
  // add on the sign bit to cause it to roll over to 0 if it was negative.
  // (Vector comparisons produce all-ones lanes for true, so (p > 255) is a
  // 0xFFFF mask, and (p >> 15) is 1 exactly when the lane's sign bit is set.)
  p = (p | (p > 255)) + (p >> 15);
  return CONVERT(p, packed_type);
}
     22 
// Narrow 16 wide (16-bit) lanes back to packed bytes with unsigned
// saturation, using a native pack instruction where available.
SI PackedRGBA8 pack(WideRGBA8 p) {
#if USE_SSE2
  // packus saturates signed 16-bit input to the [0, 255] byte range.
  return _mm_packus_epi16(lowHalf(p), highHalf(p));
#elif USE_NEON
  // vqmovun_s16 is a saturating narrow of signed 16-bit to unsigned 8-bit.
  return vcombine_u8(vqmovun_s16(bit_cast<V8<int16_t>>(lowHalf(p))),
                     vqmovun_s16(bit_cast<V8<int16_t>>(highHalf(p))));
#else
  return genericPackWide(p);
#endif
}
     33 
// Single-channel R8: 4 pixels of one byte each, widened to 16 bits for math.
using PackedR8 = V4<uint8_t>;
using WideR8 = V4<uint16_t>;

// Widen 4 packed bytes into 16-bit lanes.
SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); }
     38 
// Saturating pack of 4 wide R8 lanes back to 4 bytes.
SI PackedR8 pack(WideR8 p) {
#if USE_SSE2
  // Widen to 8 lanes so the 128-bit pack instruction can be used, then take
  // the low 4 bytes of the result.
  auto m = expand(p);
  auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m));
  return SHUFFLE(r, r, 0, 1, 2, 3);
#elif USE_NEON
  return lowHalf(
      bit_cast<V8<uint8_t>>(vqmovun_s16(bit_cast<V8<int16_t>>(expand(p)))));
#else
  return genericPackWide(p);
#endif
}
     51 
// Two-channel RG8: 4 pixels of 2 bytes each, widened to 16 bits for math.
using PackedRG8 = V8<uint8_t>;
using WideRG8 = V8<uint16_t>;

// Saturating pack of 8 wide RG8 lanes back to 8 bytes.
SI PackedRG8 pack(WideRG8 p) {
#if USE_SSE2
  return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p)));
#elif USE_NEON
  return bit_cast<V8<uint8_t>>(vqmovun_s16(bit_cast<V8<int16_t>>(p)));
#else
  return genericPackWide(p);
#endif
}
     64 
// Clamp 4 texel coordinates to [base, limit - 1].
SI I32 clampCoord(I32 coord, int limit, int base = 0) {
#if USE_SSE2
  // 16-bit min/max intrinsics are applied to 32-bit lanes; this is valid as
  // long as coordinates and limits fit in int16 range — NOTE(review): relies
  // on texture dimensions staying small enough; confirm against callers.
  return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)),
                       _mm_set1_epi32(limit - 1));
#else
  return clamp(coord, base, limit - 1);
#endif
}
     73 
     74 SI int clampCoord(int coord, int limit, int base = 0) {
     75  return min(max(coord, base), limit - 1);
     76 }
     77 
     78 template <typename T, typename S>
     79 SI T clamp2D(T P, S sampler) {
     80  return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)};
     81 }
     82 
// Convert an 8-bit channel value to a normalized float in [0, 1].
SI float to_float(uint32_t x) { return x * (1.f / 255.f); }
     84 
// Unpack four packed pixels (byte order in memory: B, G, R, A — red is
// byte 2, alpha byte 3) into a planar vec4 of normalized [0, 1] channels.
SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  U32 pixels = {a, b, c, d};
  return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF),
              cast(pixels & 0xFF), cast(pixels >> 24)) *
         (1.0f / 255.0f);
}
     91 
// Transpose four per-pixel float quads into planar form: one Float per
// channel, each holding that channel for all four pixels.
SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) {
  return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y},
              Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w});
}
     96 
// Integer analogue of pixel_float_to_vec4: transpose four per-pixel int
// quads into planar ivec4 form.
SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) {
  return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y},
               I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w});
}
    101 
    102 SI vec4_scalar pixel_to_vec4(uint32_t p) {
    103  U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24};
    104  Float f = cast(i) * (1.0f / 255.0f);
    105  return vec4_scalar(f.x, f.y, f.z, f.w);
    106 }
    107 
// Gather 4 packed RGBA8 pixels at the given per-lane offsets and convert
// them to a normalized planar vec4.
template <typename S>
SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) {
  return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y],
                       sampler->buf[offset.z], sampler->buf[offset.w]);
}
    113 
    114 template <typename S>
    115 vec4 texelFetchRGBA8(S sampler, ivec2 P) {
    116  I32 offset = P.x + P.y * sampler->stride;
    117  return fetchOffsetsRGBA8(sampler, offset);
    118 }
    119 
    120 template <typename S>
    121 SI Float fetchOffsetsR8(S sampler, I32 offset) {
    122  U32 i = {
    123      ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y],
    124      ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]};
    125  return cast(i) * (1.0f / 255.0f);
    126 }
    127 
    128 template <typename S>
    129 vec4 texelFetchR8(S sampler, ivec2 P) {
    130  I32 offset = P.x + P.y * sampler->stride;
    131  return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
    132 }
    133 
    134 template <typename S>
    135 SI vec4 fetchOffsetsRG8(S sampler, I32 offset) {
    136  uint16_t* buf = (uint16_t*)sampler->buf;
    137  U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]};
    138  Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f);
    139  Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f);
    140  return vec4(r, g, 0.0f, 1.0f);
    141 }
    142 
    143 template <typename S>
    144 vec4 texelFetchRG8(S sampler, ivec2 P) {
    145  I32 offset = P.x + P.y * sampler->stride;
    146  return fetchOffsetsRG8(sampler, offset);
    147 }
    148 
    149 template <typename S>
    150 SI Float fetchOffsetsR16(S sampler, I32 offset) {
    151  U32 i = {
    152      ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y],
    153      ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]};
    154  return cast(i) * (1.0f / 65535.0f);
    155 }
    156 
    157 template <typename S>
    158 vec4 texelFetchR16(S sampler, ivec2 P) {
    159  I32 offset = P.x + P.y * sampler->stride;
    160  return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f);
    161 }
    162 
    163 template <typename S>
    164 SI vec4 fetchOffsetsRG16(S sampler, I32 offset) {
    165  U32 pixels = {sampler->buf[offset.x], sampler->buf[offset.y],
    166                sampler->buf[offset.z], sampler->buf[offset.w]};
    167  Float r = cast(pixels & 0xFFFF) * (1.0f / 65535.0f);
    168  Float g = cast(pixels >> 16) * (1.0f / 65535.0f);
    169  return vec4(r, g, 0.0f, 1.0f);
    170 }
    171 
    172 template <typename S>
    173 vec4 texelFetchRG16(S sampler, ivec2 P) {
    174  I32 offset = P.x + P.y * sampler->stride;
    175  return fetchOffsetsRG16(sampler, offset);
    176 }
    177 
// Gather 4 RGBA32F texels: each offset indexes the first of four
// consecutive 32-bit words, type-punned to a Float quad.
SI vec4 fetchOffsetsFloat(const uint32_t* buf, I32 offset) {
  return pixel_float_to_vec4(*(Float*)&buf[offset.x], *(Float*)&buf[offset.y],
                             *(Float*)&buf[offset.z], *(Float*)&buf[offset.w]);
}

// Convenience overload taking the sampler instead of its raw buffer.
SI vec4 fetchOffsetsFloat(samplerCommon* sampler, I32 offset) {
  return fetchOffsetsFloat(sampler->buf, offset);
}
    186 
    187 vec4 texelFetchFloat(sampler2D sampler, ivec2 P) {
    188  I32 offset = P.x * 4 + P.y * sampler->stride;
    189  return fetchOffsetsFloat(sampler, offset);
    190 }
    191 
// Gather 4 texels from a packed YUY2 texture and expand them to vec4.
template <typename S>
SI vec4 fetchOffsetsYUY2(S sampler, I32 offset) {
  // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
  // Offset is aligned to a chunk rather than a pixel, and selector specifies
  // pixel within the chunk.
  I32 selector = offset & 1;
  offset &= ~1;
  uint16_t* buf = (uint16_t*)sampler->buf;
  U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y],
                *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]};
  Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f);
  Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f);
  // Negating the 0/1 selector yields an all-ones/all-zeros lane mask, so
  // if_then_else picks G1 (bits 16..23) for the odd pixel and G0 (bits
  // 0..7) for the even pixel of each chunk.
  Float g =
      CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) *
      (1.0f / 255.0f);
  return vec4(r, g, b, 1.0f);
}
    209 
    210 template <typename S>
    211 vec4 texelFetchYUY2(S sampler, ivec2 P) {
    212  I32 offset = P.x + P.y * sampler->stride;
    213  return fetchOffsetsYUY2(sampler, offset);
    214 }
    215 
// Generic texelFetch: clamp the coordinate and dispatch on the sampler's
// run-time format. Only lod 0 is supported (no mipmaps).
vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  switch (sampler->format) {
    case TextureFormat::RGBA32F:
      return texelFetchFloat(sampler, P);
    case TextureFormat::RGBA8:
      return texelFetchRGBA8(sampler, P);
    case TextureFormat::R8:
      return texelFetchR8(sampler, P);
    case TextureFormat::RG8:
      return texelFetchRG8(sampler, P);
    case TextureFormat::R16:
      return texelFetchR16(sampler, P);
    case TextureFormat::RG16:
      return texelFetchRG16(sampler, P);
    case TextureFormat::YUY2:
      return texelFetchYUY2(sampler, P);
    default:
      // Unsupported format; return transparent black in release builds.
      assert(false);
      return vec4();
  }
}
    239 
    240 vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) {
    241  assert(lod == 0);
    242  P = clamp2D(P, sampler);
    243  assert(sampler->format == TextureFormat::RGBA32F);
    244  return texelFetchFloat(sampler, P);
    245 }
    246 
    247 vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) {
    248  assert(lod == 0);
    249  P = clamp2D(P, sampler);
    250  assert(sampler->format == TextureFormat::RGBA8);
    251  return texelFetchRGBA8(sampler, P);
    252 }
    253 
    254 vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) {
    255  assert(lod == 0);
    256  P = clamp2D(P, sampler);
    257  assert(sampler->format == TextureFormat::R8);
    258  return texelFetchR8(sampler, P);
    259 }
    260 
    261 vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) {
    262  assert(lod == 0);
    263  P = clamp2D(P, sampler);
    264  assert(sampler->format == TextureFormat::RG8);
    265  return texelFetchRG8(sampler, P);
    266 }
    267 
    268 vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) {
    269  assert(lod == 0);
    270  P = clamp2D(P, sampler);
    271  if (sampler->format == TextureFormat::RGBA32F) {
    272    return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
    273  } else {
    274    assert(sampler->format == TextureFormat::RGBA8);
    275    return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
    276  }
    277 }
    278 
    279 vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) {
    280  assert(lod == 0);
    281  P = clamp2D(P, sampler);
    282  assert(sampler->format == TextureFormat::RGBA32F);
    283  return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
    284 }
    285 
    286 vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) {
    287  assert(lod == 0);
    288  P = clamp2D(P, sampler);
    289  assert(sampler->format == TextureFormat::RGBA8);
    290  return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
    291 }
    292 
    293 vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) {
    294  assert(lod == 0);
    295  P = clamp2D(P, sampler);
    296  assert(sampler->format == TextureFormat::R8);
    297  return vec4_scalar{
    298      to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f,
    299      0.0f, 1.0f};
    300 }
    301 
    302 vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) {
    303  assert(lod == 0);
    304  P = clamp2D(P, sampler);
    305  assert(sampler->format == TextureFormat::RG8);
    306  uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride];
    307  return vec4_scalar{to_float(pixel & 0xFF), to_float(pixel >> 8), 0.0f, 1.0f};
    308 }
    309 
// Rect-sampler texelFetch: same dispatch as the 2D version but with no lod
// parameter and no RGBA32F case.
vec4 texelFetch(sampler2DRect sampler, ivec2 P) {
  P = clamp2D(P, sampler);
  switch (sampler->format) {
    case TextureFormat::RGBA8:
      return texelFetchRGBA8(sampler, P);
    case TextureFormat::R8:
      return texelFetchR8(sampler, P);
    case TextureFormat::RG8:
      return texelFetchRG8(sampler, P);
    case TextureFormat::R16:
      return texelFetchR16(sampler, P);
    case TextureFormat::RG16:
      return texelFetchRG16(sampler, P);
    case TextureFormat::YUY2:
      return texelFetchYUY2(sampler, P);
    default:
      // Unsupported format; return transparent black in release builds.
      assert(false);
      return vec4();
  }
}
    330 
// Gather 4 RGBA32I texels: each offset indexes the first of four
// consecutive 32-bit words, type-punned to an I32 quad.
SI ivec4 fetchOffsetsInt(const uint32_t* buf, I32 offset) {
  return pixel_int_to_ivec4(*(I32*)&buf[offset.x], *(I32*)&buf[offset.y],
                            *(I32*)&buf[offset.z], *(I32*)&buf[offset.w]);
}

// Convenience overload taking the sampler instead of its raw buffer.
SI ivec4 fetchOffsetsInt(samplerCommon* sampler, I32 offset) {
  return fetchOffsetsInt(sampler->buf, offset);
}
    339 
    340 ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) {
    341  assert(lod == 0);
    342  P = clamp2D(P, sampler);
    343  assert(sampler->format == TextureFormat::RGBA32I);
    344  I32 offset = P.x * 4 + P.y * sampler->stride;
    345  return fetchOffsetsInt(sampler, offset);
    346 }
    347 
    348 ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) {
    349  assert(lod == 0);
    350  P = clamp2D(P, sampler);
    351  assert(sampler->format == TextureFormat::RGBA32I);
    352  return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
    353 }
    354 
// Maximum x-displacement supported by texelFetchUnchecked cursors.
constexpr int MAX_TEXEL_OFFSET = 8;

// Fill texelFetchOffset outside the valid texture bounds with zeroes. The
// stride will be set to 0 so that only one row of zeroes is needed.
ALIGNED_DECL(
    16, static const uint32_t zeroFetchBuf[MAX_TEXEL_OFFSET * sizeof(Float) /
                                           sizeof(uint32_t)]) = {0};

// Non-owning cursor for scalar unchecked fetches: base pointer plus row
// stride (in 32-bit words).
struct FetchScalar {
  const uint32_t* buf;
  uint32_t stride;
};
    367 
// Resolve a scalar texel position to a FetchScalar cursor, after verifying
// that every displacement in [-min_x, max_x] x [-min_y, max_y] around P
// stays in bounds; otherwise hand back the shared zero buffer (stride 0 so
// a single row of zeroes covers all rows).
template <typename S>
SI FetchScalar texelFetchPtr(S sampler, ivec2_scalar P, int min_x, int max_x,
                             int min_y, int max_y) {
  assert(max_x < MAX_TEXEL_OFFSET);
  if (P.x < -min_x || P.x >= int(sampler->width) - max_x || P.y < -min_y ||
      P.y >= int(sampler->height) - max_y) {
    return FetchScalar{zeroFetchBuf, 0};
  }
  return FetchScalar{&sampler->buf[P.x * 4 + P.y * sampler->stride],
                     sampler->stride};
}
    379 
// Unchecked scalar fetch of one RGBA32F texel at displacement (x, y) from
// the cursor; bounds were validated by texelFetchPtr.
SI vec4_scalar texelFetchUnchecked(sampler2D sampler, FetchScalar ptr, int x,
                                   int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32F);
  return *(vec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride];
}

// Integer variant for RGBA32I samplers.
SI ivec4_scalar texelFetchUnchecked(isampler2D sampler, FetchScalar ptr, int x,
                                    int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32I);
  return *(ivec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride];
}
    391 
// Vector analogue of FetchScalar: base buffer, one offset per lane, and the
// row stride (in 32-bit words).
struct FetchVector {
  const uint32_t* buf;
  I32 offset;
  uint32_t stride;
};

// Resolve 4 texel positions to a FetchVector cursor; if any lane's
// displacement window would leave the texture, all lanes fall back to the
// shared zero buffer.
template <typename S>
SI FetchVector texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x,
                             int min_y, int max_y) {
  assert(max_x < MAX_TEXEL_OFFSET);
  if (test_any(P.x < -min_x || P.x >= int(sampler->width) - max_x ||
               P.y < -min_y || P.y >= int(sampler->height) - max_y)) {
    return FetchVector{zeroFetchBuf, I32(0), 0};
  }
  return FetchVector{sampler->buf, P.x * 4 + P.y * sampler->stride,
                     sampler->stride};
}
    409 
// Unchecked gather of 4 RGBA32F texels at displacement (x, y) from the
// cursor's per-lane offsets.
SI vec4 texelFetchUnchecked(sampler2D sampler, FetchVector ptr, int x,
                            int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32F);
  return fetchOffsetsFloat(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset);
}

// Integer variant for RGBA32I samplers.
SI ivec4 texelFetchUnchecked(isampler2D sampler, FetchVector ptr, int x,
                             int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32I);
  return fetchOffsetsInt(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset);
}

// texelFetchOffset simply folds the constant offset into the coordinate.
#define texelFetchOffset(sampler, P, lod, offset) \
  texelFetch(sampler, (P) + (offset), lod)
    424 
// Scale texture coords for quantization, subtract offset for filtering
// (assuming coords already offset to texel centers), and round to nearest
// 1/scale increment
template <typename T>
SI T linearQuantize(T P, float scale) {
  return P * scale + (0.5f - 0.5f * scale);
}

// Helper version that also scales normalized texture coords for sampler
template <typename T, typename S>
SI T samplerScale(S sampler, T P) {
  P.x *= sampler->width;
  P.y *= sampler->height;
  return P;
}

// Rect samplers already use unnormalized texel coordinates, so no scaling.
template <typename T>
SI T samplerScale(UNUSED sampler2DRect sampler, T P) {
  return P;
}

// Quantize normalized coordinates: convert to texel space, then to fixed
// point at the requested scale.
template <typename T, typename S>
SI T linearQuantize(T P, float scale, S sampler) {
  return linearQuantize(samplerScale(sampler, P), scale);
}
    450 
// Compute clamped offset of first row for linear interpolation
template <typename S, typename I>
SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) {
  // x is clamped to width - margin - 1 so loading a horizontal texel pair
  // stays in bounds.
  return clampCoord(i.x, sampler->width - margin) +
         clampCoord(i.y, sampler->height) * sampler->stride;
}

// Compute clamped offset of second row for linear interpolation from first row
template <typename S, typename I>
SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) {
  // 0 on the last row (or out of range), so row1 collapses onto row0 there.
  return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1,
                      sampler->stride, 0);
}

// Convert X coordinate to a 2^7 scale fraction for interpolation
template <typename S>
SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) {
  // Negative coords force the fraction to 0 (full weight on the clamped
  // left texel). Past the right edge, (0x7F | overread) - overread gives
  // 0x7F - (-1) = 128: full weight on the last texel. (Vector comparisons
  // yield all-ones, i.e. -1, for true.)
  auto overread = i.x > int32_t(sampler->width) - 2;
  return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16);
}

// Convert Y coordinate to a 2^7 scale fraction for interpolation
SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); }
SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); }
    475 
// Planar result of bilinear RGBA8 filtering for 4 output pixels: rg holds
// the first two memory channels for all 4 pixels, ba the last two.
struct WidePlanarRGBA8 {
  V8<uint16_t> rg;
  V8<uint16_t> ba;
};

// Bilinearly filter 4 RGBA8 texels. i is fixed point with 7 fraction bits
// (produced by linearQuantize with scale 128).
template <typename S>
SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RGBA8);

  // Split into integer texel coordinate and 7-bit fraction.
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // Per lane: load a horizontal pixel pair from both rows and lerp
  // vertically by that lane's fracy.
  auto a0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
  auto a1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
  auto b1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
  b0 += ((b1 - b0) * fracy.y) >> 7;

  // Interleave lanes a and b, then lerp left/right pixels horizontally.
  auto abl = zipLow(a0, b0);
  auto abh = zipHigh(a0, b0);
  abl += ((abh - abl) * fracx.xyxyxyxy) >> 7;

  auto c0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
  auto c1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
  auto d1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
  d0 += ((d1 - d0) * fracy.w) >> 7;

  auto cdl = zipLow(c0, d0);
  auto cdh = zipHigh(c0, d0);
  cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7;

  // Deinterleave the filtered pixels into channel planes.
  auto rg = V8<uint16_t>(zip2Low(abl, cdl));
  auto ba = V8<uint16_t>(zip2High(abl, cdl));
  return WidePlanarRGBA8{rg, ba};
}
    529 
// Bilinear RGBA8 sample at 4 positions, returning a normalized planar vec4.
template <typename S>
vec4 textureLinearRGBA8(S sampler, vec2 P) {
  ivec2 i(linearQuantize(P, 128, sampler));
  auto planar = textureLinearPlanarRGBA8(sampler, i);
  auto rg = CONVERT(planar.rg, V8<float>);
  auto ba = CONVERT(planar.ba, V8<float>);
  auto r = lowHalf(rg);
  auto g = highHalf(rg);
  auto b = lowHalf(ba);
  auto a = highHalf(ba);
  // Pixels are stored B, G, R, A in memory (see pixel_to_vec4 above), so the
  // locals named after plane position are swizzled here to yield r, g, b, a.
  return vec4(b, g, r, a) * (1.0f / 255.0f);
}
    542 
// Bilinearly filter 4 R8 texels, returning unscaled 8-bit results widened
// to U16. i is fixed point with 7 fraction bits.
template <typename S>
static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::R8);
  // Split into integer texel coordinate and 7-bit fraction.
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // Load the left/right texel pair for each lane from both rows.
  uint8_t* buf = (uint8_t*)sampler->buf;
  auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]);
  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);

  auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]);
  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);

  // Lerp the two rows vertically.
  abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;

  // Group the left texels in the low half and the right texels in the high
  // half, then lerp horizontally.
  abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
  auto abcdl = lowHalf(abcd0);
  auto abcdh = highHalf(abcd0);
  abcdl += ((abcdh - abcdl) * fracx) >> 7;

  return U16(abcdl);
}
    576 
    577 template <typename S>
    578 vec4 textureLinearR8(S sampler, vec2 P) {
    579  assert(sampler->format == TextureFormat::R8);
    580 
    581  ivec2 i(linearQuantize(P, 128, sampler));
    582  Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float);
    583  return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);
    584 }
    585 
// Planar result of bilinear RG8 filtering: 4 red then 4 green channels.
struct WidePlanarRG8 {
  V8<uint16_t> rg;
};

// Bilinearly filter 4 RG8 texels; i is fixed point with 7 fraction bits.
template <typename S>
SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RG8);

  // Split into integer texel coordinate and 7-bit fraction.
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load RG bytes for two adjacent pixels - rgRG
  auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
  auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
  // Load two pixels for next row
  auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
  auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
  // Blend rows
  ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;

  auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
  auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
  auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
  auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
  // Blend rows
  cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;

  // ab = a.rgRG,b.rgRG
  // cd = c.rgRG,d.rgRG
  // ... ac = ar,cr,ag,cg,aR,cR,aG,cG
  // ... bd = br,dr,bg,dg,bR,dR,bG,dG
  auto ac = zipLow(ab0, cd0);
  auto bd = zipHigh(ab0, cd0);
  // ar,br,cr,dr,ag,bg,cg,dg
  // aR,bR,cR,dR,aG,bG,cG,dG
  auto abcdl = zipLow(ac, bd);
  auto abcdh = zipHigh(ac, bd);
  // Blend columns
  abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7;

  auto rg = V8<uint16_t>(abcdl);
  return WidePlanarRG8{rg};
}
    640 
    641 template <typename S>
    642 vec4 textureLinearRG8(S sampler, vec2 P) {
    643  ivec2 i(linearQuantize(P, 128, sampler));
    644  auto planar = textureLinearPlanarRG8(sampler, i);
    645  auto rg = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f);
    646  auto r = lowHalf(rg);
    647  auto g = highHalf(rg);
    648  return vec4(r, g, 0.0f, 1.0f);
    649 }
    650 
// Samples R16 texture with linear filtering and returns results packed as
// signed I16. One bit of precision is shifted away from the bottom end to
// accommodate the sign bit, so only 15 bits of precision is left.
template <typename S>
static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::R16);

  // Split into integer texel coordinate and 7-bit fraction.
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  // Edge handling mirrors computeFracX, then the fraction is shifted up so
  // the high-half fixed-point multiplies below see a 15-bit scale.
  I16 fracx =
      CONVERT(
          ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,
          I16)
      << 8;
  I16 fracy = computeFracY(frac) << 8;

  // Sample the 16 bit data for both rows
  uint16_t* buf = (uint16_t*)sampler->buf;
  auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]);
  auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]);
  auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>);

  auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]);
  auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]);
  auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>);

  // The samples occupy 15 bits and the fraction occupies 15 bits, so that when
  // they are multiplied together, the new scaled sample will fit in the high
  // 14 bits of the result. It is left shifted once to make it 15 bits again
  // for the final multiply.
#if USE_SSE2
  abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww))
           << 1;
#elif USE_NEON
  // NEON has a convenient instruction that does both the multiply and the
  // doubling, so doesn't need an extra shift.
  abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww));
#else
  abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) *
                    CONVERT(fracy.xxyyzzww, V8<int32_t>)) >>
                       16,
                   V8<int16_t>)
           << 1;
#endif

  // Group left texels into the low half and right texels into the high half,
  // then blend columns the same way.
  abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
  auto abcdl = lowHalf(abcd0);
  auto abcdh = highHalf(abcd0);
#if USE_SSE2
  abcdl += lowHalf(bit_cast<V8<int16_t>>(
               _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx))))
           << 1;
#elif USE_NEON
  abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx));
#else
  abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) *
                    CONVERT(fracx, V4<int32_t>)) >>
                       16,
                   V4<int16_t>)
           << 1;
#endif

  return abcdl;
}
    723 
    724 template <typename S>
    725 vec4 textureLinearR16(S sampler, vec2 P) {
    726  assert(sampler->format == TextureFormat::R16);
    727 
    728  ivec2 i(linearQuantize(P, 128, sampler));
    729  Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float);
    730  return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f);
    731 }
    732 
// Samples RG16 texture with linear filtering and returns results packed as
// signed I16. One bit of precision is shifted away from the bottom end to
// accommodate the sign bit, so only 15 bits of precision is left.
template <typename S>
static inline V8<int16_t> textureLinearUnpackedRG16(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RG16);

  // Split into integer texel coordinate and 7-bit fraction.
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  // Edge handling mirrors computeFracX, then the fraction is shifted up so
  // the high-half fixed-point multiplies below see a 15-bit scale.
  I16 fracx =
      CONVERT(
          ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,
          I16)
      << 8;
  I16 fracy = computeFracY(frac) << 8;

  // Sample the 2x16 bit data for both rows
  auto a0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.x]);
  auto b0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.y]);
  auto ab0 = CONVERT(combine(a0, b0) >> 1, V8<int16_t>);
  auto c0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.z]);
  auto d0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.w]);
  auto cd0 = CONVERT(combine(c0, d0) >> 1, V8<int16_t>);

  auto a1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.x]);
  auto b1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.y]);
  auto ab1 = CONVERT(combine(a1, b1) >> 1, V8<int16_t>);
  auto c1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.z]);
  auto d1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.w]);
  auto cd1 = CONVERT(combine(c1, d1) >> 1, V8<int16_t>);

  // The samples occupy 15 bits and the fraction occupies 15 bits, so that when
  // they are multiplied together, the new scaled sample will fit in the high
  // 14 bits of the result. It is left shifted once to make it 15 bits again
  // for the final multiply.
#if USE_SSE2
  ab0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(ab1 - ab0, fracy.xxxxyyyy)) << 1;
  cd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(cd1 - cd0, fracy.zzzzwwww)) << 1;
#elif USE_NEON
  // NEON has a convenient instruction that does both the multiply and the
  // doubling, so doesn't need an extra shift.
  ab0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(ab1 - ab0, fracy.xxxxyyyy));
  cd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(cd1 - cd0, fracy.zzzzwwww));
#else
  ab0 += CONVERT((CONVERT(ab1 - ab0, V8<int32_t>) *
                  CONVERT(fracy.xxxxyyyy, V8<int32_t>)) >>
                     16,
                 V8<int16_t>)
         << 1;
  cd0 += CONVERT((CONVERT(cd1 - cd0, V8<int32_t>) *
                  CONVERT(fracy.zzzzwwww, V8<int32_t>)) >>
                     16,
                 V8<int16_t>)
         << 1;
#endif

  // ab = a.rgRG,b.rgRG
  // cd = c.rgRG,d.rgRG
  // ... ac = a.rg,c.rg,a.RG,c.RG
  // ... bd = b.rg,d.rg,b.RG,d.RG
  auto ac = zip2Low(ab0, cd0);
  auto bd = zip2High(ab0, cd0);
  // a.rg,b.rg,c.rg,d.rg
  // a.RG,b.RG,c.RG,d.RG
  auto abcdl = zip2Low(ac, bd);
  auto abcdh = zip2High(ac, bd);
  // Blend columns
#if USE_SSE2
  abcdl += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcdh - abcdl, fracx.xxyyzzww))
           << 1;
#elif USE_NEON
  abcdl += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcdh - abcdl, fracx.xxyyzzww));
#else
  abcdl += CONVERT((CONVERT(abcdh - abcdl, V8<int32_t>) *
                    CONVERT(fracx.xxyyzzww, V8<int32_t>)) >>
                       16,
                   V8<int16_t>)
           << 1;
#endif

  return abcdl;
}
    819 
    820 template <typename S>
    821 vec4 textureLinearRG16(S sampler, vec2 P) {
    822  assert(sampler->format == TextureFormat::RG16);
    823 
    824  ivec2 i(linearQuantize(P, 128, sampler));
    825  auto rg = bit_cast<V4<int32_t>>(textureLinearUnpackedRG16(sampler, i));
    826  auto r = cast(rg & 0xFFFF) * (1.0f / 32767.0f);
    827  auto g = cast(rg >> 16) * (1.0f / 32767.0f);
    828  return vec4(r, g, 0.0f, 1.0f);
    829 }
    830 
// A chunk of 4 RGBA32F pixels (16 floats). For 32-bit floats the packed and
// wide representations are the same layout.
using PackedRGBA32F = V16<float>;
using WideRGBA32F = V16<float>;
    833 
// Bilinearly samples an RGBA32F texture at P for 4 SIMD lanes and returns
// the filtered pixels transposed into vec4 channel order.
template <typename S>
vec4 textureLinearRGBA32F(S sampler, vec2 P) {
  assert(sampler->format == TextureFormat::RGBA32F);
  // Scale to texel space and offset by half a texel so the floor/fraction
  // below bracket the sample point between two texel centers.
  P = samplerScale(sampler, P);
  P -= 0.5f;
  vec2 f = floor(P);
  vec2 r = P - f;
  ivec2 i(f);
  // X is clamped one texel inside the right edge so the "+ 4" (next pixel)
  // loads below stay in bounds; the next-row offset for Y is handled by
  // computeNextRowOffset.
  ivec2 c(clampCoord(i.x, sampler->width - 1),
          clampCoord(i.y, sampler->height));
  // Force the X fraction to 0 before the left edge and 1 at the right edge
  // so the clamped (duplicated) neighbor gets the full weight.
  r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0),
                     0.0f);
  // Per-lane float offsets of the top-left texel (4 floats per pixel).
  I32 offset0 = c.x * 4 + c.y * sampler->stride;
  I32 offset1 = offset0 + computeNextRowOffset(sampler, i);

  // For each lane: lerp horizontally within each of the two rows, then lerp
  // vertically between the rows.
  Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x],
                     *(Float*)&sampler->buf[offset0.x + 4], r.x),
                 mix(*(Float*)&sampler->buf[offset1.x],
                     *(Float*)&sampler->buf[offset1.x + 4], r.x),
                 r.y);
  Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y],
                     *(Float*)&sampler->buf[offset0.y + 4], r.x),
                 mix(*(Float*)&sampler->buf[offset1.y],
                     *(Float*)&sampler->buf[offset1.y + 4], r.x),
                 r.y);
  Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z],
                     *(Float*)&sampler->buf[offset0.z + 4], r.x),
                 mix(*(Float*)&sampler->buf[offset1.z],
                     *(Float*)&sampler->buf[offset1.z + 4], r.x),
                 r.y);
  Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w],
                     *(Float*)&sampler->buf[offset0.w + 4], r.x),
                 mix(*(Float*)&sampler->buf[offset1.w],
                     *(Float*)&sampler->buf[offset1.w + 4], r.x),
                 r.y);
  // Transpose 4 filtered pixels into per-channel vectors.
  return pixel_float_to_vec4(c0, c1, c2, c3);
}
    871 
// Result of planar YUY2 sampling: the 8-bit Y/U/V channels widened to
// 16 bits, one lane per sampled pixel.
struct WidePlanarYUV8 {
  U16 y;
  U16 u;
  U16 v;
};
    877 
// Bilinearly filters 4 pixels of YUY2 data at the 7-bit fixed-point
// coordinates in i, returning separated (widened) Y, U, and V planes.
template <typename S>
SI WidePlanarYUV8 textureLinearPlanarYUY2(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::YUY2);

  // Low 7 bits of i are the filter fraction; the rest is the texel index.
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i, 2);
  // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
  // Get the selector for the pixel within the chunk.
  I32 selector = row0 & 1;
  // Align the row index to the chunk.
  row0 &= ~1;
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  // G only needs to be clamped to a pixel boundary for safe interpolation,
  // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk
  // boundary.
  frac.x &= (i.x >= 0);
  // First half of fracx blends G samples (per-pixel fraction); second half
  // blends BR samples (per-chunk fraction, hence the >> 1).
  auto fracx =
      CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3),
                      (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) &
                  0x7F,
              V8<int16_t>);
  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R
  // We always need to interpolate between (b,r) and (B,R).
  // Depending on selector we need to either interpolate between g0 and g1
  // or between g1 and G0. So for now we just interpolate both cases for g
  // and will select the appropriate one on output.
  auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>);
  auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>);
  // Combine with next row.
  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>);
  auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>);
  b0 += ((b1 - b0) * fracy.y) >> 7;

  auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>);
  auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>);
  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>);
  auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>);
  d0 += ((d1 - d0) * fracy.w) >> 7;

  // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and
  // g1,g1,g1,g1,r,r,r,r.
  auto abl = zipLow(a0, b0);
  auto cdl = zipLow(c0, d0);
  auto g0b = zip2Low(abl, cdl);
  auto g1r = zip2High(abl, cdl);

  // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and
  // and shifts, just shuffle here instead... We finally end up with
  // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R.
  auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15);
  auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15);
  auto g1B = zip2Low(abh, cdh);
  auto G0R = zip2High(abh, cdh);

  // Finally interpolate between adjacent columns.
  g0b += ((g1B - g0b) * fracx) >> 7;
  g1r += ((G0R - g1r) * fracx) >> 7;

  // Choose either g0 or g1 based on selector.
  return WidePlanarYUV8{
      U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))),
      U16(highHalf(g0b)), U16(highHalf(g1r))};
}
    951 
    952 template <typename S>
    953 vec4 textureLinearYUY2(S sampler, vec2 P) {
    954  ivec2 i(linearQuantize(P, 128, sampler));
    955  auto planar = textureLinearPlanarYUY2(sampler, i);
    956  auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f);
    957  auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f);
    958  auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f);
    959  return vec4(v, y, u, 1.0f);
    960 }
    961 
    962 SI vec4 texture(sampler2D sampler, vec2 P) {
    963  if (sampler->filter == TextureFilter::LINEAR) {
    964    switch (sampler->format) {
    965      case TextureFormat::RGBA32F:
    966        return textureLinearRGBA32F(sampler, P);
    967      case TextureFormat::RGBA8:
    968        return textureLinearRGBA8(sampler, P);
    969      case TextureFormat::R8:
    970        return textureLinearR8(sampler, P);
    971      case TextureFormat::RG8:
    972        return textureLinearRG8(sampler, P);
    973      case TextureFormat::R16:
    974        return textureLinearR16(sampler, P);
    975      case TextureFormat::RG16:
    976        return textureLinearRG16(sampler, P);
    977      case TextureFormat::YUY2:
    978        return textureLinearYUY2(sampler, P);
    979      default:
    980        assert(false);
    981        return vec4();
    982    }
    983  } else {
    984    ivec2 coord(roundzero(P.x, sampler->width),
    985                roundzero(P.y, sampler->height));
    986    return texelFetch(sampler, coord, 0);
    987  }
    988 }
    989 
    990 vec4 texture(sampler2DRect sampler, vec2 P) {
    991  if (sampler->filter == TextureFilter::LINEAR) {
    992    switch (sampler->format) {
    993      case TextureFormat::RGBA8:
    994        return textureLinearRGBA8(sampler, P);
    995      case TextureFormat::R8:
    996        return textureLinearR8(sampler, P);
    997      case TextureFormat::RG8:
    998        return textureLinearRG8(sampler, P);
    999      case TextureFormat::R16:
   1000        return textureLinearR16(sampler, P);
   1001      case TextureFormat::RG16:
   1002        return textureLinearRG16(sampler, P);
   1003      case TextureFormat::YUY2:
   1004        return textureLinearYUY2(sampler, P);
   1005      default:
   1006        assert(false);
   1007        return vec4();
   1008    }
   1009  } else {
   1010    ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f));
   1011    return texelFetch(sampler, coord);
   1012  }
   1013 }
   1014 
   1015 template <typename S>
   1016 vec4_scalar texture(S sampler, vec2_scalar P) {
   1017  return force_scalar(texture(sampler, vec2(P)));
   1018 }
   1019 
   1020 ivec2_scalar textureSize(sampler2D sampler, int) {
   1021  return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
   1022 }
   1023 
   1024 ivec2_scalar textureSize(sampler2DRect sampler) {
   1025  return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
   1026 }
   1027 
// Bilinearly filters 4 RGBA8 texels (one per SIMD lane) at the 7-bit
// fixed-point coordinates in i, returning unpacked 16-bit components.
template <typename S>
static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RGBA8);
  // Low 7 bits of i are the filter fraction; the rest is the texel index.
  ivec2 frac = i;
  i >>= 7;

  // Buffer offsets of the top-left texel for each lane, and of the row below,
  // plus the edge-clamped 7-bit X/Y blend fractions.
  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // For each lane, load the two horizontally adjacent pixels of both rows
  // and blend the rows: row0 + (row1 - row0) * fracy.
  auto a0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
  auto a1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
  auto b1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
  b0 += ((b1 - b0) * fracy.y) >> 7;

  // Regroup so left pixels and right pixels are in separate vectors, then
  // blend the columns with each lane's X fraction.
  auto abl = combine(lowHalf(a0), lowHalf(b0));
  auto abh = combine(highHalf(a0), highHalf(b0));
  abl += ((abh - abl) * fracx.xxxxyyyy) >> 7;

  auto c0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
  auto c1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
  auto d1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
  d0 += ((d1 - d0) * fracy.w) >> 7;

  auto cdl = combine(lowHalf(c0), lowHalf(d0));
  auto cdh = combine(highHalf(c0), highHalf(d0));
  cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7;

  // Splice the two filtered pixel pairs into one wide result.
  return combine(HalfRGBA8(abl), HalfRGBA8(cdl));
}
   1073 
   1074 template <typename S>
   1075 static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) {
   1076  return pack(textureLinearUnpackedRGBA8(sampler, i));
   1077 }
   1078 
   1079 template <typename S>
   1080 static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) {
   1081  assert(sampler->format == TextureFormat::RGBA8);
   1082  I32 row = computeRow(sampler, i, 0);
   1083  return combine(unaligned_load<V4<uint8_t>>(&sampler->buf[row.x]),
   1084                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.y]),
   1085                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.z]),
   1086                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.w]));
   1087 }
   1088 
   1089 template <typename S>
   1090 static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) {
   1091  return pack(textureLinearUnpackedR8(sampler, i));
   1092 }
   1093 
// Bilinearly filters 4 RG8 texels (one per SIMD lane) at the 7-bit
// fixed-point coordinates in i, returning unpacked 16-bit R and G pairs.
template <typename S>
static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RG8);
  // Low 7 bits of i are the filter fraction; the rest is the texel index.
  ivec2 frac = i & 0x7F;
  i >>= 7;

  // Buffer offsets of the top-left texel for each lane, and of the row below,
  // plus the edge-clamped 7-bit X/Y blend fractions.
  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // Each RG pixel is 2 bytes, so index the buffer in 16-bit units.
  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load RG bytes for two adjacent pixels - rgRG
  auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
  auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
  // Load two pixels for next row
  auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
  auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
  // Blend rows
  ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;

  auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
  auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
  auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
  auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
  // Blend rows
  cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;

  // ab = a.rgRG,b.rgRG
  // cd = c.rgRG,d.rgRG
  // ... ac = a.rg,c.rg,a.RG,c.RG
  // ... bd = b.rg,d.rg,b.RG,d.RG
  auto ac = zip2Low(ab0, cd0);
  auto bd = zip2High(ab0, cd0);
  // a.rg,b.rg,c.rg,d.rg
  // a.RG,b.RG,c.RG,d.RG
  auto abcdl = zip2Low(ac, bd);
  auto abcdh = zip2High(ac, bd);
  // Blend columns
  abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7;

  return WideRG8(abcdl);
}
   1142 
   1143 template <typename S>
   1144 static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) {
   1145  return pack(textureLinearUnpackedRG8(sampler, i));
   1146 }
   1147 
// Saturating unsigned 16-bit add for arbitrary vector widths. Unsigned
// addition wraps on overflow, so a lane's sum is less than its operand
// exactly when it overflowed; the lane-wise compare then yields an all-ones
// mask there, and the OR saturates the lane to the maximum value.
template <int N>
static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x,
                                                    VectorType<uint16_t, N> y) {
  auto r = x + y;
  return r | (r < x);
}
   1154 
// Saturating unsigned 16-bit add for 8-lane vectors, using native saturating
// instructions where available.
static inline V8<uint16_t> addsat(V8<uint16_t> x, V8<uint16_t> y) {
#if USE_SSE2
  return _mm_adds_epu16(x, y);
#elif USE_NEON
  return vqaddq_u16(x, y);
#else
  // Fallback: unsigned wraparound makes r < x true exactly in overflowed
  // lanes, producing an all-ones mask that ORs the lane up to the maximum.
  auto r = x + y;
  return r | (r < x);
#endif
}
   1165 
// Horizontally blurs one chunk of 4 pixels of type P starting at texel i,
// accumulating samples out to `radius` pixels on both sides. Gaussian
// weights are generated incrementally from the center coefficient `coeff`
// and the per-step factors `coeffStep`/`coeffStep2`. minX/maxX bound the
// span of the row that may be sampled; samples are clamped to it. Returns
// the blurred chunk with 8 bits of component precision (unpacked to 16-bit
// lanes).
template <typename P, typename S>
static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal(
    S sampler, const ivec2_scalar& i, int minX, int maxX, int radius,
    float coeff, float coeffStep) {
  // Packed and unpacked vectors for a chunk of the given pixel type.
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;

  // Pre-scale the coefficient by 8 bits of fractional precision, so that when
  // the sample is multiplied by it, it will yield a 16 bit unsigned integer
  // that will use all 16 bits of precision to accumulate the sum.
  coeff *= 1 << 8;
  float coeffStep2 = coeffStep * coeffStep;

  int row = computeRow(sampler, i);
  P* buf = (P*)sampler->buf;
  // Start the sum with the center chunk weighted by the center coefficient.
  auto pixelsRight = unaligned_load<V4<P>>(&buf[row]);
  auto pixelsLeft = pixelsRight;
  auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) *
             uint16_t(coeff + 0.5f);

  // Here we use some trickery to reuse the pixels within a chunk, shifted over
  // by one pixel, to get the next sample for the entire chunk. This allows us
  // to sample only one pixel for each offset across the entire chunk in both
  // the left and right directions. To avoid clamping within the loop to the
  // texture bounds, we compute the valid radius that doesn't require clamping
  // and fall back to a slower clamping loop outside of that valid radius.
  int offset = 1;
  // The left bound is how much we can offset the sample before the start of
  // the row bounds.
  int leftBound = i.x - max(minX, 0);
  // The right bound is how much we can offset the sample before the end of the
  // row bounds.
  int rightBound = min(maxX, sampler->width - 1) - i.x;
  // The right side reads 4 - 1 pixels past the offset, so reserve that margin.
  int validRadius = min(radius, min(leftBound, rightBound - (4 - 1)));
  for (; offset <= validRadius; offset++) {
    // Overwrite the pixel that needs to be shifted out with the new pixel, and
    // shift it into the correct location.
    pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]);
    pixelsRight = pixelsRight.yzwx;
    pixelsLeft = pixelsLeft.wxyz;
    pixelsLeft.x = unaligned_load<P>(&buf[row - offset]);

    // Accumulate the Gaussian coefficients step-wise.
    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    // Both left and right samples at this offset use the same coefficient.
    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Slow path for the remaining radius: same as above, but each sample offset
  // is clamped to the row bounds.
  for (; offset <= radius; offset++) {
    pixelsRight.x =
        unaligned_load<P>(&buf[row + min(offset + 4 - 1, rightBound)]);
    pixelsRight = pixelsRight.yzwx;
    pixelsLeft = pixelsLeft.wxyz;
    pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]);

    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Shift away the intermediate precision.
  return sum >> 8;
}
   1239 
// Vertically blurs one chunk of 4 pixels of type P at texel i, accumulating
// rows out to `radius` above and below. Gaussian weights are generated
// incrementally from `coeff` and `coeffStep`, as in gaussianBlurHorizontal.
// minY/maxY bound the rows that may be sampled; samples are clamped to them.
// Returns the blurred chunk with 8 bits of component precision (unpacked to
// 16-bit lanes).
template <typename P, typename S>
static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical(
    S sampler, const ivec2_scalar& i, int minY, int maxY, int radius,
    float coeff, float coeffStep) {
  // Packed and unpacked vectors for a chunk of the given pixel type.
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;

  // Pre-scale the coefficient by 8 bits of fractional precision, so that when
  // the sample is multiplied by it, it will yield a 16 bit unsigned integer
  // that will use all 16 bits of precision to accumulate the sum.
  coeff *= 1 << 8;
  float coeffStep2 = coeffStep * coeffStep;

  int rowAbove = computeRow(sampler, i);
  int rowBelow = rowAbove;
  P* buf = (P*)sampler->buf;
  // Start the sum with the center chunk weighted by the center coefficient.
  auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]);
  auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) *
             uint16_t(coeff + 0.5f);

  // For the vertical loop we can't be quite as creative with reusing old values
  // as we were in the horizontal loop. We just do the obvious implementation of
  // loading a chunk from each row in turn and accumulating it into the sum. We
  // compute a valid radius within which we don't need to clamp the sampled row
  // and use that to avoid any clamping in the main inner loop. We fall back to
  // a slower clamping loop outside of that valid radius.
  int offset = 1;
  int belowBound = i.y - max(minY, 0);
  int aboveBound = min(maxY, sampler->height - 1) - i.y;
  int validRadius = min(radius, min(belowBound, aboveBound));
  for (; offset <= validRadius; offset++) {
    rowAbove += sampler->stride;
    rowBelow -= sampler->stride;
    auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
    auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);

    // Accumulate the Gaussian coefficients step-wise.
    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    // Both above and below samples at this offset use the same coefficient.
    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Slow path for the remaining radius: stop advancing a row pointer once it
  // reaches its bound, so out-of-range offsets resample the edge row.
  for (; offset <= radius; offset++) {
    if (offset <= aboveBound) {
      rowAbove += sampler->stride;
    }
    if (offset <= belowBound) {
      rowBelow -= sampler->stride;
    }
    auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
    auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);

    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Shift away the intermediate precision.
  return sum >> 8;
}
   1310 
   1311 }  // namespace glsl