texture.h (45631B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

namespace glsl {

using PackedRGBA8 = V16<uint8_t>;
using WideRGBA8 = V16<uint16_t>;
using HalfRGBA8 = V8<uint16_t>;

// Widen 4 packed 8-bit RGBA pixels to 16 bits per component.
SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); }

// Fallback for targets without a native saturating pack instruction.
template <int N>
UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) {
  typedef VectorType<uint8_t, N> packed_type;
  // Generic conversions only mask off the low byte without actually clamping
  // like a real pack. First force the word to all 1s if it overflows, and then
  // add on the sign bit to cause it to roll over to 0 if it was negative.
  p = (p | (p > 255)) + (p >> 15);
  return CONVERT(p, packed_type);
}

// Saturating pack of wide 16-bit RGBA back down to 8 bits per component.
SI PackedRGBA8 pack(WideRGBA8 p) {
#if USE_SSE2
  return _mm_packus_epi16(lowHalf(p), highHalf(p));
#elif USE_NEON
  return vcombine_u8(vqmovun_s16(bit_cast<V8<int16_t>>(lowHalf(p))),
                     vqmovun_s16(bit_cast<V8<int16_t>>(highHalf(p))));
#else
  return genericPackWide(p);
#endif
}

using PackedR8 = V4<uint8_t>;
using WideR8 = V4<uint16_t>;

// Widen 4 packed 8-bit R samples to 16 bits per component.
SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); }

// Saturating pack of wide 16-bit R samples down to 8 bits.
SI PackedR8 pack(WideR8 p) {
#if USE_SSE2
  auto m = expand(p);
  auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m));
  return SHUFFLE(r, r, 0, 1, 2, 3);
#elif USE_NEON
  return lowHalf(
      bit_cast<V8<uint8_t>>(vqmovun_s16(bit_cast<V8<int16_t>>(expand(p)))));
#else
  return genericPackWide(p);
#endif
}

using PackedRG8 = V8<uint8_t>;
using WideRG8 = V8<uint16_t>;

// Saturating pack of wide 16-bit RG samples down to 8 bits.
SI PackedRG8 pack(WideRG8 p) {
#if USE_SSE2
  return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p)));
#elif USE_NEON
  return bit_cast<V8<uint8_t>>(vqmovun_s16(bit_cast<V8<int16_t>>(p)));
#else
  return genericPackWide(p);
#endif
}

// Clamp a vector of coordinates to [base, limit-1].
// NOTE(review): the SSE2 path uses 16-bit saturating min/max on 32-bit lanes,
// which presumably assumes coordinates and limits fit in signed 16 bits —
// TODO confirm against texture size limits.
SI I32 clampCoord(I32 coord, int limit, int base = 0) {
#if USE_SSE2
  return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)),
                       _mm_set1_epi32(limit - 1));
#else
  return clamp(coord, base, limit - 1);
#endif
}

// Scalar overload: clamp a single coordinate to [base, limit-1].
SI int clampCoord(int coord, int limit, int base = 0) {
  return min(max(coord, base), limit - 1);
}

// Clamp a 2D coordinate (vector or scalar) to the sampler's bounds.
template <typename T, typename S>
SI T clamp2D(T P, S sampler) {
  return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)};
}

// Normalize an 8-bit channel value to [0, 1].
SI float to_float(uint32_t x) { return x * (1.f / 255.f); }

// Convert 4 packed BGRA8 pixels into a normalized RGBA vec4.
// Byte lanes: bits 16-23 -> r, 8-15 -> g, 0-7 -> b, 24-31 -> a.
SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  U32 pixels = {a, b, c, d};
  return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF),
              cast(pixels & 0xFF), cast(pixels >> 24)) *
         (1.0f / 255.0f);
}

// Transpose 4 per-pixel RGBA float quads into planar vec4 form.
SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) {
  return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y},
              Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w});
}

// Transpose 4 per-pixel RGBA int quads into planar ivec4 form.
SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) {
  return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y},
               I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w});
}

// Scalar variant: convert one packed pixel to a normalized vec4_scalar.
SI vec4_scalar pixel_to_vec4(uint32_t p) {
  U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24};
  Float f = cast(i) * (1.0f / 255.0f);
  return vec4_scalar(f.x, f.y, f.z, f.w);
}

// Gather 4 RGBA8 pixels at the given linear offsets.
template <typename S>
SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) {
  return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y],
                       sampler->buf[offset.z], sampler->buf[offset.w]);
}

// Unfiltered fetch of 4 RGBA8 texels.
template <typename S>
vec4 texelFetchRGBA8(S sampler, ivec2 P) {
  I32 offset = P.x + P.y * sampler->stride;
  return fetchOffsetsRGBA8(sampler, offset);
}

// Gather 4 R8 texels at the given byte offsets, normalized to [0, 1].
template <typename S>
SI Float fetchOffsetsR8(S sampler, I32 offset) {
  U32 i = {
      ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y],
      ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]};
  return cast(i) * (1.0f / 255.0f);
}

// Unfiltered fetch of 4 R8 texels, expanded to (r, 0, 0, 1).
template <typename S>
vec4 texelFetchR8(S sampler, ivec2 P) {
  I32 offset = P.x + P.y * sampler->stride;
  return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
}

// Gather 4 RG8 texels; R is the low byte, G the high byte of each 16-bit pair.
template <typename S>
SI vec4 fetchOffsetsRG8(S sampler, I32 offset) {
  uint16_t* buf = (uint16_t*)sampler->buf;
  U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]};
  Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f);
  Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f);
  return vec4(r, g, 0.0f, 1.0f);
}

// Unfiltered fetch of 4 RG8 texels, expanded to (r, g, 0, 1).
template <typename S>
vec4 texelFetchRG8(S sampler, ivec2 P) {
  I32 offset = P.x + P.y * sampler->stride;
  return fetchOffsetsRG8(sampler, offset);
}

// Gather 4 R16 texels, normalized to [0, 1] with a 16-bit scale.
template <typename S>
SI Float fetchOffsetsR16(S sampler, I32 offset) {
  U32 i = {
      ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y],
      ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]};
  return cast(i) * (1.0f / 65535.0f);
}

// Unfiltered fetch of 4 R16 texels, expanded to (r, 0, 0, 1).
template <typename S>
vec4 texelFetchR16(S sampler, ivec2 P) {
  I32 offset = P.x + P.y * sampler->stride;
  return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f);
}

// Gather 4 RG16 texels; R is the low word, G the high word of each 32 bits.
template <typename S>
SI vec4 fetchOffsetsRG16(S sampler, I32 offset) {
  U32 pixels = {sampler->buf[offset.x], sampler->buf[offset.y],
                sampler->buf[offset.z], sampler->buf[offset.w]};
  Float r = cast(pixels & 0xFFFF) * (1.0f / 65535.0f);
  Float g = cast(pixels >> 16) * (1.0f / 65535.0f);
  return vec4(r, g, 0.0f, 1.0f);
}

// Unfiltered fetch of 4 RG16 texels, expanded to (r, g, 0, 1).
template <typename S>
vec4 texelFetchRG16(S sampler, ivec2 P) {
  I32 offset = P.x + P.y * sampler->stride;
  return fetchOffsetsRG16(sampler, offset);
}

// Gather 4 full RGBA32F texels (each occupying 4 buffer words).
SI vec4 fetchOffsetsFloat(const uint32_t* buf, I32 offset) {
  return pixel_float_to_vec4(*(Float*)&buf[offset.x], *(Float*)&buf[offset.y],
                             *(Float*)&buf[offset.z], *(Float*)&buf[offset.w]);
}

SI vec4 fetchOffsetsFloat(samplerCommon* sampler, I32 offset) {
  return fetchOffsetsFloat(sampler->buf, offset);
}

// Unfiltered fetch of 4 RGBA32F texels (x4 offset: 4 words per texel).
vec4 texelFetchFloat(sampler2D sampler, ivec2 P) {
  I32 offset = P.x * 4 + P.y * sampler->stride;
  return fetchOffsetsFloat(sampler, offset);
}

// Gather 4 YUY2 texels and return them as (r=V, g=Y, b=U, 1).
template <typename S>
SI vec4 fetchOffsetsYUY2(S sampler, I32 offset) {
  // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
  // Offset is aligned to a chunk rather than a pixel, and selector specifies
  // pixel within the chunk.
  I32 selector = offset & 1;
  offset &= ~1;
  uint16_t* buf = (uint16_t*)sampler->buf;
  U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y],
                *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]};
  Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f);
  Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f);
  // Pick G0 or G1 from the chunk depending on which pixel was addressed.
  Float g =
      CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) *
      (1.0f / 255.0f);
  return vec4(r, g, b, 1.0f);
}

// Unfiltered fetch of 4 YUY2 texels.
template <typename S>
vec4 texelFetchYUY2(S sampler, ivec2 P) {
  I32 offset = P.x + P.y * sampler->stride;
  return fetchOffsetsYUY2(sampler, offset);
}

// Generic texelFetch that dispatches on the sampler's runtime format.
// Coordinates are clamped to the texture bounds; only lod 0 is supported.
vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  switch (sampler->format) {
    case TextureFormat::RGBA32F:
      return texelFetchFloat(sampler, P);
    case TextureFormat::RGBA8:
      return texelFetchRGBA8(sampler, P);
    case TextureFormat::R8:
      return texelFetchR8(sampler, P);
    case TextureFormat::RG8:
      return texelFetchRG8(sampler, P);
    case TextureFormat::R16:
      return texelFetchR16(sampler, P);
    case TextureFormat::RG16:
      return texelFetchRG16(sampler, P);
    case TextureFormat::YUY2:
      return texelFetchYUY2(sampler, P);
    default:
      assert(false);
      return vec4();
  }
}

// Format-specialized overloads: the sampler type pins the format, so these
// skip the runtime dispatch above.
vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RGBA32F);
  return texelFetchFloat(sampler, P);
}

vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RGBA8);
  return texelFetchRGBA8(sampler, P);
}

vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::R8);
  return texelFetchR8(sampler, P);
}

vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RG8);
  return texelFetchRG8(sampler, P);
}

// Scalar (single-texel) fetch; only RGBA32F and RGBA8 are handled here.
vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  if (sampler->format == TextureFormat::RGBA32F) {
    return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
  } else {
    assert(sampler->format == TextureFormat::RGBA8);
    return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
  }
}

vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RGBA32F);
  return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
}

vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RGBA8);
  return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
}

vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::R8);
  return vec4_scalar{
      to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f,
      0.0f, 1.0f};
}

vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RG8);
  uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride];
  return vec4_scalar{to_float(pixel & 0xFF), to_float(pixel >> 8), 0.0f, 1.0f};
}

// Rectangle-sampler fetch: unnormalized coordinates, no lod parameter.
vec4 texelFetch(sampler2DRect sampler, ivec2 P) {
  P = clamp2D(P, sampler);
  switch (sampler->format) {
    case TextureFormat::RGBA8:
      return texelFetchRGBA8(sampler, P);
    case TextureFormat::R8:
      return texelFetchR8(sampler, P);
    case TextureFormat::RG8:
      return texelFetchRG8(sampler, P);
    case TextureFormat::R16:
      return texelFetchR16(sampler, P);
    case TextureFormat::RG16:
      return texelFetchRG16(sampler, P);
    case TextureFormat::YUY2:
      return texelFetchYUY2(sampler, P);
    default:
      assert(false);
      return vec4();
  }
}

// Gather 4 RGBA32I texels (4 words each) at the given offsets.
SI ivec4 fetchOffsetsInt(const uint32_t* buf, I32 offset) {
  return pixel_int_to_ivec4(*(I32*)&buf[offset.x], *(I32*)&buf[offset.y],
                            *(I32*)&buf[offset.z], *(I32*)&buf[offset.w]);
}

SI ivec4 fetchOffsetsInt(samplerCommon* sampler, I32 offset) {
  return fetchOffsetsInt(sampler->buf, offset);
}

// Integer-sampler fetch (RGBA32I only).
ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RGBA32I);
  I32 offset = P.x * 4 + P.y * sampler->stride;
  return fetchOffsetsInt(sampler, offset);
}

ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) {
  assert(lod == 0);
  P = clamp2D(P, sampler);
  assert(sampler->format == TextureFormat::RGBA32I);
  return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
}

constexpr int MAX_TEXEL_OFFSET = 8;

// Fill texelFetchOffset outside the valid texture bounds with zeroes. The
// stride will be set to 0 so that only one row of zeroes is needed.
ALIGNED_DECL(
    16, static const uint32_t zeroFetchBuf[MAX_TEXEL_OFFSET * sizeof(Float) /
                                           sizeof(uint32_t)]) = {0};

// Precomputed base pointer + stride for repeated scalar unchecked fetches.
struct FetchScalar {
  const uint32_t* buf;
  uint32_t stride;
};

// Validate that P plus any offset in [min_x, max_x] x [min_y, max_y] stays in
// bounds; if not, return the shared zero buffer so subsequent unchecked
// fetches read zeroes instead of out-of-bounds memory.
template <typename S>
SI FetchScalar texelFetchPtr(S sampler, ivec2_scalar P, int min_x, int max_x,
                             int min_y, int max_y) {
  assert(max_x < MAX_TEXEL_OFFSET);
  if (P.x < -min_x || P.x >= int(sampler->width) - max_x || P.y < -min_y ||
      P.y >= int(sampler->height) - max_y) {
    return FetchScalar{zeroFetchBuf, 0};
  }
  return FetchScalar{&sampler->buf[P.x * 4 + P.y * sampler->stride],
                     sampler->stride};
}

// Fetch relative to a FetchScalar without bounds checks (already validated).
SI vec4_scalar texelFetchUnchecked(sampler2D sampler, FetchScalar ptr, int x,
                                   int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32F);
  return *(vec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride];
}

SI ivec4_scalar texelFetchUnchecked(isampler2D sampler, FetchScalar ptr, int x,
                                    int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32I);
  return *(ivec4_scalar*)&ptr.buf[x * 4 + y * ptr.stride];
}

// Vector analogue of FetchScalar: per-lane offsets plus a shared base/stride.
struct FetchVector {
  const uint32_t* buf;
  I32 offset;
  uint32_t stride;
};

// Vector variant: falls back to the zero buffer if ANY lane would go out of
// bounds for any offset in the requested rectangle.
template <typename S>
SI FetchVector texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x,
                             int min_y, int max_y) {
  assert(max_x < MAX_TEXEL_OFFSET);
  if (test_any(P.x < -min_x || P.x >= int(sampler->width) - max_x ||
               P.y < -min_y || P.y >= int(sampler->height) - max_y)) {
    return FetchVector{zeroFetchBuf, I32(0), 0};
  }
  return FetchVector{sampler->buf, P.x * 4 + P.y * sampler->stride,
                     sampler->stride};
}

SI vec4 texelFetchUnchecked(sampler2D sampler, FetchVector ptr, int x,
                            int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32F);
  return fetchOffsetsFloat(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset);
}

SI ivec4 texelFetchUnchecked(isampler2D sampler, FetchVector ptr, int x,
                             int y = 0) {
  assert(sampler->format == TextureFormat::RGBA32I);
  return fetchOffsetsInt(&ptr.buf[x * 4 + y * ptr.stride], ptr.offset);
}

#define texelFetchOffset(sampler, P, lod, offset) \
  texelFetch(sampler, (P) + (offset), lod)

// Scale texture coords for quantization, subtract offset for filtering
// (assuming coords already offset to texel centers), and round to nearest
// 1/scale increment
template <typename T>
SI T linearQuantize(T P, float scale) {
  return P * scale + (0.5f - 0.5f * scale);
}

// Helper version that also scales normalized texture coords for sampler
template <typename T, typename S>
SI T samplerScale(S sampler, T P) {
  P.x *= sampler->width;
  P.y *= sampler->height;
  return P;
}

// Rectangle samplers already use unnormalized coords; no scaling needed.
template <typename T>
SI T samplerScale(UNUSED sampler2DRect sampler, T P) {
  return P;
}

template <typename T, typename S>
SI T linearQuantize(T P, float scale, S sampler) {
  return linearQuantize(samplerScale(sampler, P), scale);
}

// Compute clamped offset of first row for linear interpolation
template <typename S, typename I>
SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) {
  return clampCoord(i.x, sampler->width - margin) +
         clampCoord(i.y, sampler->height) * sampler->stride;
}

// Compute clamped offset of second row for linear interpolation from first row
template <typename S, typename I>
SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) {
  return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1,
                      sampler->stride, 0);
}

// Convert X coordinate to a 2^7 scale fraction for interpolation
// Zeroes the fraction when i.x is negative and forces it to 127 (full weight
// on the second column) when sampling would read past the last column.
template <typename S>
SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) {
  auto overread = i.x > int32_t(sampler->width) - 2;
  return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16);
}

// Convert Y coordinate to a 2^7 scale fraction for interpolation
SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); }
SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); }

// Bilinearly filtered RGBA8 result, kept planar (rg / ba) at 16 bits wide.
struct WidePlanarRGBA8 {
  V8<uint16_t> rg;
  V8<uint16_t> ba;
};

// Bilinear RGBA8 filter for 4 sample positions. `i` holds coords in 1/128
// texel fixed point (low 7 bits are the filter fraction).
template <typename S>
SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RGBA8);

  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // For each of the 4 sample positions, load a pair of adjacent pixels from
  // both rows and lerp vertically with the 7-bit fraction.
  auto a0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
  auto a1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
  auto b1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
  b0 += ((b1 - b0) * fracy.y) >> 7;

  // Interleave samples a and b, then lerp horizontally between the columns.
  auto abl = zipLow(a0, b0);
  auto abh = zipHigh(a0, b0);
  abl += ((abh - abl) * fracx.xyxyxyxy) >> 7;

  auto c0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
  auto c1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
  auto d1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
  d0 += ((d1 - d0) * fracy.w) >> 7;

  auto cdl = zipLow(c0, d0);
  auto cdh = zipHigh(c0, d0);
  cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7;

  // Regroup the per-pixel results into rg and ba planes.
  auto rg = V8<uint16_t>(zip2Low(abl, cdl));
  auto ba = V8<uint16_t>(zip2High(abl, cdl));
  return WidePlanarRGBA8{rg, ba};
}

// Bilinear RGBA8 sample converted to normalized float vec4.
template <typename S>
vec4 textureLinearRGBA8(S sampler, vec2 P) {
  ivec2 i(linearQuantize(P, 128, sampler));
  auto planar = textureLinearPlanarRGBA8(sampler, i);
  auto rg = CONVERT(planar.rg, V8<float>);
  auto ba = CONVERT(planar.ba, V8<float>);
  auto r = lowHalf(rg);
  auto g = highHalf(rg);
  auto b = lowHalf(ba);
  auto a = highHalf(ba);
  return vec4(b, g, r, a) * (1.0f / 255.0f);
}

// Bilinear R8 filter for 4 sample positions; returns raw 0-255 range values.
template <typename S>
static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::R8);
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // Load 2 adjacent texels per sample from both rows, lerp rows vertically.
  uint8_t* buf = (uint8_t*)sampler->buf;
  auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]);
  auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]);
  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);

  auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]);
  auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]);
  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);

  abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;

  // Split left/right columns apart, then lerp horizontally.
  abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
  auto abcdl = lowHalf(abcd0);
  auto abcdh = highHalf(abcd0);
  abcdl += ((abcdh - abcdl) * fracx) >> 7;

  return U16(abcdl);
}

// Bilinear R8 sample expanded to normalized (r, 0, 0, 1).
template <typename S>
vec4 textureLinearR8(S sampler, vec2 P) {
  assert(sampler->format == TextureFormat::R8);

  ivec2 i(linearQuantize(P, 128, sampler));
  Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float);
  return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);
}

// Bilinearly filtered RG8 result, kept planar at 16 bits wide.
struct WidePlanarRG8 {
  V8<uint16_t> rg;
};

// Bilinear RG8 filter for 4 sample positions in 1/128 texel fixed point.
template <typename S>
SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RG8);

  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load RG bytes for two adjacent pixels - rgRG
  auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
  auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
  // Load two pixels for next row
  auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
  auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
  // Blend rows
  ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;

  auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
  auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
  auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
  auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
  // Blend rows
  cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;

  // ab = a.rgRG,b.rgRG
  // cd = c.rgRG,d.rgRG
  // ... ac = ar,cr,ag,cg,aR,cR,aG,cG
  // ... bd = br,dr,bg,dg,bR,dR,bG,dG
  auto ac = zipLow(ab0, cd0);
  auto bd = zipHigh(ab0, cd0);
  // ar,br,cr,dr,ag,bg,cg,dg
  // aR,bR,cR,dR,aG,bG,cG,dG
  auto abcdl = zipLow(ac, bd);
  auto abcdh = zipHigh(ac, bd);
  // Blend columns
  abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7;

  auto rg = V8<uint16_t>(abcdl);
  return WidePlanarRG8{rg};
}

// Bilinear RG8 sample converted to normalized (r, g, 0, 1).
template <typename S>
vec4 textureLinearRG8(S sampler, vec2 P) {
  ivec2 i(linearQuantize(P, 128, sampler));
  auto planar = textureLinearPlanarRG8(sampler, i);
  auto rg = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f);
  auto r = lowHalf(rg);
  auto g = highHalf(rg);
  return vec4(r, g, 0.0f, 1.0f);
}

// Samples R16 texture with linear filtering and returns results packed as
// signed I16. One bit of precision is shifted away from the bottom end to
// accommodate the sign bit, so only 15 bits of precision is left.
template <typename S>
static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::R16);

  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);

  // Fractions are shifted up to the top of the 16-bit range so that the
  // high-multiply paths below can use them directly.
  I16 fracx =
      CONVERT(
          ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,
          I16)
      << 8;
  I16 fracy = computeFracY(frac) << 8;

  // Sample the 16 bit data for both rows
  uint16_t* buf = (uint16_t*)sampler->buf;
  auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]);
  auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]);
  auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]);
  auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>);

  auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]);
  auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]);
  auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>);

  // The samples occupy 15 bits and the fraction occupies 15 bits, so that when
  // they are multiplied together, the new scaled sample will fit in the high
  // 14 bits of the result. It is left shifted once to make it 15 bits again
  // for the final multiply.
#if USE_SSE2
  abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww))
           << 1;
#elif USE_NEON
  // NEON has a convenient instruction that does both the multiply and the
  // doubling, so doesn't need an extra shift.
  abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww));
#else
  abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) *
                    CONVERT(fracy.xxyyzzww, V8<int32_t>)) >>
                       16,
                   V8<int16_t>)
           << 1;
#endif

  abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
  auto abcdl = lowHalf(abcd0);
  auto abcdh = highHalf(abcd0);
#if USE_SSE2
  abcdl += lowHalf(bit_cast<V8<int16_t>>(
               _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx))))
           << 1;
#elif USE_NEON
  abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx));
#else
  abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) *
                    CONVERT(fracx, V4<int32_t>)) >>
                       16,
                   V4<int16_t>)
           << 1;
#endif

  return abcdl;
}

// Bilinear R16 sample expanded to normalized (r, 0, 0, 1); the 15-bit
// intermediate is rescaled by 1/32767.
template <typename S>
vec4 textureLinearR16(S sampler, vec2 P) {
  assert(sampler->format == TextureFormat::R16);

  ivec2 i(linearQuantize(P, 128, sampler));
  Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float);
  return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f);
}

// Samples RG16 texture with linear filtering and returns results packed as
// signed I16. One bit of precision is shifted away from the bottom end to
// accommodate the sign bit, so only 15 bits of precision is left.
736 template <typename S> 737 static inline V8<int16_t> textureLinearUnpackedRG16(S sampler, ivec2 i) { 738 assert(sampler->format == TextureFormat::RG16); 739 740 ivec2 frac = i; 741 i >>= 7; 742 743 I32 row0 = computeRow(sampler, i); 744 I32 row1 = row0 + computeNextRowOffset(sampler, i); 745 746 I16 fracx = 747 CONVERT( 748 ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F, 749 I16) 750 << 8; 751 I16 fracy = computeFracY(frac) << 8; 752 753 // Sample the 2x16 bit data for both rows 754 auto a0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.x]); 755 auto b0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.y]); 756 auto ab0 = CONVERT(combine(a0, b0) >> 1, V8<int16_t>); 757 auto c0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.z]); 758 auto d0 = unaligned_load<V4<uint16_t>>(&sampler->buf[row0.w]); 759 auto cd0 = CONVERT(combine(c0, d0) >> 1, V8<int16_t>); 760 761 auto a1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.x]); 762 auto b1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.y]); 763 auto ab1 = CONVERT(combine(a1, b1) >> 1, V8<int16_t>); 764 auto c1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.z]); 765 auto d1 = unaligned_load<V4<uint16_t>>(&sampler->buf[row1.w]); 766 auto cd1 = CONVERT(combine(c1, d1) >> 1, V8<int16_t>); 767 768 // The samples occupy 15 bits and the fraction occupies 15 bits, so that when 769 // they are multiplied together, the new scaled sample will fit in the high 770 // 14 bits of the result. It is left shifted once to make it 15 bits again 771 // for the final multiply. 772 #if USE_SSE2 773 ab0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(ab1 - ab0, fracy.xxxxyyyy)) << 1; 774 cd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(cd1 - cd0, fracy.zzzzwwww)) << 1; 775 #elif USE_NEON 776 // NEON has a convenient instruction that does both the multiply and the 777 // doubling, so doesn't need an extra shift. 
778 ab0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(ab1 - ab0, fracy.xxxxyyyy)); 779 cd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(cd1 - cd0, fracy.zzzzwwww)); 780 #else 781 ab0 += CONVERT((CONVERT(ab1 - ab0, V8<int32_t>) * 782 CONVERT(fracy.xxxxyyyy, V8<int32_t>)) >> 783 16, 784 V8<int16_t>) 785 << 1; 786 cd0 += CONVERT((CONVERT(cd1 - cd0, V8<int32_t>) * 787 CONVERT(fracy.zzzzwwww, V8<int32_t>)) >> 788 16, 789 V8<int16_t>) 790 << 1; 791 #endif 792 793 // ab = a.rgRG,b.rgRG 794 // cd = c.rgRG,d.rgRG 795 // ... ac = a.rg,c.rg,a.RG,c.RG 796 // ... bd = b.rg,d.rg,b.RG,d.RG 797 auto ac = zip2Low(ab0, cd0); 798 auto bd = zip2High(ab0, cd0); 799 // a.rg,b.rg,c.rg,d.rg 800 // a.RG,b.RG,c.RG,d.RG 801 auto abcdl = zip2Low(ac, bd); 802 auto abcdh = zip2High(ac, bd); 803 // Blend columns 804 #if USE_SSE2 805 abcdl += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcdh - abcdl, fracx.xxyyzzww)) 806 << 1; 807 #elif USE_NEON 808 abcdl += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcdh - abcdl, fracx.xxyyzzww)); 809 #else 810 abcdl += CONVERT((CONVERT(abcdh - abcdl, V8<int32_t>) * 811 CONVERT(fracx.xxyyzzww, V8<int32_t>)) >> 812 16, 813 V8<int16_t>) 814 << 1; 815 #endif 816 817 return abcdl; 818 } 819 820 template <typename S> 821 vec4 textureLinearRG16(S sampler, vec2 P) { 822 assert(sampler->format == TextureFormat::RG16); 823 824 ivec2 i(linearQuantize(P, 128, sampler)); 825 auto rg = bit_cast<V4<int32_t>>(textureLinearUnpackedRG16(sampler, i)); 826 auto r = cast(rg & 0xFFFF) * (1.0f / 32767.0f); 827 auto g = cast(rg >> 16) * (1.0f / 32767.0f); 828 return vec4(r, g, 0.0f, 1.0f); 829 } 830 831 using PackedRGBA32F = V16<float>; 832 using WideRGBA32F = V16<float>; 833 834 template <typename S> 835 vec4 textureLinearRGBA32F(S sampler, vec2 P) { 836 assert(sampler->format == TextureFormat::RGBA32F); 837 P = samplerScale(sampler, P); 838 P -= 0.5f; 839 vec2 f = floor(P); 840 vec2 r = P - f; 841 ivec2 i(f); 842 ivec2 c(clampCoord(i.x, sampler->width - 1), 843 clampCoord(i.y, sampler->height)); 844 r.x = 
if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0), 845 0.0f); 846 I32 offset0 = c.x * 4 + c.y * sampler->stride; 847 I32 offset1 = offset0 + computeNextRowOffset(sampler, i); 848 849 Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x], 850 *(Float*)&sampler->buf[offset0.x + 4], r.x), 851 mix(*(Float*)&sampler->buf[offset1.x], 852 *(Float*)&sampler->buf[offset1.x + 4], r.x), 853 r.y); 854 Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y], 855 *(Float*)&sampler->buf[offset0.y + 4], r.x), 856 mix(*(Float*)&sampler->buf[offset1.y], 857 *(Float*)&sampler->buf[offset1.y + 4], r.x), 858 r.y); 859 Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z], 860 *(Float*)&sampler->buf[offset0.z + 4], r.x), 861 mix(*(Float*)&sampler->buf[offset1.z], 862 *(Float*)&sampler->buf[offset1.z + 4], r.x), 863 r.y); 864 Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w], 865 *(Float*)&sampler->buf[offset0.w + 4], r.x), 866 mix(*(Float*)&sampler->buf[offset1.w], 867 *(Float*)&sampler->buf[offset1.w + 4], r.x), 868 r.y); 869 return pixel_float_to_vec4(c0, c1, c2, c3); 870 } 871 872 struct WidePlanarYUV8 { 873 U16 y; 874 U16 u; 875 U16 v; 876 }; 877 878 template <typename S> 879 SI WidePlanarYUV8 textureLinearPlanarYUY2(S sampler, ivec2 i) { 880 assert(sampler->format == TextureFormat::YUY2); 881 882 ivec2 frac = i; 883 i >>= 7; 884 885 I32 row0 = computeRow(sampler, i, 2); 886 // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. 887 // Get the selector for the pixel within the chunk. 888 I32 selector = row0 & 1; 889 // Align the row index to the chunk. 890 row0 &= ~1; 891 I32 row1 = row0 + computeNextRowOffset(sampler, i); 892 // G only needs to be clamped to a pixel boundary for safe interpolation, 893 // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk 894 // boundary. 
895 frac.x &= (i.x >= 0); 896 auto fracx = 897 CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3), 898 (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) & 899 0x7F, 900 V8<int16_t>); 901 I16 fracy = computeFracY(frac); 902 903 uint16_t* buf = (uint16_t*)sampler->buf; 904 905 // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R 906 // We always need to interpolate between (b,r) and (B,R). 907 // Depending on selector we need to either interpolate between g0 and g1 908 // or between g1 and G0. So for now we just interpolate both cases for g 909 // and will select the appropriate one on output. 910 auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>); 911 auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>); 912 // Combine with next row. 913 a0 += ((a1 - a0) * fracy.x) >> 7; 914 915 auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>); 916 auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>); 917 b0 += ((b1 - b0) * fracy.y) >> 7; 918 919 auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>); 920 auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>); 921 c0 += ((c1 - c0) * fracy.z) >> 7; 922 923 auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>); 924 auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>); 925 d0 += ((d1 - d0) * fracy.w) >> 7; 926 927 // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and 928 // g1,g1,g1,g1,r,r,r,r. 929 auto abl = zipLow(a0, b0); 930 auto cdl = zipLow(c0, d0); 931 auto g0b = zip2Low(abl, cdl); 932 auto g1r = zip2High(abl, cdl); 933 934 // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and 935 // and shifts, just shuffle here instead... We finally end up with 936 // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R. 
auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15);
  auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15);
  auto g1B = zip2Low(abh, cdh);
  auto G0R = zip2High(abh, cdh);

  // Finally interpolate between adjacent columns (7-bit fixed-point blend).
  g0b += ((g1B - g0b) * fracx) >> 7;
  g1r += ((G0R - g1r) * fracx) >> 7;

  // Choose either g0 or g1 based on selector.
  return WidePlanarYUV8{
      U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))),
      U16(highHalf(g0b)), U16(highHalf(g1r))};
}

// Bilinear YUY2 sample with each plane normalized from [0, 255] to [0, 1]
// floats. Note the channel order of the returned vector: vec4(v, y, u, 1).
template <typename S>
vec4 textureLinearYUY2(S sampler, vec2 P) {
  // Quantize to a fixed-point coordinate with 7 bits of subpixel fraction.
  ivec2 i(linearQuantize(P, 128, sampler));
  auto planar = textureLinearPlanarYUY2(sampler, i);
  auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f);
  auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f);
  auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f);
  return vec4(v, y, u, 1.0f);
}

// Generic sampling entry point for 2D samplers: dispatches on the sampler's
// filter mode and texture format.
SI vec4 texture(sampler2D sampler, vec2 P) {
  if (sampler->filter == TextureFilter::LINEAR) {
    switch (sampler->format) {
      case TextureFormat::RGBA32F:
        return textureLinearRGBA32F(sampler, P);
      case TextureFormat::RGBA8:
        return textureLinearRGBA8(sampler, P);
      case TextureFormat::R8:
        return textureLinearR8(sampler, P);
      case TextureFormat::RG8:
        return textureLinearRG8(sampler, P);
      case TextureFormat::R16:
        return textureLinearR16(sampler, P);
      case TextureFormat::RG16:
        return textureLinearRG16(sampler, P);
      case TextureFormat::YUY2:
        return textureLinearYUY2(sampler, P);
      default:
        assert(false);
        return vec4();
    }
  } else {
    // Nearest filtering: round toward zero to integer texel coordinates,
    // scaling by the texture dimensions.
    ivec2 coord(roundzero(P.x, sampler->width),
                roundzero(P.y, sampler->height));
    return texelFetch(sampler, coord, 0);
  }
}

// Rect-sampler variant of the dispatch above. Note RGBA32F is not handled
// here; coordinates are scaled by 1 rather than the texture size (rect
// samplers use unnormalized coordinates).
vec4 texture(sampler2DRect sampler, vec2 P) {
  if (sampler->filter == TextureFilter::LINEAR) {
    switch (sampler->format) {
      case TextureFormat::RGBA8:
        return textureLinearRGBA8(sampler, P);
      case
TextureFormat::R8:
        return textureLinearR8(sampler, P);
      case TextureFormat::RG8:
        return textureLinearRG8(sampler, P);
      case TextureFormat::R16:
        return textureLinearR16(sampler, P);
      case TextureFormat::RG16:
        return textureLinearRG16(sampler, P);
      case TextureFormat::YUY2:
        return textureLinearYUY2(sampler, P);
      default:
        assert(false);
        return vec4();
    }
  } else {
    // Nearest filtering with unnormalized coordinates: no scaling by the
    // texture size, just round toward zero.
    ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f));
    return texelFetch(sampler, coord);
  }
}

// Scalar convenience overload: widens the coordinate to a vector, samples,
// and narrows the result back to a scalar via force_scalar.
template <typename S>
vec4_scalar texture(S sampler, vec2_scalar P) {
  return force_scalar(texture(sampler, vec2(P)));
}

// Returns the texture dimensions. The level parameter is ignored.
ivec2_scalar textureSize(sampler2D sampler, int) {
  return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
}

ivec2_scalar textureSize(sampler2DRect sampler) {
  return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
}

// Bilinearly filters an RGBA8 texel for each of the 4 SIMD lanes, returning
// the result as unpacked 16-bit channel data. Input coordinates carry a 7-bit
// subpixel fraction in their low bits.
template <typename S>
static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RGBA8);
  ivec2 frac = i;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  // For each lane, load two adjacent RGBA8 pixels from both rows, widen to
  // 16 bits, and blend the rows by the Y fraction (7-bit fixed point).
  auto a0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
  auto a1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
  a0 += ((a1 - a0) * fracy.x) >> 7;

  auto b0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
  auto b1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
  b0 += ((b1 - b0) * fracy.y) >> 7;

  // Gather the left and right pixels of lanes a/b and blend the columns by
  // the per-lane X fractions.
  auto abl = combine(lowHalf(a0), lowHalf(b0));
  auto abh = combine(highHalf(a0), highHalf(b0));
  abl += ((abh - abl) * fracx.xxxxyyyy) >> 7;

  auto c0 =
CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
  auto c1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
  c0 += ((c1 - c0) * fracy.z) >> 7;

  auto d0 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
  auto d1 =
      CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
  d0 += ((d1 - d0) * fracy.w) >> 7;

  // Same column blend for lanes c/d, then recombine all four lanes.
  auto cdl = combine(lowHalf(c0), lowHalf(d0));
  auto cdh = combine(highHalf(c0), highHalf(d0));
  cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7;

  return combine(HalfRGBA8(abl), HalfRGBA8(cdl));
}

// Bilinear RGBA8 sample packed back down to 8 bits per channel.
template <typename S>
static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) {
  return pack(textureLinearUnpackedRGBA8(sampler, i));
}

// Nearest-neighbor fetch of one RGBA8 texel per SIMD lane.
template <typename S>
static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RGBA8);
  I32 row = computeRow(sampler, i, 0);
  return combine(unaligned_load<V4<uint8_t>>(&sampler->buf[row.x]),
                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.y]),
                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.z]),
                 unaligned_load<V4<uint8_t>>(&sampler->buf[row.w]));
}

// Bilinear R8 sample packed back down to 8 bits.
template <typename S>
static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) {
  return pack(textureLinearUnpackedR8(sampler, i));
}

// Bilinearly filters an RG8 texel for each of the 4 SIMD lanes, returning the
// result as unpacked 16-bit channel data.
template <typename S>
static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) {
  assert(sampler->format == TextureFormat::RG8);
  // Split off the 7-bit subpixel fraction from the integer texel coordinate.
  ivec2 frac = i & 0x7F;
  i >>= 7;

  I32 row0 = computeRow(sampler, i);
  I32 row1 = row0 + computeNextRowOffset(sampler, i);
  I16 fracx = computeFracX(sampler, i, frac);
  I16 fracy = computeFracY(frac);

  uint16_t* buf = (uint16_t*)sampler->buf;

  // Load RG bytes for two adjacent pixels - rgRG
  auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
  auto b0 =
unaligned_load<V4<uint8_t>>(&buf[row0.y]);
  auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
  // Load two pixels for next row
  auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
  auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
  auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
  // Blend rows (7-bit fixed-point blend by the per-lane Y fractions)
  ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;

  auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
  auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
  auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
  auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
  auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
  auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
  // Blend rows
  cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;

  // Transpose so left pixels and right pixels end up in separate vectors:
  // ab = a.rgRG,b.rgRG
  // cd = c.rgRG,d.rgRG
  // ... ac = a.rg,c.rg,a.RG,c.RG
  // ... bd = b.rg,d.rg,b.RG,d.RG
  auto ac = zip2Low(ab0, cd0);
  auto bd = zip2High(ab0, cd0);
  // a.rg,b.rg,c.rg,d.rg
  // a.RG,b.RG,c.RG,d.RG
  auto abcdl = zip2Low(ac, bd);
  auto abcdh = zip2High(ac, bd);
  // Blend columns
  abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7;

  return WideRG8(abcdl);
}

// Bilinear RG8 sample packed back down to 8 bits per channel.
template <typename S>
static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) {
  return pack(textureLinearUnpackedRG8(sampler, i));
}

// Saturating unsigned 16-bit add for arbitrary-width vectors. The generic
// form detects per-lane wrap-around (r < x yields an all-ones lane mask after
// the add overflows) and ORs those lanes up to the maximum value.
template <int N>
static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x,
                                                    VectorType<uint16_t, N> y) {
  auto r = x + y;
  return r | (r < x);
}

// 8-lane overload of addsat that can use the native saturating-add
// instruction on SSE2/NEON.
static inline V8<uint16_t> addsat(V8<uint16_t> x, V8<uint16_t> y) {
#if USE_SSE2
  return _mm_adds_epu16(x, y);
#elif USE_NEON
  return vqaddq_u16(x, y);
#else
  auto r = x + y;
  return r | (r < x);
#endif
}

// Horizontal pass of a separable Gaussian blur over a chunk of 4 pixels of
// type P starting at i, sampling up to radius pixels to either side clamped
// to [minX, maxX]. coeff is the center Gaussian coefficient and coeffStep the
// incremental step factor. Returns the weighted sum as unpacked 16-bit data.
template <typename P, typename S>
static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal(
    S sampler, const
ivec2_scalar& i, int minX, int maxX, int radius,
    float coeff, float coeffStep) {
  // Packed and unpacked vectors for a chunk of the given pixel type.
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;

  // Pre-scale the coefficient by 8 bits of fractional precision, so that when
  // the sample is multiplied by it, it will yield a 16 bit unsigned integer
  // that will use all 16 bits of precision to accumulate the sum.
  coeff *= 1 << 8;
  float coeffStep2 = coeffStep * coeffStep;

  int row = computeRow(sampler, i);
  P* buf = (P*)sampler->buf;
  // Start the sum with the center chunk weighted by the initial coefficient.
  auto pixelsRight = unaligned_load<V4<P>>(&buf[row]);
  auto pixelsLeft = pixelsRight;
  auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) *
             uint16_t(coeff + 0.5f);

  // Here we use some trickery to reuse the pixels within a chunk, shifted over
  // by one pixel, to get the next sample for the entire chunk. This allows us
  // to sample only one pixel for each offset across the entire chunk in both
  // the left and right directions. To avoid clamping within the loop to the
  // texture bounds, we compute the valid radius that doesn't require clamping
  // and fall back to a slower clamping loop outside of that valid radius.
  int offset = 1;
  // The left bound is how much we can offset the sample before the start of
  // the row bounds.
  int leftBound = i.x - max(minX, 0);
  // The right bound is how much we can offset the sample before the end of the
  // row bounds.
  int rightBound = min(maxX, sampler->width - 1) - i.x;
  // The right side needs 4 - 1 extra pixels of slack since the chunk's last
  // pixel reads offset + 4 - 1 ahead of the chunk start.
  int validRadius = min(radius, min(leftBound, rightBound - (4 - 1)));
  for (; offset <= validRadius; offset++) {
    // Overwrite the pixel that needs to be shifted out with the new pixel, and
    // shift it into the correct location.
pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]);
    pixelsRight = pixelsRight.yzwx;
    pixelsLeft = pixelsLeft.wxyz;
    pixelsLeft.x = unaligned_load<P>(&buf[row - offset]);

    // Accumulate the Gaussian coefficients step-wise.
    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    // Both left and right samples at this offset use the same coefficient.
    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Slower fallback for offsets beyond the valid radius: clamp each sample
  // offset to the row bounds before loading.
  for (; offset <= radius; offset++) {
    pixelsRight.x =
        unaligned_load<P>(&buf[row + min(offset + 4 - 1, rightBound)]);
    pixelsRight = pixelsRight.yzwx;
    pixelsLeft = pixelsLeft.wxyz;
    pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]);

    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Shift away the intermediate precision.
  return sum >> 8;
}

// Vertical pass of a separable Gaussian blur over a chunk of 4 pixels of type
// P at i, sampling up to radius rows above and below clamped to [minY, maxY].
// coeff is the center Gaussian coefficient and coeffStep the incremental step
// factor. Returns the weighted sum as unpacked 16-bit data.
template <typename P, typename S>
static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical(
    S sampler, const ivec2_scalar& i, int minY, int maxY, int radius,
    float coeff, float coeffStep) {
  // Packed and unpacked vectors for a chunk of the given pixel type.
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;

  // Pre-scale the coefficient by 8 bits of fractional precision, so that when
  // the sample is multiplied by it, it will yield a 16 bit unsigned integer
  // that will use all 16 bits of precision to accumulate the sum.
coeff *= 1 << 8;
  float coeffStep2 = coeffStep * coeffStep;

  int rowAbove = computeRow(sampler, i);
  int rowBelow = rowAbove;
  P* buf = (P*)sampler->buf;
  // Start the sum with the center row's chunk weighted by the initial
  // coefficient.
  auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]);
  auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) *
             uint16_t(coeff + 0.5f);

  // For the vertical loop we can't be quite as creative with reusing old values
  // as we were in the horizontal loop. We just do the obvious implementation of
  // loading a chunk from each row in turn and accumulating it into the sum. We
  // compute a valid radius within which we don't need to clamp the sampled row
  // and use that to avoid any clamping in the main inner loop. We fall back to
  // a slower clamping loop outside of that valid radius.
  int offset = 1;
  int belowBound = i.y - max(minY, 0);
  int aboveBound = min(maxY, sampler->height - 1) - i.y;
  int validRadius = min(radius, min(belowBound, aboveBound));
  for (; offset <= validRadius; offset++) {
    rowAbove += sampler->stride;
    rowBelow -= sampler->stride;
    auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
    auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);

    // Accumulate the Gaussian coefficients step-wise.
    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    // Both above and below samples at this offset use the same coefficient.
sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Slower fallback for offsets beyond the valid radius: stop advancing a row
  // once it reaches its bound so that the edge row is effectively repeated.
  for (; offset <= radius; offset++) {
    if (offset <= aboveBound) {
      rowAbove += sampler->stride;
    }
    if (offset <= belowBound) {
      rowBelow -= sampler->stride;
    }
    auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
    auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);

    coeff *= coeffStep;
    coeffStep *= coeffStep2;

    sum = addsat(sum,
                 (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
                  CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
                     uint16_t(coeff + 0.5f));
  }

  // Shift away the intermediate precision.
  return sum >> 8;
}

}  // namespace glsl