enc_xyb.cc (15864B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jxl/enc_xyb.h" 7 8 #include <jxl/memory_manager.h> 9 10 #include <algorithm> 11 #include <cstdlib> 12 13 #undef HWY_TARGET_INCLUDE 14 #define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc" 15 #include <hwy/foreach_target.h> 16 #include <hwy/highway.h> 17 18 #include "lib/jxl/base/compiler_specific.h" 19 #include "lib/jxl/base/data_parallel.h" 20 #include "lib/jxl/base/fast_math-inl.h" 21 #include "lib/jxl/base/rect.h" 22 #include "lib/jxl/base/status.h" 23 #include "lib/jxl/cms/opsin_params.h" 24 #include "lib/jxl/cms/transfer_functions-inl.h" 25 #include "lib/jxl/color_encoding_internal.h" 26 #include "lib/jxl/enc_image_bundle.h" 27 #include "lib/jxl/image_bundle.h" 28 #include "lib/jxl/image_ops.h" 29 30 HWY_BEFORE_NAMESPACE(); 31 namespace jxl { 32 namespace HWY_NAMESPACE { 33 34 // These templates are not found via ADL. 35 using hwy::HWY_NAMESPACE::Add; 36 using hwy::HWY_NAMESPACE::Mul; 37 using hwy::HWY_NAMESPACE::MulAdd; 38 using hwy::HWY_NAMESPACE::Sub; 39 using hwy::HWY_NAMESPACE::ZeroIfNegative; 40 41 // 4x3 matrix * 3x1 SIMD vectors 42 template <class V> 43 JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b, 44 const float* JXL_RESTRICT premul_absorb, 45 V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1, 46 V* JXL_RESTRICT mixed2) { 47 const float* bias = jxl::cms::kOpsinAbsorbanceBias.data(); 48 const HWY_FULL(float) d; 49 const size_t N = Lanes(d); 50 const auto m0 = Load(d, premul_absorb + 0 * N); 51 const auto m1 = Load(d, premul_absorb + 1 * N); 52 const auto m2 = Load(d, premul_absorb + 2 * N); 53 const auto m3 = Load(d, premul_absorb + 3 * N); 54 const auto m4 = Load(d, premul_absorb + 4 * N); 55 const auto m5 = Load(d, premul_absorb + 5 * N); 56 const auto m6 = Load(d, premul_absorb + 6 * N); 57 const auto m7 = Load(d, premul_absorb + 7 * N); 58 const auto m8 = Load(d, premul_absorb + 8 * N); 59 *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0])))); 60 *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1])))); 61 *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2])))); 62 } 63 64 template <class V> 65 void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx, 66 float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { 67 const HWY_FULL(float) d; 68 const V half = Set(d, 0.5f); 69 Store(Mul(half, Sub(r, g)), d, valx); 70 Store(Mul(half, Add(r, g)), d, valy); 71 Store(b, d, valz); 72 } 73 74 // Converts one RGB vector to XYB. 75 template <class V> 76 void LinearRGBToXYB(const V r, const V g, const V b, 77 const float* JXL_RESTRICT premul_absorb, 78 float* JXL_RESTRICT valx, float* JXL_RESTRICT valy, 79 float* JXL_RESTRICT valz) { 80 V mixed0; 81 V mixed1; 82 V mixed2; 83 OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2); 84 85 // mixed* should be non-negative even for wide-gamut, so clamp to zero. 86 mixed0 = ZeroIfNegative(mixed0); 87 mixed1 = ZeroIfNegative(mixed1); 88 mixed2 = ZeroIfNegative(mixed2); 89 90 const HWY_FULL(float) d; 91 const size_t N = Lanes(d); 92 mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N)); 93 mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N)); 94 mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N)); 95 StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz); 96 97 // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative. 98 } 99 100 void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, 101 float* JXL_RESTRICT row2, 102 const float* JXL_RESTRICT premul_absorb, size_t xsize) { 103 const HWY_FULL(float) d; 104 for (size_t x = 0; x < xsize; x += Lanes(d)) { 105 const auto r = Load(d, row0 + x); 106 const auto g = Load(d, row1 + x); 107 const auto b = Load(d, row2 + x); 108 LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x); 109 } 110 } 111 112 // Input/output uses the codec.h scaling: nominally 0-1 if in-gamut. 113 template <class V> 114 V LinearFromSRGB(V encoded) { 115 return TF_SRGB().DisplayFromEncoded(encoded); 116 } 117 118 Status LinearSRGBToXYB(const float* JXL_RESTRICT premul_absorb, 119 ThreadPool* pool, Image3F* JXL_RESTRICT image) { 120 const size_t xsize = image->xsize(); 121 122 const HWY_FULL(float) d; 123 const auto process_row = [&](const uint32_t task, 124 size_t /*thread*/) -> Status { 125 const size_t y = static_cast<size_t>(task); 126 float* JXL_RESTRICT row0 = image->PlaneRow(0, y); 127 float* JXL_RESTRICT row1 = image->PlaneRow(1, y); 128 float* JXL_RESTRICT row2 = image->PlaneRow(2, y); 129 130 for (size_t x = 0; x < xsize; x += Lanes(d)) { 131 const auto in_r = Load(d, row0 + x); 132 const auto in_g = Load(d, row1 + x); 133 const auto in_b = Load(d, row2 + x); 134 LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x, 135 row2 + x); 136 } 137 return true; 138 }; 139 JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), 140 ThreadPool::NoInit, process_row, 141 "LinearToXYB")); 142 return true; 143 } 144 145 Status SRGBToXYB(const float* JXL_RESTRICT premul_absorb, ThreadPool* pool, 146 Image3F* JXL_RESTRICT image) { 147 const size_t xsize = image->xsize(); 148 149 const HWY_FULL(float) d; 150 const auto process_row = [&](const uint32_t task, 151 size_t /*thread*/) -> Status { 152 const size_t y = static_cast<size_t>(task); 153 float* JXL_RESTRICT row0 = image->PlaneRow(0, y); 154 float* JXL_RESTRICT row1 = image->PlaneRow(1, y); 155 float* JXL_RESTRICT row2 = image->PlaneRow(2, y); 156 157 for (size_t x = 0; x < xsize; x += Lanes(d)) { 158 const auto in_r = LinearFromSRGB(Load(d, row0 + x)); 159 const auto in_g = LinearFromSRGB(Load(d, row1 + x)); 160 const auto in_b = LinearFromSRGB(Load(d, row2 + x)); 161 LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x, 162 row2 + x); 163 } 164 return true; 165 }; 166 JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), 167 ThreadPool::NoInit, process_row, "SRGBToXYB")); 168 return true; 169 } 170 171 Status SRGBToXYBAndLinear(const float* JXL_RESTRICT premul_absorb, 172 ThreadPool* pool, Image3F* JXL_RESTRICT image, 173 Image3F* JXL_RESTRICT linear) { 174 const size_t xsize = image->xsize(); 175 176 const HWY_FULL(float) d; 177 const auto process_row = [&](const uint32_t task, 178 size_t /*thread*/) -> Status { 179 const size_t y = static_cast<size_t>(task); 180 float* JXL_RESTRICT row_image0 = image->PlaneRow(0, y); 181 float* JXL_RESTRICT row_image1 = image->PlaneRow(1, y); 182 float* JXL_RESTRICT row_image2 = image->PlaneRow(2, y); 183 float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y); 184 float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y); 185 float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y); 186 187 for (size_t x = 0; x < xsize; x += Lanes(d)) { 188 const auto in_r = LinearFromSRGB(Load(d, row_image0 + x)); 189 const auto in_g = LinearFromSRGB(Load(d, row_image1 + x)); 190 const auto in_b = LinearFromSRGB(Load(d, row_image2 + x)); 191 192 Store(in_r, d, row_linear0 + x); 193 Store(in_g, d, row_linear1 + x); 194 Store(in_b, d, row_linear2 + x); 195 196 LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_image0 + x, 197 row_image1 + x, row_image2 + x); 198 } 199 return true; 200 }; 201 JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), 202 ThreadPool::NoInit, process_row, 203 "SRGBToXYBAndLinear")); 204 return true; 205 } 206 207 void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { 208 const HWY_FULL(float) d; 209 const size_t N = Lanes(d); 210 const float mul = intensity_target / 255.0f; 211 for (size_t j = 0; j < 3; ++j) { 212 for (size_t i = 0; i < 3; ++i) { 213 const auto absorb = Set(d, jxl::cms::kOpsinAbsorbanceMatrix[j][i] * mul); 214 Store(absorb, d, premul_absorb + (j * 3 + i) * N); 215 } 216 } 217 for (size_t i = 0; i < 3; ++i) { 218 const auto neg_bias_cbrt = 219 Set(d, -cbrtf(jxl::cms::kOpsinAbsorbanceBias[i])); 220 Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N); 221 } 222 } 223 224 // This is different from Butteraugli's OpsinDynamicsImage() in the sense that 225 // it does not contain a sensitivity multiplier based on the blurred image. 226 Status ToXYB(const ColorEncoding& c_current, float intensity_target, 227 const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image, 228 const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) { 229 if (black) JXL_ENSURE(SameSize(*image, *black)); 230 if (linear) JXL_ENSURE(SameSize(*image, *linear)); 231 232 const HWY_FULL(float) d; 233 // Pre-broadcasted constants 234 HWY_ALIGN float premul_absorb[MaxLanes(d) * 12]; 235 ComputePremulAbsorb(intensity_target, premul_absorb); 236 237 const bool want_linear = linear != nullptr; 238 239 const ColorEncoding& c_linear_srgb = 240 ColorEncoding::LinearSRGB(c_current.IsGray()); 241 // Linear sRGB inputs are rare but can be useful for the fastest encoders, for 242 // which undoing the sRGB transfer function would be a large part of the cost. 243 if (c_linear_srgb.SameColorEncoding(c_current)) { 244 // This only happens if kitten or slower, moving ImageBundle might be 245 // possible but the encoder is much slower than this copy. 246 if (want_linear) { 247 JXL_RETURN_IF_ERROR(CopyImageTo(*image, linear)); 248 } 249 JXL_RETURN_IF_ERROR(LinearSRGBToXYB(premul_absorb, pool, image)); 250 return true; 251 } 252 253 // Common case: already sRGB, can avoid the color transform 254 if (c_current.IsSRGB()) { 255 // Common case: can avoid allocating/copying 256 if (want_linear) { 257 // Slow encoder also wants linear sRGB. 258 JXL_RETURN_IF_ERROR( 259 SRGBToXYBAndLinear(premul_absorb, pool, image, linear)); 260 } else { 261 JXL_RETURN_IF_ERROR(SRGBToXYB(premul_absorb, pool, image)); 262 } 263 return true; 264 } 265 266 JXL_RETURN_IF_ERROR(ApplyColorTransform( 267 c_current, intensity_target, *image, black, Rect(*image), c_linear_srgb, 268 cms, pool, want_linear ? linear : image)); 269 if (want_linear) { 270 JXL_RETURN_IF_ERROR(CopyImageTo(*linear, image)); 271 } 272 JXL_RETURN_IF_ERROR(LinearSRGBToXYB(premul_absorb, pool, image)); 273 return true; 274 } 275 276 // Transform RGB to YCbCr. 277 // Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B). 278 Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, 279 const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, 280 ImageF* cr_plane, ThreadPool* pool) { 281 const HWY_FULL(float) df; 282 const size_t S = Lanes(df); // Step. 283 284 const size_t xsize = r_plane.xsize(); 285 const size_t ysize = r_plane.ysize(); 286 if ((xsize == 0) || (ysize == 0)) return true; 287 288 // Full-range BT.601 as defined by JFIF Clause 7: 289 // https://www.itu.int/rec/T-REC-T.871-201105-I/en 290 const auto k128 = Set(df, 128.0f / 255); 291 const auto kR = Set(df, 0.299f); // NTSC luma 292 const auto kG = Set(df, 0.587f); 293 const auto kB = Set(df, 0.114f); 294 const auto kAmpR = Set(df, 0.701f); 295 const auto kAmpB = Set(df, 0.886f); 296 const auto kDiffR = Add(kAmpR, kR); 297 const auto kDiffB = Add(kAmpB, kB); 298 const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB)))); 299 const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB)))); 300 301 constexpr size_t kGroupArea = kGroupDim * kGroupDim; 302 const size_t lines_per_group = DivCeil(kGroupArea, xsize); 303 const size_t num_stripes = DivCeil(ysize, lines_per_group); 304 const auto transform = [&](int idx, int /* thread*/) -> Status { 305 const size_t y0 = idx * lines_per_group; 306 const size_t y1 = std::min<size_t>(y0 + lines_per_group, ysize); 307 for (size_t y = y0; y < y1; ++y) { 308 const float* r_row = r_plane.ConstRow(y); 309 const float* g_row = g_plane.ConstRow(y); 310 const float* b_row = b_plane.ConstRow(y); 311 float* y_row = y_plane->Row(y); 312 float* cb_row = cb_plane->Row(y); 313 float* cr_row = cr_plane->Row(y); 314 for (size_t x = 0; x < xsize; x += S) { 315 const auto r = Load(df, r_row + x); 316 const auto g = Load(df, g_row + x); 317 const auto b = Load(df, b_row + x); 318 const auto r_base = Mul(r, kR); 319 const auto r_diff = Mul(r, kDiffR); 320 const auto g_base = Mul(g, kG); 321 const auto b_base = Mul(b, kB); 322 const auto b_diff = Mul(b, kDiffB); 323 const auto y_base = Add(r_base, Add(g_base, b_base)); 324 const auto y_vec = Sub(y_base, k128); 325 const auto cb_vec = Mul(Sub(b_diff, y_base), kNormB); 326 const auto cr_vec = Mul(Sub(r_diff, y_base), kNormR); 327 Store(y_vec, df, y_row + x); 328 Store(cb_vec, df, cb_row + x); 329 Store(cr_vec, df, cr_row + x); 330 } 331 } 332 return true; 333 }; 334 JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<int>(num_stripes), 335 ThreadPool::NoInit, transform, "RgbToYcbCr")); 336 return true; 337 } 338 339 // NOLINTNEXTLINE(google-readability-namespace-comments) 340 } // namespace HWY_NAMESPACE 341 } // namespace jxl 342 HWY_AFTER_NAMESPACE(); 343 344 #if HWY_ONCE 345 namespace jxl { 346 HWY_EXPORT(ToXYB); 347 Status ToXYB(const ColorEncoding& c_current, float intensity_target, 348 const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image, 349 const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) { 350 return HWY_DYNAMIC_DISPATCH(ToXYB)(c_current, intensity_target, black, pool, 351 image, cms, linear); 352 } 353 354 Status ToXYB(const ImageBundle& in, ThreadPool* pool, Image3F* JXL_RESTRICT xyb, 355 const JxlCmsInterface& cms, Image3F* JXL_RESTRICT linear) { 356 JxlMemoryManager* memory_manager = in.memory_manager(); 357 JXL_ASSIGN_OR_RETURN(*xyb, 358 Image3F::Create(memory_manager, in.xsize(), in.ysize())); 359 JXL_RETURN_IF_ERROR(CopyImageTo(in.color(), xyb)); 360 JXL_RETURN_IF_ERROR(ToXYB(in.c_current(), in.metadata()->IntensityTarget(), 361 in.black(), pool, xyb, cms, linear)); 362 return true; 363 } 364 365 HWY_EXPORT(LinearRGBRowToXYB); 366 void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, 367 float* JXL_RESTRICT row2, 368 const float* JXL_RESTRICT premul_absorb, size_t xsize) { 369 HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB) 370 (row0, row1, row2, premul_absorb, xsize); 371 } 372 373 HWY_EXPORT(ComputePremulAbsorb); 374 void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { 375 HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb); 376 } 377 378 void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, 379 float* JXL_RESTRICT row2, size_t xsize) { 380 for (size_t x = 0; x < xsize; x++) { 381 row2[x] = (row2[x] - row1[x] + jxl::cms::kScaledXYBOffset[2]) * 382 jxl::cms::kScaledXYBScale[2]; 383 row0[x] = (row0[x] + jxl::cms::kScaledXYBOffset[0]) * 384 jxl::cms::kScaledXYBScale[0]; 385 row1[x] = (row1[x] + jxl::cms::kScaledXYBOffset[1]) * 386 jxl::cms::kScaledXYBScale[1]; 387 } 388 } 389 390 void ScaleXYB(Image3F* opsin) { 391 for (size_t y = 0; y < opsin->ysize(); y++) { 392 float* row0 = opsin->PlaneRow(0, y); 393 float* row1 = opsin->PlaneRow(1, y); 394 float* row2 = opsin->PlaneRow(2, y); 395 ScaleXYBRow(row0, row1, row2, opsin->xsize()); 396 } 397 } 398 399 HWY_EXPORT(RgbToYcbcr); 400 Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, 401 const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, 402 ImageF* cr_plane, ThreadPool* pool) { 403 return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane, 404 cb_plane, cr_plane, pool); 405 } 406 407 } // namespace jxl 408 #endif // HWY_ONCE