tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

butteraugli.cc (81907B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 //
      6 // Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
      7 //
      8 // The physical architecture of butteraugli is based on the following naming
      9 // convention:
     10 //   * Opsin - dynamics of the photosensitive chemicals in the retina
     11 //             with their immediate electrical processing
     12 //   * Xyb - hybrid opponent/trichromatic color space
     13 //     x is roughly red-subtract-green.
     14 //     y is yellow.
     15 //     b is blue.
     16 //     Xyb values are computed from Opsin mixing, not directly from rgb.
     17 //   * Mask - for visual masking
     18 //   * Hf - color modeling for spatially high-frequency features
     19 //   * Lf - color modeling for spatially low-frequency features
     20 //   * Diffmap - to cluster and build an image of error between the images
     21 //   * Blur - to hold the smoothing code
     22 
     23 #include "lib/jxl/butteraugli/butteraugli.h"
     24 
     25 #include <jxl/memory_manager.h>
     26 
     27 #include <algorithm>
     28 #include <cmath>
     29 #include <cstdint>
     30 #include <cstdio>
     31 #include <cstdlib>
     32 #include <cstring>
     33 #include <memory>
     34 #include <vector>
     35 
     36 #include "lib/jxl/image.h"
     37 
     38 #undef HWY_TARGET_INCLUDE
     39 #define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
     40 #include <hwy/foreach_target.h>
     41 
     42 #include "lib/jxl/base/fast_math-inl.h"
     43 #include "lib/jxl/base/rect.h"
     44 #include "lib/jxl/base/status.h"
     45 #include "lib/jxl/convolve.h"
     46 #include "lib/jxl/image_ops.h"
     47 
     48 #if BUTTERAUGLI_ENABLE_CHECKS
     49 #include "lib/jxl/base/printf_macros.h"
     50 #endif
     51 
     52 #ifndef JXL_BUTTERAUGLI_ONCE
     53 #define JXL_BUTTERAUGLI_ONCE
     54 
     55 namespace jxl {
     56 
     57 static const double wMfMalta = 37.0819870399;
     58 static const double norm1Mf = 130262059.556;
     59 static const double wMfMaltaX = 8246.75321353;
     60 static const double norm1MfX = 1009002.70582;
     61 static const double wHfMalta = 18.7237414387;
     62 static const double norm1Hf = 4498534.45232;
     63 static const double wHfMaltaX = 6923.99476109;
     64 static const double norm1HfX = 8051.15833247;
     65 static const double wUhfMalta = 1.10039032555;
     66 static const double norm1Uhf = 71.7800275169;
     67 static const double wUhfMaltaX = 173.5;
     68 static const double norm1UhfX = 5.0;
     69 static const double wmul[9] = {
     70    400.0,         1.50815703118,  0,
     71    2150.0,        10.6195433239,  16.2176043152,
     72    29.2353797994, 0.844626970982, 0.703646627719,
     73 };
     74 
     75 std::vector<float> ComputeKernel(float sigma) {
     76  const float m = 2.25;  // Accuracy increases when m is increased.
     77  const double scaler = -1.0 / (2.0 * sigma * sigma);
     78  const int diff = std::max<int>(1, m * std::fabs(sigma));
     79  std::vector<float> kernel(2 * diff + 1);
     80  for (int i = -diff; i <= diff; ++i) {
     81    kernel[i + diff] = std::exp(scaler * i * i);
     82  }
     83  return kernel;
     84 }
     85 
     86 void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
     87                          const size_t x, float* BUTTERAUGLI_RESTRICT row_out) {
     88  const size_t offset = kernel.size() / 2;
     89  int minx = x < offset ? 0 : x - offset;
     90  int maxx = std::min<int>(in.xsize() - 1, x + offset);
     91  float weight = 0.0f;
     92  for (int j = minx; j <= maxx; ++j) {
     93    weight += kernel[j - x + offset];
     94  }
     95  float scale = 1.0f / weight;
     96  for (size_t y = 0; y < in.ysize(); ++y) {
     97    const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
     98    float sum = 0.0f;
     99    for (int j = minx; j <= maxx; ++j) {
    100      sum += row_in[j] * kernel[j - x + offset];
    101    }
    102    row_out[y] = sum * scale;
    103  }
    104 }
    105 
    106 // Computes a horizontal convolution and transposes the result.
    107 Status ConvolutionWithTranspose(const ImageF& in,
    108                                const std::vector<float>& kernel,
    109                                ImageF* BUTTERAUGLI_RESTRICT out) {
    110  JXL_ENSURE(out->xsize() == in.ysize());
    111  JXL_ENSURE(out->ysize() == in.xsize());
    112  const size_t len = kernel.size();
    113  const size_t offset = len / 2;
    114  float weight_no_border = 0.0f;
    115  for (size_t j = 0; j < len; ++j) {
    116    weight_no_border += kernel[j];
    117  }
    118  const float scale_no_border = 1.0f / weight_no_border;
    119  const size_t border1 = std::min(in.xsize(), offset);
    120  const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0;
    121  std::vector<float> scaled_kernel(len / 2 + 1);
    122  for (size_t i = 0; i <= len / 2; ++i) {
    123    scaled_kernel[i] = kernel[i] * scale_no_border;
    124  }
    125 
    126  // middle
    127  switch (len) {
    128    case 7: {
    129      const float sk0 = scaled_kernel[0];
    130      const float sk1 = scaled_kernel[1];
    131      const float sk2 = scaled_kernel[2];
    132      const float sk3 = scaled_kernel[3];
    133      for (size_t y = 0; y < in.ysize(); ++y) {
    134        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    135        for (size_t x = border1; x < border2; ++x, ++row_in) {
    136          const float sum0 = (row_in[0] + row_in[6]) * sk0;
    137          const float sum1 = (row_in[1] + row_in[5]) * sk1;
    138          const float sum2 = (row_in[2] + row_in[4]) * sk2;
    139          const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2;
    140          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    141          row_out[y] = sum;
    142        }
    143      }
    144    } break;
    145    case 13: {
    146      for (size_t y = 0; y < in.ysize(); ++y) {
    147        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    148        for (size_t x = border1; x < border2; ++x, ++row_in) {
    149          float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0];
    150          float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1];
    151          float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2];
    152          float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3];
    153          sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4];
    154          sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5];
    155          const float sum = (row_in[6]) * scaled_kernel[6];
    156          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    157          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
    158        }
    159      }
    160      break;
    161    }
    162    case 15: {
    163      for (size_t y = 0; y < in.ysize(); ++y) {
    164        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    165        for (size_t x = border1; x < border2; ++x, ++row_in) {
    166          float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0];
    167          float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1];
    168          float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2];
    169          float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3];
    170          sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4];
    171          sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5];
    172          sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6];
    173          const float sum = (row_in[7]) * scaled_kernel[7];
    174          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    175          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
    176        }
    177      }
    178      break;
    179    }
    180    case 33: {
    181      for (size_t y = 0; y < in.ysize(); ++y) {
    182        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
    183        for (size_t x = border1; x < border2; ++x, ++row_in) {
    184          float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0];
    185          float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1];
    186          float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2];
    187          float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3];
    188          sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4];
    189          sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5];
    190          sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6];
    191          sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7];
    192          sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8];
    193          sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9];
    194          sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10];
    195          sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11];
    196          sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12];
    197          sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13];
    198          sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14];
    199          sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15];
    200          const float sum = (row_in[16]) * scaled_kernel[16];
    201          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
    202          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
    203        }
    204      }
    205      break;
    206    }
    207    default:
    208      return JXL_UNREACHABLE("kernel size %d not implemented",
    209                             static_cast<int>(len));
    210  }
    211  // left border
    212  for (size_t x = 0; x < border1; ++x) {
    213    ConvolveBorderColumn(in, kernel, x, out->Row(x));
    214  }
    215 
    216  // right border
    217  for (size_t x = border2; x < in.xsize(); ++x) {
    218    ConvolveBorderColumn(in, kernel, x, out->Row(x));
    219  }
    220  return true;
    221 }
    222 
    223 // A blur somewhat similar to a 2D Gaussian blur.
    224 // See: https://en.wikipedia.org/wiki/Gaussian_blur
    225 //
    226 // This is a bottleneck because the sigma can be quite large (>7). We can use
    227 // gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated
    228 // Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are
    229 // zero-valued. This leads to noticeable differences at the edges of diffmaps.
    230 // We retain a special case for 5x5 kernels (even faster than gauss_blur),
    231 // optionally use gauss_blur followed by fixup of the borders for large images,
    232 // or fall back to the previous truncated FIR followed by a transpose.
    233 Status Blur(const ImageF& in, float sigma, const ButteraugliParams& params,
    234            BlurTemp* temp, ImageF* out) {
    235  std::vector<float> kernel = ComputeKernel(sigma);
    236  // Separable5 does an in-place convolution, so this fast path is not safe if
    237  // in aliases out.
    238  if (kernel.size() == 5 && &in != out) {
    239    float sum_weights = 0.0f;
    240    for (const float w : kernel) {
    241      sum_weights += w;
    242    }
    243    const float scale = 1.0f / sum_weights;
    244    const float w0 = kernel[2] * scale;
    245    const float w1 = kernel[1] * scale;
    246    const float w2 = kernel[0] * scale;
    247    const WeightsSeparable5 weights = {
    248        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
    249        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
    250    };
    251    JXL_RETURN_IF_ERROR(
    252        Separable5(in, Rect(in), weights, /*pool=*/nullptr, out));
    253    return true;
    254  }
    255 
    256  ImageF* temp_t;
    257  JXL_RETURN_IF_ERROR(temp->GetTransposed(in, &temp_t));
    258  JXL_RETURN_IF_ERROR(ConvolutionWithTranspose(in, kernel, temp_t));
    259  JXL_RETURN_IF_ERROR(ConvolutionWithTranspose(*temp_t, kernel, out));
    260  return true;
    261 }
    262 
    263 // Allows PaddedMaltaUnit to call either function via overloading.
    264 struct MaltaTagLF {};
    265 struct MaltaTag {};
    266 
    267 }  // namespace jxl
    268 
    269 #endif  // JXL_BUTTERAUGLI_ONCE
    270 
    271 #include <hwy/highway.h>
    272 HWY_BEFORE_NAMESPACE();
    273 namespace jxl {
    274 namespace HWY_NAMESPACE {
    275 
    276 // These templates are not found via ADL.
    277 using hwy::HWY_NAMESPACE::Abs;
    278 using hwy::HWY_NAMESPACE::Div;
    279 using hwy::HWY_NAMESPACE::Gt;
    280 using hwy::HWY_NAMESPACE::IfThenElse;
    281 using hwy::HWY_NAMESPACE::IfThenElseZero;
    282 using hwy::HWY_NAMESPACE::Lt;
    283 using hwy::HWY_NAMESPACE::Max;
    284 using hwy::HWY_NAMESPACE::Mul;
    285 using hwy::HWY_NAMESPACE::MulAdd;
    286 using hwy::HWY_NAMESPACE::MulSub;
    287 using hwy::HWY_NAMESPACE::Neg;
    288 using hwy::HWY_NAMESPACE::Sub;
    289 using hwy::HWY_NAMESPACE::Vec;
    290 using hwy::HWY_NAMESPACE::ZeroIfNegative;
    291 
    292 template <class D, class V>
    293 HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) {
    294  static const double kMul = 0.724216145665;
    295  const V mul = Set(d, kMul);
    296  const V maxval = Set(d, kMaxVal);
    297  // If greater than maxval or less than -maxval, replace with if_*.
    298  const V if_pos = MulAdd(Sub(v, maxval), mul, maxval);
    299  const V if_neg = MulSub(Add(v, maxval), mul, maxval);
    300  const V pos_or_v = IfThenElse(Ge(v, maxval), if_pos, v);
    301  return IfThenElse(Lt(v, Neg(maxval)), if_neg, pos_or_v);
    302 }
    303 
    304 // Make area around zero less important (remove it).
    305 template <class D, class V>
    306 HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) {
    307  const auto w = Set(d, kw);
    308  return IfThenElse(Gt(x, w), Sub(x, w),
    309                    IfThenElseZero(Lt(x, Neg(w)), Add(x, w)));
    310 }
    311 
    312 // Make area around zero more important (2x it until the limit).
    313 template <class D, class V>
    314 HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) {
    315  const auto w = Set(d, kw);
    316  return IfThenElse(Gt(x, w), Add(x, w),
    317                    IfThenElse(Lt(x, Neg(w)), Sub(x, w), Add(x, x)));
    318 }
    319 
    320 // XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space.
    321 // Vals space can be converted to L2-norm space (Euclidean and normalized)
    322 // through visual masking.
    323 template <class D, class V>
    324 HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
    325                                 const V& b_arg, V* HWY_RESTRICT valx,
    326                                 V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
    327  static const double xmul_scalar = 33.832837186260;
    328  static const double ymul_scalar = 14.458268100570;
    329  static const double bmul_scalar = 49.87984651440;
    330  static const double y_to_b_mul_scalar = -0.362267051518;
    331  const V xmul = Set(d, xmul_scalar);
    332  const V ymul = Set(d, ymul_scalar);
    333  const V bmul = Set(d, bmul_scalar);
    334  const V y_to_b_mul = Set(d, y_to_b_mul_scalar);
    335  const V b = MulAdd(y_to_b_mul, y, b_arg);
    336  *valb = Mul(b, bmul);
    337  *valx = Mul(x, xmul);
    338  *valy = Mul(y, ymul);
    339 }
    340 
    341 void XybLowFreqToVals(Image3F* xyb_lf) {
    342  // Modify range around zero code only concerns the high frequency
    343  // planes and only the X and Y channels.
    344  // Convert low freq xyb to vals space so that we can do a simple squared sum
    345  // diff on the low frequencies later.
    346  const HWY_FULL(float) d;
    347  for (size_t y = 0; y < xyb_lf->ysize(); ++y) {
    348    float* BUTTERAUGLI_RESTRICT row_x = xyb_lf->PlaneRow(0, y);
    349    float* BUTTERAUGLI_RESTRICT row_y = xyb_lf->PlaneRow(1, y);
    350    float* BUTTERAUGLI_RESTRICT row_b = xyb_lf->PlaneRow(2, y);
    351    for (size_t x = 0; x < xyb_lf->xsize(); x += Lanes(d)) {
    352      auto valx = Undefined(d);
    353      auto valy = Undefined(d);
    354      auto valb = Undefined(d);
    355      XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
    356                       Load(d, row_b + x), &valx, &valy, &valb);
    357      Store(valx, d, row_x + x);
    358      Store(valy, d, row_y + x);
    359      Store(valb, d, row_b + x);
    360    }
    361  }
    362 }
    363 
    364 Status SuppressXByY(const ImageF& in_y, ImageF* HWY_RESTRICT inout_x) {
    365  JXL_ENSURE(SameSize(*inout_x, in_y));
    366  const size_t xsize = in_y.xsize();
    367  const size_t ysize = in_y.ysize();
    368  const HWY_FULL(float) d;
    369  static const double suppress = 46.0;
    370  static const double s = 0.653020556257;
    371  const auto sv = Set(d, s);
    372  const auto one_minus_s = Set(d, 1.0 - s);
    373  const auto ywv = Set(d, suppress);
    374 
    375  for (size_t y = 0; y < ysize; ++y) {
    376    const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
    377    float* HWY_RESTRICT row_x = inout_x->Row(y);
    378    for (size_t x = 0; x < xsize; x += Lanes(d)) {
    379      const auto vx = Load(d, row_x + x);
    380      const auto vy = Load(d, row_y + x);
    381      const auto scaler =
    382          MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv);
    383      Store(Mul(scaler, vx), d, row_x + x);
    384    }
    385  }
    386  return true;
    387 }
    388 
    389 void Subtract(const ImageF& a, const ImageF& b, ImageF* c) {
    390  const HWY_FULL(float) d;
    391  for (size_t y = 0; y < a.ysize(); ++y) {
    392    const float* row_a = a.ConstRow(y);
    393    const float* row_b = b.ConstRow(y);
    394    float* row_c = c->Row(y);
    395    for (size_t x = 0; x < a.xsize(); x += Lanes(d)) {
    396      Store(Sub(Load(d, row_a + x), Load(d, row_b + x)), d, row_c + x);
    397    }
    398  }
    399 }
    400 
    401 Status SeparateLFAndMF(const ButteraugliParams& params, const Image3F& xyb,
    402                       Image3F* lf, Image3F* mf, BlurTemp* blur_temp) {
    403  static const double kSigmaLf = 7.15593339443;
    404  for (int i = 0; i < 3; ++i) {
    405    // Extract lf ...
    406    JXL_RETURN_IF_ERROR(
    407        Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &lf->Plane(i)));
    408    // ... and keep everything else in mf.
    409    Subtract(xyb.Plane(i), lf->Plane(i), &mf->Plane(i));
    410  }
    411  XybLowFreqToVals(lf);
    412  return true;
    413 }
    414 
    415 Status SeparateMFAndHF(const ButteraugliParams& params, Image3F* mf, ImageF* hf,
    416                       BlurTemp* blur_temp) {
    417  const HWY_FULL(float) d;
    418  static const double kSigmaHf = 3.22489901262;
    419  const size_t xsize = mf->xsize();
    420  const size_t ysize = mf->ysize();
    421  JxlMemoryManager* memory_manager = mf[0].memory_manager();
    422  JXL_ASSIGN_OR_RETURN(hf[0], ImageF::Create(memory_manager, xsize, ysize));
    423  JXL_ASSIGN_OR_RETURN(hf[1], ImageF::Create(memory_manager, xsize, ysize));
    424  for (int i = 0; i < 3; ++i) {
    425    if (i == 2) {
    426      JXL_RETURN_IF_ERROR(
    427          Blur(mf->Plane(i), kSigmaHf, params, blur_temp, &mf->Plane(i)));
    428      break;
    429    }
    430    for (size_t y = 0; y < ysize; ++y) {
    431      float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(i, y);
    432      float* BUTTERAUGLI_RESTRICT row_hf = hf[i].Row(y);
    433      for (size_t x = 0; x < xsize; x += Lanes(d)) {
    434        Store(Load(d, row_mf + x), d, row_hf + x);
    435      }
    436    }
    437    JXL_RETURN_IF_ERROR(
    438        Blur(mf->Plane(i), kSigmaHf, params, blur_temp, &mf->Plane(i)));
    439    static const double kRemoveMfRange = 0.29;
    440    static const double kAddMfRange = 0.1;
    441    if (i == 0) {
    442      for (size_t y = 0; y < ysize; ++y) {
    443        float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(0, y);
    444        float* BUTTERAUGLI_RESTRICT row_hf = hf[0].Row(y);
    445        for (size_t x = 0; x < xsize; x += Lanes(d)) {
    446          auto mf = Load(d, row_mf + x);
    447          auto hf = Sub(Load(d, row_hf + x), mf);
    448          mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf);
    449          Store(mf, d, row_mf + x);
    450          Store(hf, d, row_hf + x);
    451        }
    452      }
    453    } else {
    454      for (size_t y = 0; y < ysize; ++y) {
    455        float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(1, y);
    456        float* BUTTERAUGLI_RESTRICT row_hf = hf[1].Row(y);
    457        for (size_t x = 0; x < xsize; x += Lanes(d)) {
    458          auto mf = Load(d, row_mf + x);
    459          auto hf = Sub(Load(d, row_hf + x), mf);
    460 
    461          mf = AmplifyRangeAroundZero(d, kAddMfRange, mf);
    462          Store(mf, d, row_mf + x);
    463          Store(hf, d, row_hf + x);
    464        }
    465      }
    466    }
    467  }
    468  // Suppress red-green by intensity change in the high freq channels.
    469  JXL_RETURN_IF_ERROR(SuppressXByY(hf[1], &hf[0]));
    470  return true;
    471 }
    472 
    473 Status SeparateHFAndUHF(const ButteraugliParams& params, ImageF* hf,
    474                        ImageF* uhf, BlurTemp* blur_temp) {
    475  const HWY_FULL(float) d;
    476  const size_t xsize = hf[0].xsize();
    477  const size_t ysize = hf[0].ysize();
    478  JxlMemoryManager* memory_manager = hf[0].memory_manager();
    479  static const double kSigmaUhf = 1.56416327805;
    480  JXL_ASSIGN_OR_RETURN(uhf[0], ImageF::Create(memory_manager, xsize, ysize));
    481  JXL_ASSIGN_OR_RETURN(uhf[1], ImageF::Create(memory_manager, xsize, ysize));
    482  for (int i = 0; i < 2; ++i) {
    483    // Divide hf into hf and uhf.
    484    for (size_t y = 0; y < ysize; ++y) {
    485      float* BUTTERAUGLI_RESTRICT row_uhf = uhf[i].Row(y);
    486      float* BUTTERAUGLI_RESTRICT row_hf = hf[i].Row(y);
    487      for (size_t x = 0; x < xsize; ++x) {
    488        row_uhf[x] = row_hf[x];
    489      }
    490    }
    491    JXL_RETURN_IF_ERROR(Blur(hf[i], kSigmaUhf, params, blur_temp, &hf[i]));
    492    static const double kRemoveHfRange = 1.5;
    493    static const double kAddHfRange = 0.132;
    494    static const double kRemoveUhfRange = 0.04;
    495    static const double kMaxclampHf = 28.4691806922;
    496    static const double kMaxclampUhf = 5.19175294647;
    497    static double kMulYHf = 2.155;
    498    static double kMulYUhf = 2.69313763794;
    499    if (i == 0) {
    500      for (size_t y = 0; y < ysize; ++y) {
    501        float* BUTTERAUGLI_RESTRICT row_uhf = uhf[0].Row(y);
    502        float* BUTTERAUGLI_RESTRICT row_hf = hf[0].Row(y);
    503        for (size_t x = 0; x < xsize; x += Lanes(d)) {
    504          auto hf = Load(d, row_hf + x);
    505          auto uhf = Sub(Load(d, row_uhf + x), hf);
    506          hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf);
    507          uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf);
    508          Store(hf, d, row_hf + x);
    509          Store(uhf, d, row_uhf + x);
    510        }
    511      }
    512    } else {
    513      for (size_t y = 0; y < ysize; ++y) {
    514        float* BUTTERAUGLI_RESTRICT row_uhf = uhf[1].Row(y);
    515        float* BUTTERAUGLI_RESTRICT row_hf = hf[1].Row(y);
    516        for (size_t x = 0; x < xsize; x += Lanes(d)) {
    517          auto hf = Load(d, row_hf + x);
    518          hf = MaximumClamp(d, hf, kMaxclampHf);
    519 
    520          auto uhf = Sub(Load(d, row_uhf + x), hf);
    521          uhf = MaximumClamp(d, uhf, kMaxclampUhf);
    522          uhf = Mul(uhf, Set(d, kMulYUhf));
    523          Store(uhf, d, row_uhf + x);
    524 
    525          hf = Mul(hf, Set(d, kMulYHf));
    526          hf = AmplifyRangeAroundZero(d, kAddHfRange, hf);
    527          Store(hf, d, row_hf + x);
    528        }
    529      }
    530    }
    531  }
    532  return true;
    533 }
    534 
    535 void DeallocateHFAndUHF(ImageF* hf, ImageF* uhf) {
    536  for (int i = 0; i < 2; ++i) {
    537    hf[i] = ImageF();
    538    uhf[i] = ImageF();
    539  }
    540 }
    541 
    542 Status SeparateFrequencies(size_t xsize, size_t ysize,
    543                           const ButteraugliParams& params, BlurTemp* blur_temp,
    544                           const Image3F& xyb, PsychoImage& ps) {
    545  JxlMemoryManager* memory_manager = xyb.memory_manager();
    546  JXL_ASSIGN_OR_RETURN(
    547      ps.lf, Image3F::Create(memory_manager, xyb.xsize(), xyb.ysize()));
    548  JXL_ASSIGN_OR_RETURN(
    549      ps.mf, Image3F::Create(memory_manager, xyb.xsize(), xyb.ysize()));
    550  JXL_RETURN_IF_ERROR(SeparateLFAndMF(params, xyb, &ps.lf, &ps.mf, blur_temp));
    551  JXL_RETURN_IF_ERROR(SeparateMFAndHF(params, &ps.mf, &ps.hf[0], blur_temp));
    552  JXL_RETURN_IF_ERROR(
    553      SeparateHFAndUHF(params, &ps.hf[0], &ps.uhf[0], blur_temp));
    554  return true;
    555 }
    556 
    557 namespace {
    558 template <typename V>
    559 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) {
    560  return Add(Add(a, b), Add(c, d));
    561 }
    562 template <typename V>
    563 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e) {
    564  return Sum(a, b, c, Add(d, e));
    565 }
    566 template <typename V>
    567 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g) {
    568  return Sum(a, b, c, Sum(d, e, f, g));
    569 }
    570 template <typename V>
    571 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g, V h, V i) {
    572  return Add(Add(Sum(a, b, c, d), Sum(e, f, g, h)), i);
    573 }
    574 }  // namespace
    575 
    576 template <class D>
    577 Vec<D> MaltaUnit(MaltaTagLF /*tag*/, const D df,
    578                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
    579  const intptr_t xs3 = 3 * xs;
    580 
    581  const auto center = LoadU(df, d);
    582 
    583  // x grows, y constant
    584  const auto sum_yconst = Sum(LoadU(df, d - 4), LoadU(df, d - 2), center,
    585                              LoadU(df, d + 2), LoadU(df, d + 4));
    586  // Will return this, sum of all line kernels
    587  auto retval = Mul(sum_yconst, sum_yconst);
    588  {
    589    // y grows, x constant
    590    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs - xs), center,
    591                   LoadU(df, d + xs + xs), LoadU(df, d + xs3 + xs));
    592    retval = MulAdd(sum, sum, retval);
    593  }
    594  {
    595    // both grow
    596    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), center,
    597                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
    598    retval = MulAdd(sum, sum, retval);
    599  }
    600  {
    601    // y grows, x shrinks
    602    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), center,
    603                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
    604    retval = MulAdd(sum, sum, retval);
    605  }
    606  {
    607    // y grows -4 to 4, x shrinks 1 -> -1
    608    auto sum =
    609        Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs - xs + 1), center,
    610            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 1));
    611    retval = MulAdd(sum, sum, retval);
    612  }
    613  {
    614    //  y grows -4 to 4, x grows -1 -> 1
    615    auto sum =
    616        Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs - xs - 1), center,
    617            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 1));
    618    retval = MulAdd(sum, sum, retval);
    619  }
    620  {
    621    // x grows -4 to 4, y grows -1 to 1
    622    auto sum = Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 2 - xs), center,
    623                   LoadU(df, d + 2 + xs), LoadU(df, d + 4 + xs));
    624    retval = MulAdd(sum, sum, retval);
    625  }
    626  {
    627    // x grows -4 to 4, y shrinks 1 to -1
    628    auto sum = Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 2 + xs), center,
    629                   LoadU(df, d + 2 - xs), LoadU(df, d + 4 - xs));
    630    retval = MulAdd(sum, sum, retval);
    631  }
    632  {
    633    /* 0_________
    634       1__*______
    635       2___*_____
    636       3_________
    637       4____0____
    638       5_________
    639       6_____*___
    640       7______*__
    641       8_________ */
    642    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), center,
    643                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
    644    retval = MulAdd(sum, sum, retval);
    645  }
    646  {
    647    /* 0_________
    648       1______*__
    649       2_____*___
    650       3_________
    651       4____0____
    652       5_________
    653       6___*_____
    654       7__*______
    655       8_________ */
    656    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), center,
    657                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
    658    retval = MulAdd(sum, sum, retval);
    659  }
    660  {
    661    /* 0_________
    662       1_________
    663       2_*_______
    664       3__*______
    665       4____0____
    666       5______*__
    667       6_______*_
    668       7_________
    669       8_________ */
    670    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), center,
    671                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
    672    retval = MulAdd(sum, sum, retval);
    673  }
    674  {
    675    /* 0_________
    676       1_________
    677       2_______*_
    678       3______*__
    679       4____0____
    680       5__*______
    681       6_*_______
    682       7_________
    683       8_________ */
    684    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), center,
    685                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
    686    retval = MulAdd(sum, sum, retval);
    687  }
    688  {
    689    /* 0_________
    690       1_________
    691       2________*
    692       3______*__
    693       4____0____
    694       5__*______
    695       6*________
    696       7_________
    697       8_________ */
    698 
    699    auto sum = Sum(LoadU(df, d + xs + xs - 4), LoadU(df, d + xs - 2), center,
    700                   LoadU(df, d - xs + 2), LoadU(df, d - xs - xs + 4));
    701    retval = MulAdd(sum, sum, retval);
    702  }
    703  {
    704    /* 0_________
    705       1_________
    706       2*________
    707       3__*______
    708       4____0____
    709       5______*__
    710       6________*
    711       7_________
    712       8_________ */
    713    auto sum = Sum(LoadU(df, d - xs - xs - 4), LoadU(df, d - xs - 2), center,
    714                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 4));
    715    retval = MulAdd(sum, sum, retval);
    716  }
    717  {
    718    /* 0__*______
    719       1_________
    720       2___*_____
    721       3_________
    722       4____0____
    723       5_________
    724       6_____*___
    725       7_________
    726       8______*__ */
    727    auto sum =
    728        Sum(LoadU(df, d - xs3 - xs - 2), LoadU(df, d - xs - xs - 1), center,
    729            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 2));
    730    retval = MulAdd(sum, sum, retval);
    731  }
    732  {
    733    /* 0______*__
    734       1_________
    735       2_____*___
    736       3_________
    737       4____0____
    738       5_________
    739       6___*_____
    740       7_________
    741       8__*______ */
    742    auto sum =
    743        Sum(LoadU(df, d - xs3 - xs + 2), LoadU(df, d - xs - xs + 1), center,
    744            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 2));
    745    retval = MulAdd(sum, sum, retval);
    746  }
    747  return retval;
    748 }
    749 
    750 template <class D>
    751 Vec<D> MaltaUnit(MaltaTag /*tag*/, const D df,
    752                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
    753  const intptr_t xs3 = 3 * xs;
    754 
    755  const auto center = LoadU(df, d);
    756 
    757  // x grows, y constant
    758  const auto sum_yconst =
    759      Sum(LoadU(df, d - 4), LoadU(df, d - 3), LoadU(df, d - 2),
    760          LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2),
    761          LoadU(df, d + 3), LoadU(df, d + 4));
    762  // Will return this, sum of all line kernels
    763  auto retval = Mul(sum_yconst, sum_yconst);
    764 
    765  {
    766    // y grows, x constant
    767    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs3),
    768                   LoadU(df, d - xs - xs), LoadU(df, d - xs), center,
    769                   LoadU(df, d + xs), LoadU(df, d + xs + xs),
    770                   LoadU(df, d + xs3), LoadU(df, d + xs3 + xs));
    771    retval = MulAdd(sum, sum, retval);
    772  }
    773  {
    774    // both grow
    775    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2),
    776                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
    777                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
    778    retval = MulAdd(sum, sum, retval);
    779  }
    780  {
    781    // y grows, x shrinks
    782    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2),
    783                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
    784                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
    785    retval = MulAdd(sum, sum, retval);
    786  }
    787  {
    788    // y grows -4 to 4, x shrinks 1 -> -1
    789    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
    790                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
    791                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
    792                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
    793    retval = MulAdd(sum, sum, retval);
    794  }
    795  {
    796    //  y grows -4 to 4, x grows -1 -> 1
    797    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
    798                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
    799                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
    800                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
    801    retval = MulAdd(sum, sum, retval);
    802  }
    803  {
    804    // x grows -4 to 4, y grows -1 to 1
    805    auto sum =
    806        Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 3 - xs), LoadU(df, d - 2 - xs),
    807            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 + xs),
    808            LoadU(df, d + 3 + xs), LoadU(df, d + 4 + xs));
    809    retval = MulAdd(sum, sum, retval);
    810  }
    811  {
    812    // x grows -4 to 4, y shrinks 1 to -1
    813    auto sum =
    814        Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 3 + xs), LoadU(df, d - 2 + xs),
    815            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 - xs),
    816            LoadU(df, d + 3 - xs), LoadU(df, d + 4 - xs));
    817    retval = MulAdd(sum, sum, retval);
    818  }
    819  {
    820    /* 0_________
    821       1__*______
    822       2___*_____
    823       3___*_____
    824       4____0____
    825       5_____*___
    826       6_____*___
    827       7______*__
    828       8_________ */
    829    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1),
    830                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
    831                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
    832    retval = MulAdd(sum, sum, retval);
    833  }
    834  {
    835    /* 0_________
    836       1______*__
    837       2_____*___
    838       3_____*___
    839       4____0____
    840       5___*_____
    841       6___*_____
    842       7__*______
    843       8_________ */
    844    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1),
    845                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
    846                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
    847    retval = MulAdd(sum, sum, retval);
    848  }
    849  {
    850    /* 0_________
    851       1_________
    852       2_*_______
    853       3__**_____
    854       4____0____
    855       5_____**__
    856       6_______*_
    857       7_________
    858       8_________ */
    859    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2),
    860                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
    861                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
    862    retval = MulAdd(sum, sum, retval);
    863  }
    864  {
    865    /* 0_________
    866       1_________
    867       2_______*_
    868       3_____**__
    869       4____0____
    870       5__**_____
    871       6_*_______
    872       7_________
    873       8_________ */
    874    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2),
    875                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
    876                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
    877    retval = MulAdd(sum, sum, retval);
    878  }
    879  {
    880    /* 0_________
    881       1_________
    882       2_________
    883       3______***
    884       4___*0*___
    885       5***______
    886       6_________
    887       7_________
    888       8_________ */
    889 
    890    auto sum =
    891        Sum(LoadU(df, d + xs - 4), LoadU(df, d + xs - 3), LoadU(df, d + xs - 2),
    892            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d - xs + 2),
    893            LoadU(df, d - xs + 3), LoadU(df, d - xs + 4));
    894    retval = MulAdd(sum, sum, retval);
    895  }
    896  {
    897    /* 0_________
    898       1_________
    899       2_________
    900       3***______
    901       4___*0*___
    902       5______***
    903       6_________
    904       7_________
    905       8_________ */
    906    auto sum =
    907        Sum(LoadU(df, d - xs - 4), LoadU(df, d - xs - 3), LoadU(df, d - xs - 2),
    908            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + xs + 2),
    909            LoadU(df, d + xs + 3), LoadU(df, d + xs + 4));
    910    retval = MulAdd(sum, sum, retval);
    911  }
    912  {
    913    /* 0___*_____
    914       1___*_____
    915       2___*_____
    916       3____*____
    917       4____0____
    918       5____*____
    919       6_____*___
    920       7_____*___
    921       8_____*___ */
    922    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
    923                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
    924                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
    925                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
    926    retval = MulAdd(sum, sum, retval);
    927  }
    928  {
    929    /* 0_____*___
    930       1_____*___
    931       2____ *___
    932       3____*____
    933       4____0____
    934       5____*____
    935       6___*_____
    936       7___*_____
    937       8___*_____ */
    938    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
    939                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
    940                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
    941                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
    942    retval = MulAdd(sum, sum, retval);
    943  }
    944  return retval;
    945 }
    946 
    947 // Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known
    948 // to be far enough from the image borders. "diffs" is a packed image.
    949 template <class Tag>
    950 static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
    951                                                const size_t x0,
    952                                                const size_t y0) {
    953  const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0;
    954  const HWY_CAPPED(float, 1) df;
    955  if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) &&
    956       y0 < (diffs.ysize() - 4))) {
    957    return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
    958  }
    959 
    960  float borderimage[12 * 9];  // round up to 4
    961  for (int dy = 0; dy < 9; ++dy) {
    962    int y = y0 + dy - 4;
    963    if (y < 0 || static_cast<size_t>(y) >= diffs.ysize()) {
    964      for (int dx = 0; dx < 12; ++dx) {
    965        borderimage[dy * 12 + dx] = 0.0f;
    966      }
    967      continue;
    968    }
    969 
    970    const float* row_diffs = diffs.ConstRow(y);
    971    for (int dx = 0; dx < 9; ++dx) {
    972      int x = x0 + dx - 4;
    973      if (x < 0 || static_cast<size_t>(x) >= diffs.xsize()) {
    974        borderimage[dy * 12 + dx] = 0.0f;
    975      } else {
    976        borderimage[dy * 12 + dx] = row_diffs[x];
    977      }
    978    }
    979    std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f);
    980  }
    981  return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12));
    982 }
    983 
    984 template <class Tag>
    985 static Status MaltaDiffMapT(const Tag tag, const ImageF& lum0,
    986                            const ImageF& lum1, const double w_0gt1,
    987                            const double w_0lt1, const double norm1,
    988                            const double len, const double mulli,
    989                            ImageF* HWY_RESTRICT diffs,
    990                            ImageF* HWY_RESTRICT block_diff_ac) {
    991  JXL_ENSURE(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
    992  const size_t xsize_ = lum0.xsize();
    993  const size_t ysize_ = lum0.ysize();
    994 
    995  const float kWeight0 = 0.5;
    996  const float kWeight1 = 0.33;
    997 
    998  const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1);
    999  const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1);
   1000  const float norm2_0gt1 = w_pre0gt1 * norm1;
   1001  const float norm2_0lt1 = w_pre0lt1 * norm1;
   1002 
   1003  for (size_t y = 0; y < ysize_; ++y) {
   1004    const float* HWY_RESTRICT row0 = lum0.ConstRow(y);
   1005    const float* HWY_RESTRICT row1 = lum1.ConstRow(y);
   1006    float* HWY_RESTRICT row_diffs = diffs->Row(y);
   1007    for (size_t x = 0; x < xsize_; ++x) {
   1008      const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x]));
   1009      const float diff = row0[x] - row1[x];
   1010      const float scaler = norm2_0gt1 / (static_cast<float>(norm1) + absval);
   1011 
   1012      // Primary symmetric quadratic objective.
   1013      row_diffs[x] = scaler * diff;
   1014 
   1015      const float scaler2 = norm2_0lt1 / (static_cast<float>(norm1) + absval);
   1016      const double fabs0 = std::fabs(row0[x]);
   1017 
   1018      // Secondary half-open quadratic objectives.
   1019      const double too_small = 0.55 * fabs0;
   1020      const double too_big = 1.05 * fabs0;
   1021 
   1022      if (row0[x] < 0) {
   1023        if (row1[x] > -too_small) {
   1024          double impact = scaler2 * (row1[x] + too_small);
   1025          row_diffs[x] -= impact;
   1026        } else if (row1[x] < -too_big) {
   1027          double impact = scaler2 * (-row1[x] - too_big);
   1028          row_diffs[x] += impact;
   1029        }
   1030      } else {
   1031        if (row1[x] < too_small) {
   1032          double impact = scaler2 * (too_small - row1[x]);
   1033          row_diffs[x] += impact;
   1034        } else if (row1[x] > too_big) {
   1035          double impact = scaler2 * (row1[x] - too_big);
   1036          row_diffs[x] -= impact;
   1037        }
   1038      }
   1039    }
   1040  }
   1041 
   1042  size_t y0 = 0;
   1043  // Top
   1044  for (; y0 < 4; ++y0) {
   1045    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
   1046    for (size_t x0 = 0; x0 < xsize_; ++x0) {
   1047      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1048    }
   1049  }
   1050 
   1051  const HWY_FULL(float) df;
   1052  const size_t aligned_x = std::max(static_cast<size_t>(4), Lanes(df));
   1053  const intptr_t stride = diffs->PixelsPerRow();
   1054 
   1055  // Middle
   1056  for (; y0 < ysize_ - 4; ++y0) {
   1057    const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
   1058    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
   1059    size_t x0 = 0;
   1060    for (; x0 < aligned_x; ++x0) {
   1061      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1062    }
   1063    for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) {
   1064      auto diff = Load(df, row_diff + x0);
   1065      diff = Add(diff, MaltaUnit(Tag(), df, row_in + x0, stride));
   1066      Store(diff, df, row_diff + x0);
   1067    }
   1068 
   1069    for (; x0 < xsize_; ++x0) {
   1070      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1071    }
   1072  }
   1073 
   1074  // Bottom
   1075  for (; y0 < ysize_; ++y0) {
   1076    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
   1077    for (size_t x0 = 0; x0 < xsize_; ++x0) {
   1078      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
   1079    }
   1080  }
   1081  return true;
   1082 }
   1083 
   1084 // Need non-template wrapper functions for HWY_EXPORT.
   1085 Status MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
   1086                    const double w_0lt1, const double norm1,
   1087                    ImageF* HWY_RESTRICT diffs,
   1088                    ImageF* HWY_RESTRICT block_diff_ac) {
   1089  const double len = 3.75;
   1090  static const double mulli = 0.39905817637;
   1091  JXL_RETURN_IF_ERROR(MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1,
   1092                                    norm1, len, mulli, diffs, block_diff_ac));
   1093  return true;
   1094 }
   1095 
   1096 Status MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1,
   1097                      const double w_0gt1, const double w_0lt1,
   1098                      const double norm1, ImageF* HWY_RESTRICT diffs,
   1099                      ImageF* HWY_RESTRICT block_diff_ac) {
   1100  const double len = 3.75;
   1101  static const double mulli = 0.611612573796;
   1102  JXL_RETURN_IF_ERROR(MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1,
   1103                                    norm1, len, mulli, diffs, block_diff_ac));
   1104  return true;
   1105 }
   1106 
   1107 void CombineChannelsForMasking(const ImageF* hf, const ImageF* uhf,
   1108                               ImageF* out) {
   1109  // Only X and Y components are involved in masking. B's influence
   1110  // is considered less important in the high frequency area, and we
   1111  // don't model masking from lower frequency signals.
   1112  static const float muls[3] = {
   1113      2.5f,
   1114      0.4f,
   1115      0.4f,
   1116  };
   1117  // Silly and unoptimized approach here. TODO(jyrki): rework this.
   1118  for (size_t y = 0; y < hf[0].ysize(); ++y) {
   1119    const float* BUTTERAUGLI_RESTRICT row_y_hf = hf[1].Row(y);
   1120    const float* BUTTERAUGLI_RESTRICT row_y_uhf = uhf[1].Row(y);
   1121    const float* BUTTERAUGLI_RESTRICT row_x_hf = hf[0].Row(y);
   1122    const float* BUTTERAUGLI_RESTRICT row_x_uhf = uhf[0].Row(y);
   1123    float* BUTTERAUGLI_RESTRICT row = out->Row(y);
   1124    for (size_t x = 0; x < hf[0].xsize(); ++x) {
   1125      float xdiff = (row_x_uhf[x] + row_x_hf[x]) * muls[0];
   1126      float ydiff = row_y_uhf[x] * muls[1] + row_y_hf[x] * muls[2];
   1127      row[x] = xdiff * xdiff + ydiff * ydiff;
   1128      row[x] = std::sqrt(row[x]);
   1129    }
   1130  }
   1131 }
   1132 
   1133 void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
   1134  const size_t xsize = xyb.xsize();
   1135  const size_t ysize = xyb.ysize();
   1136  const float bias = mul * bias_arg;
   1137  const float sqrt_bias = std::sqrt(bias);
   1138  for (size_t y = 0; y < ysize; ++y) {
   1139    const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y);
   1140    float* BUTTERAUGLI_RESTRICT row_out = out->Row(y);
   1141    for (size_t x = 0; x < xsize; ++x) {
   1142      // kBias makes sqrt behave more linearly.
   1143      row_out[x] = std::sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias;
   1144    }
   1145  }
   1146 }
   1147 
   1148 // std::log(80.0) / std::log(255.0);
   1149 constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
   1150 static const float kInternalGoodQualityThreshold =
   1151    17.83f * kIntensityTargetNormalizationHack;
   1152 static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
   1153 
   1154 void StoreMin3(const float v, float& min0, float& min1, float& min2) {
   1155  if (v < min2) {
   1156    if (v < min0) {
   1157      min2 = min1;
   1158      min1 = min0;
   1159      min0 = v;
   1160    } else if (v < min1) {
   1161      min2 = min1;
   1162      min1 = v;
   1163    } else {
   1164      min2 = v;
   1165    }
   1166  }
   1167 }
   1168 
   1169 // Look for smooth areas near the area of degradation.
   1170 // If the areas area generally smooth, don't do masking.
   1171 void FuzzyErosion(const ImageF& from, ImageF* to) {
   1172  const size_t xsize = from.xsize();
   1173  const size_t ysize = from.ysize();
   1174  static const int kStep = 3;
   1175  for (size_t y = 0; y < ysize; ++y) {
   1176    for (size_t x = 0; x < xsize; ++x) {
   1177      float min0 = from.Row(y)[x];
   1178      float min1 = 2 * min0;
   1179      float min2 = min1;
   1180      if (x >= kStep) {
   1181        float v = from.Row(y)[x - kStep];
   1182        StoreMin3(v, min0, min1, min2);
   1183        if (y >= kStep) {
   1184          float v = from.Row(y - kStep)[x - kStep];
   1185          StoreMin3(v, min0, min1, min2);
   1186        }
   1187        if (y < ysize - kStep) {
   1188          float v = from.Row(y + kStep)[x - kStep];
   1189          StoreMin3(v, min0, min1, min2);
   1190        }
   1191      }
   1192      if (x < xsize - kStep) {
   1193        float v = from.Row(y)[x + kStep];
   1194        StoreMin3(v, min0, min1, min2);
   1195        if (y >= kStep) {
   1196          float v = from.Row(y - kStep)[x + kStep];
   1197          StoreMin3(v, min0, min1, min2);
   1198        }
   1199        if (y < ysize - kStep) {
   1200          float v = from.Row(y + kStep)[x + kStep];
   1201          StoreMin3(v, min0, min1, min2);
   1202        }
   1203      }
   1204      if (y >= kStep) {
   1205        float v = from.Row(y - kStep)[x];
   1206        StoreMin3(v, min0, min1, min2);
   1207      }
   1208      if (y < ysize - kStep) {
   1209        float v = from.Row(y + kStep)[x];
   1210        StoreMin3(v, min0, min1, min2);
   1211      }
   1212      to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2);
   1213    }
   1214  }
   1215 }
   1216 
   1217 // Compute values of local frequency and dc masking based on the activity
   1218 // in the two images. img_diff_ac may be null.
   1219 Status Mask(const ImageF& mask0, const ImageF& mask1,
   1220            const ButteraugliParams& params, BlurTemp* blur_temp,
   1221            ImageF* BUTTERAUGLI_RESTRICT mask,
   1222            ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
   1223  const size_t xsize = mask0.xsize();
   1224  const size_t ysize = mask0.ysize();
   1225  JxlMemoryManager* memory_manager = mask0.memory_manager();
   1226  JXL_ASSIGN_OR_RETURN(*mask, ImageF::Create(memory_manager, xsize, ysize));
   1227  static const float kMul = 6.19424080439;
   1228  static const float kBias = 12.61050594197;
   1229  static const float kRadius = 2.7;
   1230  JXL_ASSIGN_OR_RETURN(ImageF diff0,
   1231                       ImageF::Create(memory_manager, xsize, ysize));
   1232  JXL_ASSIGN_OR_RETURN(ImageF diff1,
   1233                       ImageF::Create(memory_manager, xsize, ysize));
   1234  JXL_ASSIGN_OR_RETURN(ImageF blurred0,
   1235                       ImageF::Create(memory_manager, xsize, ysize));
   1236  JXL_ASSIGN_OR_RETURN(ImageF blurred1,
   1237                       ImageF::Create(memory_manager, xsize, ysize));
   1238  DiffPrecompute(mask0, kMul, kBias, &diff0);
   1239  DiffPrecompute(mask1, kMul, kBias, &diff1);
   1240  JXL_RETURN_IF_ERROR(Blur(diff0, kRadius, params, blur_temp, &blurred0));
   1241  FuzzyErosion(blurred0, &diff0);
   1242  JXL_RETURN_IF_ERROR(Blur(diff1, kRadius, params, blur_temp, &blurred1));
   1243  for (size_t y = 0; y < ysize; ++y) {
   1244    for (size_t x = 0; x < xsize; ++x) {
   1245      mask->Row(y)[x] = diff0.Row(y)[x];
   1246      if (diff_ac != nullptr) {
   1247        static const float kMaskToErrorMul = 10.0;
   1248        float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x];
   1249        diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff;
   1250      }
   1251    }
   1252  }
   1253  return true;
   1254 }
   1255 
   1256 // `diff_ac` may be null.
   1257 Status MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
   1258                       const size_t xsize, const size_t ysize,
   1259                       const ButteraugliParams& params, BlurTemp* blur_temp,
   1260                       ImageF* BUTTERAUGLI_RESTRICT mask,
   1261                       ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
   1262  JxlMemoryManager* memory_manager = pi0.hf[0].memory_manager();
   1263  JXL_ASSIGN_OR_RETURN(ImageF mask0,
   1264                       ImageF::Create(memory_manager, xsize, ysize));
   1265  JXL_ASSIGN_OR_RETURN(ImageF mask1,
   1266                       ImageF::Create(memory_manager, xsize, ysize));
   1267  CombineChannelsForMasking(&pi0.hf[0], &pi0.uhf[0], &mask0);
   1268  CombineChannelsForMasking(&pi1.hf[0], &pi1.uhf[0], &mask1);
   1269  JXL_RETURN_IF_ERROR(Mask(mask0, mask1, params, blur_temp, mask, diff_ac));
   1270  return true;
   1271 }
   1272 
   1273 double MaskY(double delta) {
   1274  static const double offset = 0.829591754942;
   1275  static const double scaler = 0.451936922203;
   1276  static const double mul = 2.5485944793;
   1277  const double c = mul / ((scaler * delta) + offset);
   1278  const double retval = kGlobalScale * (1.0 + c);
   1279  return retval * retval;
   1280 }
   1281 
   1282 double MaskDcY(double delta) {
   1283  static const double offset = 0.20025578522;
   1284  static const double scaler = 3.87449418804;
   1285  static const double mul = 0.505054525019;
   1286  const double c = mul / ((scaler * delta) + offset);
   1287  const double retval = kGlobalScale * (1.0 + c);
   1288  return retval * retval;
   1289 }
   1290 
   1291 inline float MaskColor(const float color[3], const float mask) {
   1292  return color[0] * mask + color[1] * mask + color[2] * mask;
   1293 }
   1294 
   1295 // Diffmap := sqrt of sum{diff images by multiplied by X and Y/B masks}
   1296 Status CombineChannelsToDiffmap(const ImageF& mask,
   1297                                const Image3F& block_diff_dc,
   1298                                const Image3F& block_diff_ac, float xmul,
   1299                                ImageF* result) {
   1300  JXL_ENSURE(SameSize(mask, *result));
   1301  size_t xsize = mask.xsize();
   1302  size_t ysize = mask.ysize();
   1303  for (size_t y = 0; y < ysize; ++y) {
   1304    float* BUTTERAUGLI_RESTRICT row_out = result->Row(y);
   1305    for (size_t x = 0; x < xsize; ++x) {
   1306      float val = mask.Row(y)[x];
   1307      float maskval = MaskY(val);
   1308      float dc_maskval = MaskDcY(val);
   1309      float diff_dc[3];
   1310      float diff_ac[3];
   1311      for (int i = 0; i < 3; ++i) {
   1312        diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x];
   1313        diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x];
   1314      }
   1315      diff_ac[0] *= xmul;
   1316      diff_dc[0] *= xmul;
   1317      row_out[x] = std::sqrt(MaskColor(diff_dc, dc_maskval) +
   1318                             MaskColor(diff_ac, maskval));
   1319    }
   1320  }
   1321  return true;
   1322 }
   1323 
   1324 // Adds weighted L2 difference between i0 and i1 to diffmap.
   1325 static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
   1326                   ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   1327  if (w == 0) return;
   1328 
   1329  const HWY_FULL(float) d;
   1330  const auto weight = Set(d, w);
   1331 
   1332  for (size_t y = 0; y < i0.ysize(); ++y) {
   1333    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
   1334    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
   1335    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
   1336 
   1337    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
   1338      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
   1339      const auto diff2 = Mul(diff, diff);
   1340      const auto prev = Load(d, row_diff + x);
   1341      Store(MulAdd(diff2, weight, prev), d, row_diff + x);
   1342    }
   1343  }
   1344 }
   1345 
   1346 // Initializes diffmap to the weighted L2 difference between i0 and i1.
   1347 static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
   1348                      ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   1349  if (w == 0) return;
   1350 
   1351  const HWY_FULL(float) d;
   1352  const auto weight = Set(d, w);
   1353 
   1354  for (size_t y = 0; y < i0.ysize(); ++y) {
   1355    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
   1356    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
   1357    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
   1358 
   1359    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
   1360      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
   1361      const auto diff2 = Mul(diff, diff);
   1362      Store(Mul(diff2, weight), d, row_diff + x);
   1363    }
   1364  }
   1365 }
   1366 
   1367 // i0 is the original image.
   1368 // i1 is the deformed copy.
   1369 static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
   1370                             float w_0lt1,
   1371                             ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   1372  if (w_0gt1 == 0 && w_0lt1 == 0) {
   1373    return;
   1374  }
   1375 
   1376  const HWY_FULL(float) d;
   1377  const auto vw_0gt1 = Set(d, w_0gt1 * 0.8);
   1378  const auto vw_0lt1 = Set(d, w_0lt1 * 0.8);
   1379 
   1380  for (size_t y = 0; y < i0.ysize(); ++y) {
   1381    const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y);
   1382    const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y);
   1383    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
   1384 
   1385    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
   1386      const auto val0 = Load(d, row0 + x);
   1387      const auto val1 = Load(d, row1 + x);
   1388 
   1389      // Primary symmetric quadratic objective.
   1390      const auto diff = Sub(val0, val1);
   1391      auto total = MulAdd(Mul(diff, diff), vw_0gt1, Load(d, row_diff + x));
   1392 
   1393      // Secondary half-open quadratic objectives.
   1394      const auto fabs0 = Abs(val0);
   1395      const auto too_small = Mul(Set(d, 0.4), fabs0);
   1396      const auto too_big = fabs0;
   1397 
   1398      const auto if_neg = IfThenElse(
   1399          Gt(val1, Neg(too_small)), Add(val1, too_small),
   1400          IfThenElseZero(Lt(val1, Neg(too_big)), Sub(Neg(val1), too_big)));
   1401      const auto if_pos =
   1402          IfThenElse(Lt(val1, too_small), Sub(too_small, val1),
   1403                     IfThenElseZero(Gt(val1, too_big), Sub(val1, too_big)));
   1404      const auto v = IfThenElse(Lt(val0, Zero(d)), if_neg, if_pos);
   1405      total = MulAdd(vw_0lt1, Mul(v, v), total);
   1406      Store(total, d, row_diff + x);
   1407    }
   1408  }
   1409 }
   1410 
   1411 // A simple HDR compatible gamma function.
   1412 template <class DF, class V>
   1413 V Gamma(const DF df, V v) {
   1414  // ln(2) constant folded in because we want std::log but have FastLog2f.
   1415  const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f);
   1416  const auto kRetAdd = Set(df, -23.16046239805755);
   1417  // This should happen rarely, but may lead to a NaN in log, which is
   1418  // undesirable. Since negative photons don't exist we solve the NaNs by
   1419  // clamping here.
   1420  v = ZeroIfNegative(v);
   1421 
   1422  const auto biased = Add(v, Set(df, 9.9710635769299145));
   1423  const auto log = FastLog2f(df, biased);
   1424  // We could fold this into a custom Log2 polynomial, but there would be
   1425  // relatively little gain.
   1426  return MulAdd(kRetMul, log, kRetAdd);
   1427 }
   1428 
   1429 template <bool Clamp, class DF, class V>
   1430 BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
   1431                                        const V& in2, V* JXL_RESTRICT out0,
   1432                                        V* JXL_RESTRICT out1,
   1433                                        V* JXL_RESTRICT out2) {
   1434  // https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
   1435  static const double mixi0 = 0.29956550340058319;
   1436  static const double mixi1 = 0.63373087833825936;
   1437  static const double mixi2 = 0.077705617820981968;
   1438  static const double mixi3 = 1.7557483643287353;
   1439  static const double mixi4 = 0.22158691104574774;
   1440  static const double mixi5 = 0.69391388044116142;
   1441  static const double mixi6 = 0.0987313588422;
   1442  static const double mixi7 = 1.7557483643287353;
   1443  static const double mixi8 = 0.02;
   1444  static const double mixi9 = 0.02;
   1445  static const double mixi10 = 0.20480129041026129;
   1446  static const double mixi11 = 12.226454707163354;
   1447 
   1448  const V mix0 = Set(df, mixi0);
   1449  const V mix1 = Set(df, mixi1);
   1450  const V mix2 = Set(df, mixi2);
   1451  const V mix3 = Set(df, mixi3);
   1452  const V mix4 = Set(df, mixi4);
   1453  const V mix5 = Set(df, mixi5);
   1454  const V mix6 = Set(df, mixi6);
   1455  const V mix7 = Set(df, mixi7);
   1456  const V mix8 = Set(df, mixi8);
   1457  const V mix9 = Set(df, mixi9);
   1458  const V mix10 = Set(df, mixi10);
   1459  const V mix11 = Set(df, mixi11);
   1460 
   1461  *out0 = MulAdd(mix0, in0, MulAdd(mix1, in1, MulAdd(mix2, in2, mix3)));
   1462  *out1 = MulAdd(mix4, in0, MulAdd(mix5, in1, MulAdd(mix6, in2, mix7)));
   1463  *out2 = MulAdd(mix8, in0, MulAdd(mix9, in1, MulAdd(mix10, in2, mix11)));
   1464 
   1465  if (Clamp) {
   1466    *out0 = Max(*out0, mix3);
   1467    *out1 = Max(*out1, mix7);
   1468    *out2 = Max(*out2, mix11);
   1469  }
   1470 }
   1471 
   1472 // `blurred` is a temporary image used inside this function and not returned.
   1473 Status OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
   1474                          Image3F* blurred, BlurTemp* blur_temp, Image3F* xyb) {
   1475  JXL_ENSURE(blurred != nullptr);
   1476  const double kSigma = 1.2;
   1477  JXL_RETURN_IF_ERROR(
   1478      Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0)));
   1479  JXL_RETURN_IF_ERROR(
   1480      Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1)));
   1481  JXL_RETURN_IF_ERROR(
   1482      Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2)));
   1483  const HWY_FULL(float) df;
   1484  const auto intensity_target_multiplier = Set(df, params.intensity_target);
   1485  for (size_t y = 0; y < rgb.ysize(); ++y) {
   1486    const float* row_r = rgb.ConstPlaneRow(0, y);
   1487    const float* row_g = rgb.ConstPlaneRow(1, y);
   1488    const float* row_b = rgb.ConstPlaneRow(2, y);
   1489    const float* row_blurred_r = blurred->ConstPlaneRow(0, y);
   1490    const float* row_blurred_g = blurred->ConstPlaneRow(1, y);
   1491    const float* row_blurred_b = blurred->ConstPlaneRow(2, y);
   1492    float* row_out_x = xyb->PlaneRow(0, y);
   1493    float* row_out_y = xyb->PlaneRow(1, y);
   1494    float* row_out_b = xyb->PlaneRow(2, y);
   1495    const auto min = Set(df, 1e-4f);
   1496    for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
   1497      auto sensitivity0 = Undefined(df);
   1498      auto sensitivity1 = Undefined(df);
   1499      auto sensitivity2 = Undefined(df);
   1500      {
   1501        // Calculate sensitivity based on the smoothed image gamma derivative.
   1502        auto pre_mixed0 = Undefined(df);
   1503        auto pre_mixed1 = Undefined(df);
   1504        auto pre_mixed2 = Undefined(df);
   1505        OpsinAbsorbance<true>(
   1506            df, Mul(Load(df, row_blurred_r + x), intensity_target_multiplier),
   1507            Mul(Load(df, row_blurred_g + x), intensity_target_multiplier),
   1508            Mul(Load(df, row_blurred_b + x), intensity_target_multiplier),
   1509            &pre_mixed0, &pre_mixed1, &pre_mixed2);
   1510        pre_mixed0 = Max(pre_mixed0, min);
   1511        pre_mixed1 = Max(pre_mixed1, min);
   1512        pre_mixed2 = Max(pre_mixed2, min);
   1513        sensitivity0 = Div(Gamma(df, pre_mixed0), pre_mixed0);
   1514        sensitivity1 = Div(Gamma(df, pre_mixed1), pre_mixed1);
   1515        sensitivity2 = Div(Gamma(df, pre_mixed2), pre_mixed2);
   1516        sensitivity0 = Max(sensitivity0, min);
   1517        sensitivity1 = Max(sensitivity1, min);
   1518        sensitivity2 = Max(sensitivity2, min);
   1519      }
   1520      auto cur_mixed0 = Undefined(df);
   1521      auto cur_mixed1 = Undefined(df);
   1522      auto cur_mixed2 = Undefined(df);
   1523      OpsinAbsorbance<false>(
   1524          df, Mul(Load(df, row_r + x), intensity_target_multiplier),
   1525          Mul(Load(df, row_g + x), intensity_target_multiplier),
   1526          Mul(Load(df, row_b + x), intensity_target_multiplier), &cur_mixed0,
   1527          &cur_mixed1, &cur_mixed2);
   1528      cur_mixed0 = Mul(cur_mixed0, sensitivity0);
   1529      cur_mixed1 = Mul(cur_mixed1, sensitivity1);
   1530      cur_mixed2 = Mul(cur_mixed2, sensitivity2);
   1531      // This is a kludge. The negative values should be zeroed away before
   1532      // blurring. Ideally there would be no negative values in the first place.
   1533      const auto min01 = Set(df, 1.7557483643287353f);
   1534      const auto min2 = Set(df, 12.226454707163354f);
   1535      cur_mixed0 = Max(cur_mixed0, min01);
   1536      cur_mixed1 = Max(cur_mixed1, min01);
   1537      cur_mixed2 = Max(cur_mixed2, min2);
   1538 
   1539      Store(Sub(cur_mixed0, cur_mixed1), df, row_out_x + x);
   1540      Store(Add(cur_mixed0, cur_mixed1), df, row_out_y + x);
   1541      Store(cur_mixed2, df, row_out_b + x);
   1542    }
   1543  }
   1544  return true;
   1545 }
   1546 
   1547 Status ButteraugliDiffmapInPlace(Image3F& image0, Image3F& image1,
   1548                                 const ButteraugliParams& params,
   1549                                 ImageF& diffmap) {
   1550  // image0 and image1 are in linear sRGB color space
   1551  const size_t xsize = image0.xsize();
   1552  const size_t ysize = image0.ysize();
   1553  JxlMemoryManager* memory_manager = image0.memory_manager();
   1554  BlurTemp blur_temp;
   1555  {
   1556    // Convert image0 and image1 to XYB in-place
   1557    JXL_ASSIGN_OR_RETURN(Image3F temp,
   1558                         Image3F::Create(memory_manager, xsize, ysize));
   1559    JXL_RETURN_IF_ERROR(
   1560        OpsinDynamicsImage(image0, params, &temp, &blur_temp, &image0));
   1561    JXL_RETURN_IF_ERROR(
   1562        OpsinDynamicsImage(image1, params, &temp, &blur_temp, &image1));
   1563  }
   1564  // image0 and image1 are in XYB color space
   1565  JXL_ASSIGN_OR_RETURN(ImageF block_diff_dc,
   1566                       ImageF::Create(memory_manager, xsize, ysize));
   1567  ZeroFillImage(&block_diff_dc);
   1568  {
   1569    // separate out LF components from image0 and image1 and compute the dc
   1570    // diff image from them
   1571    JXL_ASSIGN_OR_RETURN(Image3F lf0,
   1572                         Image3F::Create(memory_manager, xsize, ysize));
   1573    JXL_ASSIGN_OR_RETURN(Image3F lf1,
   1574                         Image3F::Create(memory_manager, xsize, ysize));
   1575    JXL_RETURN_IF_ERROR(
   1576        SeparateLFAndMF(params, image0, &lf0, &image0, &blur_temp));
   1577    JXL_RETURN_IF_ERROR(
   1578        SeparateLFAndMF(params, image1, &lf1, &image1, &blur_temp));
   1579    for (size_t c = 0; c < 3; ++c) {
   1580      L2Diff(lf0.Plane(c), lf1.Plane(c), wmul[6 + c], &block_diff_dc);
   1581    }
   1582  }
   1583  // image0 and image1 are MF residuals (before blurring) in XYB color space
   1584  ImageF hf0[2];
   1585  ImageF hf1[2];
   1586  JXL_RETURN_IF_ERROR(SeparateMFAndHF(params, &image0, &hf0[0], &blur_temp));
   1587  JXL_RETURN_IF_ERROR(SeparateMFAndHF(params, &image1, &hf1[0], &blur_temp));
   1588  // image0 and image1 are MF-images in XYB color space
   1589 
   1590  JXL_ASSIGN_OR_RETURN(ImageF block_diff_ac,
   1591                       ImageF::Create(memory_manager, xsize, ysize));
   1592  ZeroFillImage(&block_diff_ac);
   1593  // start accumulating ac diff image from MF images
   1594  {
   1595    JXL_ASSIGN_OR_RETURN(ImageF diffs,
   1596                         ImageF::Create(memory_manager, xsize, ysize));
   1597    JXL_RETURN_IF_ERROR(MaltaDiffMapLF(image0.Plane(1), image1.Plane(1),
   1598                                       wMfMalta, wMfMalta, norm1Mf, &diffs,
   1599                                       &block_diff_ac));
   1600    JXL_RETURN_IF_ERROR(MaltaDiffMapLF(image0.Plane(0), image1.Plane(0),
   1601                                       wMfMaltaX, wMfMaltaX, norm1MfX, &diffs,
   1602                                       &block_diff_ac));
   1603  }
   1604  for (size_t c = 0; c < 3; ++c) {
   1605    L2Diff(image0.Plane(c), image1.Plane(c), wmul[3 + c], &block_diff_ac);
   1606  }
   1607  // we will not need the MF-images and more, so we deallocate them to reduce
   1608  // peak memory usage
   1609  image0 = Image3F();
   1610  image1 = Image3F();
   1611 
   1612  ImageF uhf0[2];
   1613  ImageF uhf1[2];
   1614  JXL_RETURN_IF_ERROR(SeparateHFAndUHF(params, &hf0[0], &uhf0[0], &blur_temp));
   1615  JXL_RETURN_IF_ERROR(SeparateHFAndUHF(params, &hf1[0], &uhf1[0], &blur_temp));
   1616 
   1617  // continue accumulating ac diff image from HF and UHF images
   1618  const float hf_asymmetry = params.hf_asymmetry;
   1619  {
   1620    JXL_ASSIGN_OR_RETURN(ImageF diffs,
   1621                         ImageF::Create(memory_manager, xsize, ysize));
   1622    JXL_RETURN_IF_ERROR(MaltaDiffMap(uhf0[1], uhf1[1], wUhfMalta * hf_asymmetry,
   1623                                     wUhfMalta / hf_asymmetry, norm1Uhf, &diffs,
   1624                                     &block_diff_ac));
   1625    JXL_RETURN_IF_ERROR(MaltaDiffMap(
   1626        uhf0[0], uhf1[0], wUhfMaltaX * hf_asymmetry, wUhfMaltaX / hf_asymmetry,
   1627        norm1UhfX, &diffs, &block_diff_ac));
   1628    JXL_RETURN_IF_ERROR(MaltaDiffMapLF(
   1629        hf0[1], hf1[1], wHfMalta * std::sqrt(hf_asymmetry),
   1630        wHfMalta / std::sqrt(hf_asymmetry), norm1Hf, &diffs, &block_diff_ac));
   1631    JXL_RETURN_IF_ERROR(MaltaDiffMapLF(
   1632        hf0[0], hf1[0], wHfMaltaX * std::sqrt(hf_asymmetry),
   1633        wHfMaltaX / std::sqrt(hf_asymmetry), norm1HfX, &diffs, &block_diff_ac));
   1634  }
   1635  for (size_t c = 0; c < 2; ++c) {
   1636    L2DiffAsymmetric(hf0[c], hf1[c], wmul[c] * hf_asymmetry,
   1637                     wmul[c] / hf_asymmetry, &block_diff_ac);
   1638  }
   1639 
   1640  // compute mask image from HF and UHF X and Y images
   1641  JXL_ASSIGN_OR_RETURN(ImageF mask,
   1642                       ImageF::Create(memory_manager, xsize, ysize));
   1643  {
   1644    JXL_ASSIGN_OR_RETURN(ImageF mask0,
   1645                         ImageF::Create(memory_manager, xsize, ysize));
   1646    JXL_ASSIGN_OR_RETURN(ImageF mask1,
   1647                         ImageF::Create(memory_manager, xsize, ysize));
   1648    CombineChannelsForMasking(&hf0[0], &uhf0[0], &mask0);
   1649    CombineChannelsForMasking(&hf1[0], &uhf1[0], &mask1);
   1650    DeallocateHFAndUHF(&hf1[0], &uhf1[0]);
   1651    DeallocateHFAndUHF(&hf0[0], &uhf0[0]);
   1652    JXL_RETURN_IF_ERROR(
   1653        Mask(mask0, mask1, params, &blur_temp, &mask, &block_diff_ac));
   1654  }
   1655 
   1656  // compute final diffmap from mask image and ac and dc diff images
   1657  JXL_ASSIGN_OR_RETURN(diffmap, ImageF::Create(memory_manager, xsize, ysize));
   1658  for (size_t y = 0; y < ysize; ++y) {
   1659    const float* row_dc = block_diff_dc.Row(y);
   1660    const float* row_ac = block_diff_ac.Row(y);
   1661    float* row_out = diffmap.Row(y);
   1662    for (size_t x = 0; x < xsize; ++x) {
   1663      const float val = mask.Row(y)[x];
   1664      row_out[x] = sqrt(row_dc[x] * MaskDcY(val) + row_ac[x] * MaskY(val));
   1665    }
   1666  }
   1667  return true;
   1668 }
   1669 
   1670 // NOLINTNEXTLINE(google-readability-namespace-comments)
   1671 }  // namespace HWY_NAMESPACE
   1672 }  // namespace jxl
   1673 HWY_AFTER_NAMESPACE();
   1674 
   1675 #if HWY_ONCE
   1676 namespace jxl {
   1677 
   1678 HWY_EXPORT(SeparateFrequencies);       // Local function.
   1679 HWY_EXPORT(MaskPsychoImage);           // Local function.
   1680 HWY_EXPORT(L2DiffAsymmetric);          // Local function.
   1681 HWY_EXPORT(L2Diff);                    // Local function.
   1682 HWY_EXPORT(SetL2Diff);                 // Local function.
   1683 HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
   1684 HWY_EXPORT(MaltaDiffMap);              // Local function.
   1685 HWY_EXPORT(MaltaDiffMapLF);            // Local function.
   1686 HWY_EXPORT(OpsinDynamicsImage);        // Local function.
   1687 HWY_EXPORT(ButteraugliDiffmapInPlace);  // Local function.
   1688 
   1689 #if BUTTERAUGLI_ENABLE_CHECKS
   1690 
   1691 static inline bool IsNan(const float x) {
   1692  uint32_t bits;
   1693  memcpy(&bits, &x, sizeof(bits));
   1694  const uint32_t bitmask_exp = 0x7F800000;
   1695  return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF);
   1696 }
   1697 
   1698 static inline bool IsNan(const double x) {
   1699  uint64_t bits;
   1700  memcpy(&bits, &x, sizeof(bits));
   1701  return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) ||
   1702         (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL);
   1703 }
   1704 
   1705 static inline void CheckImage(const ImageF& image, const char* name) {
   1706  for (size_t y = 0; y < image.ysize(); ++y) {
   1707    const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
   1708    for (size_t x = 0; x < image.xsize(); ++x) {
   1709      if (IsNan(row[x])) {
   1710        printf("NAN: Image %s @ %" PRIuS ",%" PRIuS " (of %" PRIuS ",%" PRIuS
   1711               ")\n",
   1712               name, x, y, image.xsize(), image.ysize());
   1713        exit(1);
   1714      }
   1715    }
   1716  }
   1717 }
   1718 
   1719 #define CHECK_NAN(x, str)                \
   1720  do {                                   \
   1721    if (IsNan(x)) {                      \
   1722      printf("%d: %s\n", __LINE__, str); \
   1723      abort();                           \
   1724    }                                    \
   1725  } while (0)
   1726 
   1727 #define CHECK_IMAGE(image, name) CheckImage(image, name)
   1728 
   1729 #else  // BUTTERAUGLI_ENABLE_CHECKS
   1730 
   1731 #define CHECK_NAN(x, str)
   1732 #define CHECK_IMAGE(image, name)
   1733 
   1734 #endif  // BUTTERAUGLI_ENABLE_CHECKS
   1735 
   1736 // Calculate a 2x2 subsampled image for purposes of recursive butteraugli at
   1737 // multiresolution.
   1738 static StatusOr<Image3F> SubSample2x(const Image3F& in) {
   1739  size_t xs = (in.xsize() + 1) / 2;
   1740  size_t ys = (in.ysize() + 1) / 2;
   1741  JxlMemoryManager* memory_manager = in.memory_manager();
   1742  JXL_ASSIGN_OR_RETURN(Image3F retval, Image3F::Create(memory_manager, xs, ys));
   1743  for (size_t c = 0; c < 3; ++c) {
   1744    for (size_t y = 0; y < ys; ++y) {
   1745      for (size_t x = 0; x < xs; ++x) {
   1746        retval.PlaneRow(c, y)[x] = 0;
   1747      }
   1748    }
   1749  }
   1750  for (size_t c = 0; c < 3; ++c) {
   1751    for (size_t y = 0; y < in.ysize(); ++y) {
   1752      for (size_t x = 0; x < in.xsize(); ++x) {
   1753        retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x];
   1754      }
   1755    }
   1756    if ((in.xsize() & 1) != 0) {
   1757      for (size_t y = 0; y < retval.ysize(); ++y) {
   1758        size_t last_column = retval.xsize() - 1;
   1759        retval.PlaneRow(c, y)[last_column] *= 2.0f;
   1760      }
   1761    }
   1762    if ((in.ysize() & 1) != 0) {
   1763      for (size_t x = 0; x < retval.xsize(); ++x) {
   1764        size_t last_row = retval.ysize() - 1;
   1765        retval.PlaneRow(c, last_row)[x] *= 2.0f;
   1766      }
   1767    }
   1768  }
   1769  return retval;
   1770 }
   1771 
   1772 // Supersample src by 2x and add it to dest.
   1773 static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) {
   1774  for (size_t y = 0; y < dest.ysize(); ++y) {
   1775    for (size_t x = 0; x < dest.xsize(); ++x) {
   1776      // There will be less errors from the more averaged images.
   1777      // We take it into account to some extent using a scaler.
   1778      static const double kHeuristicMixingValue = 0.3;
   1779      dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w;
   1780      dest.Row(y)[x] += w * src.Row(y / 2)[x / 2];
   1781    }
   1782  }
   1783 }
   1784 
   1785 Image3F* ButteraugliComparator::Temp() const {
   1786  bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel);
   1787  if (was_in_use) return nullptr;
   1788  return &temp_;
   1789 }
   1790 
   1791 void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); }
   1792 
   1793 ButteraugliComparator::ButteraugliComparator(size_t xsize, size_t ysize,
   1794                                             const ButteraugliParams& params)
   1795    : xsize_(xsize), ysize_(ysize), params_(params) {}
   1796 
   1797 StatusOr<std::unique_ptr<ButteraugliComparator>> ButteraugliComparator::Make(
   1798    const Image3F& rgb0, const ButteraugliParams& params) {
   1799  size_t xsize = rgb0.xsize();
   1800  size_t ysize = rgb0.ysize();
   1801  JxlMemoryManager* memory_manager = rgb0.memory_manager();
   1802  std::unique_ptr<ButteraugliComparator> result =
   1803      std::unique_ptr<ButteraugliComparator>(
   1804          new ButteraugliComparator(xsize, ysize, params));
   1805  JXL_ASSIGN_OR_RETURN(result->temp_,
   1806                       Image3F::Create(memory_manager, xsize, ysize));
   1807 
   1808  if (xsize < 8 || ysize < 8) {
   1809    return result;
   1810  }
   1811 
   1812  JXL_ASSIGN_OR_RETURN(Image3F xyb0,
   1813                       Image3F::Create(memory_manager, xsize, ysize));
   1814  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
   1815      rgb0, params, result->Temp(), &result->blur_temp_, &xyb0));
   1816  result->ReleaseTemp();
   1817  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(SeparateFrequencies)(
   1818      xsize, ysize, params, &result->blur_temp_, xyb0, result->pi0_));
   1819 
   1820  // Awful recursive construction of samples of different resolution.
   1821  // This is an after-thought and possibly somewhat parallel in
   1822  // functionality with the PsychoImage multi-resolution approach.
   1823  JXL_ASSIGN_OR_RETURN(Image3F subsampledRgb0, SubSample2x(rgb0));
   1824  JXL_ASSIGN_OR_RETURN(result->sub_,
   1825                       ButteraugliComparator::Make(subsampledRgb0, params));
   1826  return result;
   1827 }
   1828 
   1829 Status ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
   1830  return HWY_DYNAMIC_DISPATCH(MaskPsychoImage)(
   1831      pi0_, pi0_, xsize_, ysize_, params_, &blur_temp_, mask, nullptr);
   1832 }
   1833 
   1834 Status ButteraugliComparator::Diffmap(const Image3F& rgb1,
   1835                                      ImageF& result) const {
   1836  JxlMemoryManager* memory_manager = rgb1.memory_manager();
   1837  if (xsize_ < 8 || ysize_ < 8) {
   1838    ZeroFillImage(&result);
   1839    return true;
   1840  }
   1841  JXL_ASSIGN_OR_RETURN(Image3F xyb1,
   1842                       Image3F::Create(memory_manager, xsize_, ysize_));
   1843  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
   1844      rgb1, params_, Temp(), &blur_temp_, &xyb1));
   1845  ReleaseTemp();
   1846  JXL_RETURN_IF_ERROR(DiffmapOpsinDynamicsImage(xyb1, result));
   1847  if (sub_) {
   1848    if (sub_->xsize_ < 8 || sub_->ysize_ < 8) {
   1849      return true;
   1850    }
   1851    JXL_ASSIGN_OR_RETURN(
   1852        Image3F sub_xyb,
   1853        Image3F::Create(memory_manager, sub_->xsize_, sub_->ysize_));
   1854    JXL_ASSIGN_OR_RETURN(Image3F subsampledRgb1, SubSample2x(rgb1));
   1855    JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
   1856        subsampledRgb1, params_, sub_->Temp(), &sub_->blur_temp_, &sub_xyb));
   1857    sub_->ReleaseTemp();
   1858    ImageF subresult;
   1859    JXL_RETURN_IF_ERROR(sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult));
   1860    AddSupersampled2x(subresult, 0.5, result);
   1861  }
   1862  return true;
   1863 }
   1864 
   1865 Status ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
   1866                                                        ImageF& result) const {
   1867  JxlMemoryManager* memory_manager = xyb1.memory_manager();
   1868  if (xsize_ < 8 || ysize_ < 8) {
   1869    ZeroFillImage(&result);
   1870    return true;
   1871  }
   1872  PsychoImage pi1;
   1873  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(SeparateFrequencies)(
   1874      xsize_, ysize_, params_, &blur_temp_, xyb1, pi1));
   1875  JXL_ASSIGN_OR_RETURN(result, ImageF::Create(memory_manager, xsize_, ysize_));
   1876  return DiffmapPsychoImage(pi1, result);
   1877 }
   1878 
   1879 namespace {
   1880 
   1881 Status MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
   1882                    const double w_0lt1, const double norm1,
   1883                    ImageF* HWY_RESTRICT diffs,
   1884                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
   1885  return HWY_DYNAMIC_DISPATCH(MaltaDiffMap)(lum0, lum1, w_0gt1, w_0lt1, norm1,
   1886                                            diffs, &block_diff_ac->Plane(c));
   1887 }
   1888 
   1889 Status MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1,
   1890                      const double w_0gt1, const double w_0lt1,
   1891                      const double norm1, ImageF* HWY_RESTRICT diffs,
   1892                      Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
   1893  return HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)(lum0, lum1, w_0gt1, w_0lt1, norm1,
   1894                                              diffs, &block_diff_ac->Plane(c));
   1895 }
   1896 
   1897 }  // namespace
   1898 
   1899 Status ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
   1900                                                 ImageF& diffmap) const {
   1901  JxlMemoryManager* memory_manager = diffmap.memory_manager();
   1902  if (xsize_ < 8 || ysize_ < 8) {
   1903    ZeroFillImage(&diffmap);
   1904    return true;
   1905  }
   1906 
   1907  const float hf_asymmetry_ = params_.hf_asymmetry;
   1908  const float xmul_ = params_.xmul;
   1909 
   1910  JXL_ASSIGN_OR_RETURN(ImageF diffs,
   1911                       ImageF::Create(memory_manager, xsize_, ysize_));
   1912  JXL_ASSIGN_OR_RETURN(Image3F block_diff_ac,
   1913                       Image3F::Create(memory_manager, xsize_, ysize_));
   1914  ZeroFillImage(&block_diff_ac);
   1915  JXL_RETURN_IF_ERROR(MaltaDiffMap(
   1916      pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
   1917      wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1));
   1918  JXL_RETURN_IF_ERROR(MaltaDiffMap(
   1919      pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
   1920      wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac, 0));
   1921  JXL_RETURN_IF_ERROR(MaltaDiffMapLF(
   1922      pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
   1923      wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs, &block_diff_ac, 1));
   1924  JXL_RETURN_IF_ERROR(MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0],
   1925                                     wHfMaltaX * std::sqrt(hf_asymmetry_),
   1926                                     wHfMaltaX / std::sqrt(hf_asymmetry_),
   1927                                     norm1HfX, &diffs, &block_diff_ac, 0));
   1928  JXL_RETURN_IF_ERROR(MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1),
   1929                                     wMfMalta, wMfMalta, norm1Mf, &diffs,
   1930                                     &block_diff_ac, 1));
   1931  JXL_RETURN_IF_ERROR(MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0),
   1932                                     wMfMaltaX, wMfMaltaX, norm1MfX, &diffs,
   1933                                     &block_diff_ac, 0));
   1934 
   1935  JXL_ASSIGN_OR_RETURN(Image3F block_diff_dc,
   1936                       Image3F::Create(memory_manager, xsize_, ysize_));
   1937  for (size_t c = 0; c < 3; ++c) {
   1938    if (c < 2) {  // No blue channel error accumulated at HF.
   1939      HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
   1940      (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
   1941       &block_diff_ac.Plane(c));
   1942    }
   1943    HWY_DYNAMIC_DISPATCH(L2Diff)
   1944    (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac.Plane(c));
   1945    HWY_DYNAMIC_DISPATCH(SetL2Diff)
   1946    (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc.Plane(c));
   1947  }
   1948 
   1949  ImageF mask;
   1950  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(MaskPsychoImage)(
   1951      pi0_, pi1, xsize_, ysize_, params_, &blur_temp_, &mask,
   1952      &block_diff_ac.Plane(1)));
   1953 
   1954  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)(
   1955      mask, block_diff_dc, block_diff_ac, xmul_, &diffmap));
   1956  return true;
   1957 }
   1958 
   1959 double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
   1960                                   const ButteraugliParams* params) {
   1961  float retval = 0.0f;
   1962  for (size_t y = 0; y < diffmap.ysize(); ++y) {
   1963    const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y);
   1964    for (size_t x = 0; x < diffmap.xsize(); ++x) {
   1965      retval = std::max(retval, row[x]);
   1966    }
   1967  }
   1968  return retval;
   1969 }
   1970 
   1971 Status ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
   1972                          double hf_asymmetry, double xmul, ImageF& diffmap) {
   1973  ButteraugliParams params;
   1974  params.hf_asymmetry = hf_asymmetry;
   1975  params.xmul = xmul;
   1976  return ButteraugliDiffmap(rgb0, rgb1, params, diffmap);
   1977 }
   1978 
   1979 template <size_t kMax>
   1980 bool ButteraugliDiffmapSmall(const Image3F& rgb0, const Image3F& rgb1,
   1981                             const ButteraugliParams& params, ImageF& diffmap) {
   1982  const size_t xsize = rgb0.xsize();
   1983  const size_t ysize = rgb0.ysize();
   1984  JxlMemoryManager* memory_manager = rgb0.memory_manager();
   1985  // Butteraugli values for small (where xsize or ysize is smaller
   1986  // than 8 pixels) images are non-sensical, but most likely it is
   1987  // less disruptive to try to compute something than just give up.
   1988  // Temporarily extend the borders of the image to fit 8 x 8 size.
   1989  size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
   1990  size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
   1991  size_t xscaled = std::max<size_t>(kMax, xsize);
   1992  size_t yscaled = std::max<size_t>(kMax, ysize);
   1993  JXL_ASSIGN_OR_RETURN(Image3F scaled0,
   1994                       Image3F::Create(memory_manager, xscaled, yscaled));
   1995  JXL_ASSIGN_OR_RETURN(Image3F scaled1,
   1996                       Image3F::Create(memory_manager, xscaled, yscaled));
   1997  for (int i = 0; i < 3; ++i) {
   1998    for (size_t y = 0; y < yscaled; ++y) {
   1999      for (size_t x = 0; x < xscaled; ++x) {
   2000        size_t x2 = std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
   2001        size_t y2 = std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
   2002        scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
   2003        scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
   2004      }
   2005    }
   2006  }
   2007  ImageF diffmap_scaled;
   2008  const bool ok = ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
   2009  JXL_ASSIGN_OR_RETURN(diffmap, ImageF::Create(memory_manager, xsize, ysize));
   2010  for (size_t y = 0; y < ysize; ++y) {
   2011    for (size_t x = 0; x < xsize; ++x) {
   2012      diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
   2013    }
   2014  }
   2015  return ok;
   2016 }
   2017 
   2018 Status ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
   2019                          const ButteraugliParams& params, ImageF& diffmap) {
   2020  const size_t xsize = rgb0.xsize();
   2021  const size_t ysize = rgb0.ysize();
   2022  if (xsize < 1 || ysize < 1) {
   2023    return JXL_FAILURE("Zero-sized image");
   2024  }
   2025  if (!SameSize(rgb0, rgb1)) {
   2026    return JXL_FAILURE("Size mismatch");
   2027  }
   2028  static const int kMax = 8;
   2029  if (xsize < kMax || ysize < kMax) {
   2030    return ButteraugliDiffmapSmall<kMax>(rgb0, rgb1, params, diffmap);
   2031  }
   2032  JXL_ASSIGN_OR_RETURN(std::unique_ptr<ButteraugliComparator> butteraugli,
   2033                       ButteraugliComparator::Make(rgb0, params));
   2034  JXL_RETURN_IF_ERROR(butteraugli->Diffmap(rgb1, diffmap));
   2035  return true;
   2036 }
   2037 
   2038 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
   2039                          float hf_asymmetry, float xmul, ImageF& diffmap,
   2040                          double& diffvalue) {
   2041  ButteraugliParams params;
   2042  params.hf_asymmetry = hf_asymmetry;
   2043  params.xmul = xmul;
   2044  return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue);
   2045 }
   2046 
   2047 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
   2048                          const ButteraugliParams& params, ImageF& diffmap,
   2049                          double& diffvalue) {
   2050  if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
   2051    return false;
   2052  }
   2053  diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   2054  return true;
   2055 }
   2056 
   2057 Status ButteraugliInterfaceInPlace(Image3F&& rgb0, Image3F&& rgb1,
   2058                                   const ButteraugliParams& params,
   2059                                   ImageF& diffmap, double& diffvalue) {
   2060  const size_t xsize = rgb0.xsize();
   2061  const size_t ysize = rgb0.ysize();
   2062  if (xsize < 1 || ysize < 1) {
   2063    return JXL_FAILURE("Zero-sized image");
   2064  }
   2065  if (!SameSize(rgb0, rgb1)) {
   2066    return JXL_FAILURE("Size mismatch");
   2067  }
   2068  static const int kMax = 8;
   2069  if (xsize < kMax || ysize < kMax) {
   2070    bool ok = ButteraugliDiffmapSmall<kMax>(rgb0, rgb1, params, diffmap);
   2071    diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   2072    return ok;
   2073  }
   2074  ImageF subdiffmap;
   2075  if (xsize >= 15 && ysize >= 15) {
   2076    JXL_ASSIGN_OR_RETURN(Image3F rgb0_sub, SubSample2x(rgb0));
   2077    JXL_ASSIGN_OR_RETURN(Image3F rgb1_sub, SubSample2x(rgb1));
   2078    JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ButteraugliDiffmapInPlace)(
   2079        rgb0_sub, rgb1_sub, params, subdiffmap));
   2080  }
   2081  JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(ButteraugliDiffmapInPlace)(
   2082      rgb0, rgb1, params, diffmap));
   2083  if (xsize >= 15 && ysize >= 15) {
   2084    AddSupersampled2x(subdiffmap, 0.5, diffmap);
   2085  }
   2086  diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   2087  return true;
   2088 }
   2089 
   2090 double ButteraugliFuzzyClass(double score) {
   2091  static const double fuzzy_width_up = 4.8;
   2092  static const double fuzzy_width_down = 4.8;
   2093  static const double m0 = 2.0;
   2094  static const double scaler = 0.7777;
   2095  double val;
   2096  if (score < 1.0) {
   2097    // val in [scaler .. 2.0]
   2098    val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down));
   2099    val -= 1.0;           // from [1 .. 2] to [0 .. 1]
   2100    val *= 2.0 - scaler;  // from [0 .. 1] to [0 .. 2.0 - scaler]
   2101    val += scaler;        // from [0 .. 2.0 - scaler] to [scaler .. 2.0]
   2102  } else {
   2103    // val in [0 .. scaler]
   2104    val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up));
   2105    val *= scaler;
   2106  }
   2107  return val;
   2108 }
   2109 
   2110 // #define PRINT_OUT_NORMALIZATION
   2111 
   2112 double ButteraugliFuzzyInverse(double seek) {
   2113  double pos = 0;
   2114  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
   2115  for (double range = 1.0; range >= 1e-10; range *= 0.5) {
   2116    double cur = ButteraugliFuzzyClass(pos);
   2117    if (cur < seek) {
   2118      pos -= range;
   2119    } else {
   2120      pos += range;
   2121    }
   2122  }
   2123 #ifdef PRINT_OUT_NORMALIZATION
   2124  if (seek == 1.0) {
   2125    fprintf(stderr, "Fuzzy inverse %g\n", pos);
   2126  }
   2127 #endif
   2128  return pos;
   2129 }
   2130 
   2131 #ifdef PRINT_OUT_NORMALIZATION
   2132 static double print_out_normalization = ButteraugliFuzzyInverse(1.0);
   2133 #endif
   2134 
   2135 namespace {
   2136 
   2137 void ScoreToRgb(double score, double good_threshold, double bad_threshold,
   2138                float rgb[3]) {
   2139  double heatmap[12][3] = {
   2140      {0, 0, 0},       {0, 0, 1},
   2141      {0, 1, 1},       {0, 1, 0},  // Good level
   2142      {1, 1, 0},       {1, 0, 0},  // Bad level
   2143      {1, 0, 1},       {0.5, 0.5, 1.0},
   2144      {1.0, 0.5, 0.5},  // Pastel colors for the very bad quality range.
   2145      {1.0, 1.0, 0.5}, {1, 1, 1},
   2146      {1, 1, 1},  // Last color repeated to have a solid range of white.
   2147  };
   2148  if (score < good_threshold) {
   2149    score = (score / good_threshold) * 0.3;
   2150  } else if (score < bad_threshold) {
   2151    score = 0.3 +
   2152            (score - good_threshold) / (bad_threshold - good_threshold) * 0.15;
   2153  } else {
   2154    score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5;
   2155  }
   2156  static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]);
   2157  score = std::min<double>(std::max<double>(score * (kTableSize - 1), 0.0),
   2158                           kTableSize - 2);
   2159  int ix = static_cast<int>(score);
   2160  ix = std::min(std::max(0, ix), kTableSize - 2);  // Handle NaN
   2161  double mix = score - ix;
   2162  for (int i = 0; i < 3; ++i) {
   2163    double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i];
   2164    rgb[i] = pow(v, 0.5);
   2165  }
   2166 }
   2167 
   2168 }  // namespace
   2169 
   2170 StatusOr<Image3F> CreateHeatMapImage(const ImageF& distmap,
   2171                                     double good_threshold,
   2172                                     double bad_threshold) {
   2173  JxlMemoryManager* memory_manager = distmap.memory_manager();
   2174  JXL_ASSIGN_OR_RETURN(
   2175      Image3F heatmap,
   2176      Image3F::Create(memory_manager, distmap.xsize(), distmap.ysize()));
   2177  for (size_t y = 0; y < distmap.ysize(); ++y) {
   2178    const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y);
   2179    float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y);
   2180    float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y);
   2181    float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y);
   2182    for (size_t x = 0; x < distmap.xsize(); ++x) {
   2183      const float d = row_distmap[x];
   2184      float rgb[3];
   2185      ScoreToRgb(d, good_threshold, bad_threshold, rgb);
   2186      row_h0[x] = rgb[0];
   2187      row_h1[x] = rgb[1];
   2188      row_h2[x] = rgb[2];
   2189    }
   2190  }
   2191  return heatmap;
   2192 }
   2193 
   2194 }  // namespace jxl
   2195 #endif  // HWY_ONCE