tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

render.cc (29715B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jpegli/render.h"
      7 
      8 #include <array>
      9 #include <cmath>
     10 #include <cstddef>
     11 #include <cstdint>
     12 #include <cstring>
     13 #include <vector>
     14 
     15 #include "lib/jpegli/color_quantize.h"
     16 #include "lib/jpegli/color_transform.h"
     17 #include "lib/jpegli/decode_internal.h"
     18 #include "lib/jpegli/error.h"
     19 #include "lib/jpegli/idct.h"
     20 #include "lib/jpegli/upsample.h"
     21 #include "lib/jxl/base/byte_order.h"
     22 #include "lib/jxl/base/compiler_specific.h"
     23 
     24 #ifdef MEMORY_SANITIZER
     25 #define JXL_MEMORY_SANITIZER 1
     26 #elif defined(__has_feature)
     27 #if __has_feature(memory_sanitizer)
     28 #define JXL_MEMORY_SANITIZER 1
     29 #else
     30 #define JXL_MEMORY_SANITIZER 0
     31 #endif
     32 #else
     33 #define JXL_MEMORY_SANITIZER 0
     34 #endif
     35 
     36 #if JXL_MEMORY_SANITIZER
     37 #include "sanitizer/msan_interface.h"
     38 #endif
     39 
     40 #undef HWY_TARGET_INCLUDE
     41 #define HWY_TARGET_INCLUDE "lib/jpegli/render.cc"
     42 #include <hwy/foreach_target.h>
     43 #include <hwy/highway.h>
     44 
     45 HWY_BEFORE_NAMESPACE();
     46 namespace jpegli {
     47 namespace HWY_NAMESPACE {
     48 
     49 // These templates are not found via ADL.
     50 using hwy::HWY_NAMESPACE::Abs;
     51 using hwy::HWY_NAMESPACE::Add;
     52 using hwy::HWY_NAMESPACE::Clamp;
     53 using hwy::HWY_NAMESPACE::Gt;
     54 using hwy::HWY_NAMESPACE::IfThenElseZero;
     55 using hwy::HWY_NAMESPACE::Mul;
     56 using hwy::HWY_NAMESPACE::NearestInt;
     57 using hwy::HWY_NAMESPACE::Or;
     58 using hwy::HWY_NAMESPACE::Rebind;
     59 using hwy::HWY_NAMESPACE::ShiftLeftSame;
     60 using hwy::HWY_NAMESPACE::ShiftRightSame;
     61 using hwy::HWY_NAMESPACE::Vec;
     62 using D = HWY_FULL(float);
     63 using DI = HWY_FULL(int32_t);
     64 constexpr D d;
     65 constexpr DI di;
     66 
     67 void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
     68                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
     69                      int32_t* JXL_RESTRICT sumabs) {
     70  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
     71    size_t k = i % DCTSIZE2;
     72    const Rebind<int16_t, DI> di16;
     73    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
     74    const auto abs_coeff = Abs(coeff);
     75    const auto not_0 = Gt(abs_coeff, Zero(di));
     76    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
     77    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
     78    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
     79  }
     80 }
     81 
     82 void DecenterRow(float* row, size_t xsize) {
     83  const HWY_CAPPED(float, 8) df;
     84  const auto c128 = Set(df, 128.0f / 255);
     85  for (size_t x = 0; x < xsize; x += Lanes(df)) {
     86    Store(Add(Load(df, row + x), c128), df, row + x);
     87  }
     88 }
     89 
     90 void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y,
     91               size_t xsize) {
     92  jpeg_decomp_master* m = cinfo->master;
     93  if (!m->dither_[c]) return;
     94  const float* dither_row =
     95      &m->dither_[c][(y & m->dither_mask_) * m->dither_size_];
     96  for (size_t x = 0; x < xsize; ++x) {
     97    row[x] += dither_row[x & m->dither_mask_];
     98  }
     99 }
    100 
    101 template <typename T>
    102 void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len,
    103                      size_t num_channels, float multiplier, T* output) {
    104  const HWY_CAPPED(float, 8) d;
    105  auto zero = Zero(d);
    106  auto mul = Set(d, multiplier);
    107  const Rebind<T, decltype(d)> du;
    108 #if JXL_MEMORY_SANITIZER
    109  const size_t padding = hwy::RoundUpTo(len, Lanes(d)) - len;
    110  for (size_t c = 0; c < num_channels; ++c) {
    111    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
    112  }
    113 #endif
    114  if (num_channels == 1) {
    115    for (size_t i = 0; i < len; i += Lanes(d)) {
    116      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
    117      StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
    118    }
    119  } else if (num_channels == 2) {
    120    for (size_t i = 0; i < len; i += Lanes(d)) {
    121      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
    122      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
    123      StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
    124                        DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
    125    }
    126  } else if (num_channels == 3) {
    127    for (size_t i = 0; i < len; i += Lanes(d)) {
    128      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
    129      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
    130      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
    131      StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
    132                        DemoteTo(du, NearestInt(v1)),
    133                        DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
    134    }
    135  } else if (num_channels == 4) {
    136    for (size_t i = 0; i < len; i += Lanes(d)) {
    137      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
    138      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
    139      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
    140      auto v3 = Clamp(zero, Mul(LoadU(d, &input[3][x0 + i]), mul), mul);
    141      StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
    142                        DemoteTo(du, NearestInt(v1)),
    143                        DemoteTo(du, NearestInt(v2)),
    144                        DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
    145    }
    146  }
    147 #if JXL_MEMORY_SANITIZER
    148  __msan_poison(output + num_channels * len,
    149                sizeof(output[0]) * num_channels * padding);
    150 #endif
    151 }
    152 
    153 void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
    154                   size_t num_channels, float* output) {
    155  const HWY_CAPPED(float, 8) d;
    156  if (num_channels == 1) {
    157    memcpy(output, input[0] + x0, len * sizeof(output[0]));
    158  } else if (num_channels == 2) {
    159    for (size_t i = 0; i < len; i += Lanes(d)) {
    160      StoreInterleaved2(LoadU(d, &input[0][x0 + i]),
    161                        LoadU(d, &input[1][x0 + i]), d, &output[2 * i]);
    162    }
    163  } else if (num_channels == 3) {
    164    for (size_t i = 0; i < len; i += Lanes(d)) {
    165      StoreInterleaved3(LoadU(d, &input[0][x0 + i]),
    166                        LoadU(d, &input[1][x0 + i]),
    167                        LoadU(d, &input[2][x0 + i]), d, &output[3 * i]);
    168    }
    169  } else if (num_channels == 4) {
    170    for (size_t i = 0; i < len; i += Lanes(d)) {
    171      StoreInterleaved4(LoadU(d, &input[0][x0 + i]),
    172                        LoadU(d, &input[1][x0 + i]),
    173                        LoadU(d, &input[2][x0 + i]),
    174                        LoadU(d, &input[3][x0 + i]), d, &output[4 * i]);
    175    }
    176  }
    177 }
    178 
    179 static constexpr float kFSWeightMR = 7.0f / 16.0f;
    180 static constexpr float kFSWeightBL = 3.0f / 16.0f;
    181 static constexpr float kFSWeightBM = 5.0f / 16.0f;
    182 static constexpr float kFSWeightBR = 1.0f / 16.0f;
    183 
    184 float LimitError(float error) {
    185  float abserror = std::abs(error);
    186  if (abserror > 48.0f) {
    187    abserror = 32.0f;
    188  } else if (abserror > 16.0f) {
    189    abserror = 0.5f * abserror + 8.0f;
    190  }
    191  return error > 0.0f ? abserror : -abserror;
    192 }
    193 
    194 void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
    195                   size_t xoffset, size_t len, size_t num_channels,
    196                   uint8_t* JXL_RESTRICT output) {
    197  jpeg_decomp_master* m = cinfo->master;
    198  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
    199  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
    200    float* error_row[kMaxComponents];
    201    float* next_error_row[kMaxComponents];
    202    J_DITHER_MODE dither_mode = cinfo->dither_mode;
    203    if (dither_mode == JDITHER_ORDERED) {
    204      for (size_t c = 0; c < num_channels; ++c) {
    205        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
    206                  cinfo->output_width);
    207      }
    208    } else if (dither_mode == JDITHER_FS) {
    209      for (size_t c = 0; c < num_channels; ++c) {
    210        if (cinfo->output_scanline % 2 == 0) {
    211          error_row[c] = m->error_row_[c];
    212          next_error_row[c] = m->error_row_[c + kMaxComponents];
    213        } else {
    214          error_row[c] = m->error_row_[c + kMaxComponents];
    215          next_error_row[c] = m->error_row_[c];
    216        }
    217        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
    218      }
    219    }
    220    const float mul = 255.0f;
    221    if (dither_mode != JDITHER_FS) {
    222      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
    223    }
    224    for (size_t i = 0; i < len; ++i) {
    225      uint8_t* pixel = &scratch_space[num_channels * i];
    226      if (dither_mode == JDITHER_FS) {
    227        for (size_t c = 0; c < num_channels; ++c) {
    228          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
    229          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
    230        }
    231      }
    232      int index = LookupColorIndex(cinfo, pixel);
    233      output[i] = index;
    234      if (dither_mode == JDITHER_FS) {
    235        size_t prev_i = i > 0 ? i - 1 : 0;
    236        size_t next_i = i + 1 < len ? i + 1 : len - 1;
    237        for (size_t c = 0; c < num_channels; ++c) {
    238          float error = pixel[c] - cinfo->colormap[c][index];
    239          error_row[c][next_i] += kFSWeightMR * error;
    240          next_error_row[c][prev_i] += kFSWeightBL * error;
    241          next_error_row[c][i] += kFSWeightBM * error;
    242          next_error_row[c][next_i] += kFSWeightBR * error;
    243        }
    244      }
    245    }
    246  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
    247    const float mul = 255.0;
    248    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
    249    memcpy(output, scratch_space, len * num_channels);
    250  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
    251    const float mul = 65535.0;
    252    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
    253    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
    254    if (m->swap_endianness_) {
    255      const HWY_CAPPED(uint16_t, 8) du;
    256      size_t output_len = len * num_channels;
    257      for (size_t j = 0; j < output_len; j += Lanes(du)) {
    258        auto v = LoadU(du, tmp + j);
    259        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
    260        StoreU(vswap, du, tmp + j);
    261      }
    262    }
    263    memcpy(output, tmp, len * num_channels * 2);
    264  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
    265    float* tmp = reinterpret_cast<float*>(scratch_space);
    266    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
    267    if (m->swap_endianness_) {
    268      size_t output_len = len * num_channels;
    269      for (size_t j = 0; j < output_len; ++j) {
    270        tmp[j] = BSwapFloat(tmp[j]);
    271      }
    272    }
    273    memcpy(output, tmp, len * num_channels * 4);
    274  }
    275 }
    276 
    277 // NOLINTNEXTLINE(google-readability-namespace-comments)
    278 }  // namespace HWY_NAMESPACE
    279 }  // namespace jpegli
    280 HWY_AFTER_NAMESPACE();
    281 
    282 #if HWY_ONCE
    283 
    284 namespace jpegli {
    285 
    286 HWY_EXPORT(GatherBlockStats);
    287 HWY_EXPORT(WriteToOutput);
    288 HWY_EXPORT(DecenterRow);
    289 
    290 void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
    291                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
    292                      int32_t* JXL_RESTRICT sumabs) {
    293  HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros, sumabs);
    294 }
    295 
    296 void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
    297                   size_t xoffset, size_t len, size_t num_channels,
    298                   uint8_t* JXL_RESTRICT output) {
    299  HWY_DYNAMIC_DISPATCH(WriteToOutput)
    300  (cinfo, rows, xoffset, len, num_channels, output);
    301 }
    302 
    303 void DecenterRow(float* row, size_t xsize) {
    304  HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
    305 }
    306 
    307 bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) {
    308  const auto& compinfo = cinfo->comp_info[ci];
    309  return (compinfo.h_samp_factor == cinfo->max_h_samp_factor &&
    310          compinfo.v_samp_factor == cinfo->max_v_samp_factor);
    311 }
    312 
    313 // See the following article for the details:
    314 // J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression"
    315 // Proceedings International Conference on Information Technology: Coding and
    316 // Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179.
    317 void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros,
    318                                   const int* sumabs, float* biases) {
    319  for (size_t k = 1; k < DCTSIZE2; ++k) {
    320    if (nonzeros[k] == 0) {
    321      biases[k] = 0.5f;
    322      continue;
    323    }
    324    // Notation adapted from the article
    325    float N = num_blocks;
    326    float N1 = nonzeros[k];
    327    float N0 = num_blocks - N1;
    328    float S = sumabs[k];
    329    // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just
    330    // temporary grouping of terms.
    331    float A = 4.0 * S + 2.0 * N;
    332    float B = 4.0 * S - 2.0 * N1;
    333    float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A;
    334    float gamma2 = gamma * gamma;
    335    // The bias is computed from gamma with (eq. 5), where the quantization
    336    // multiplier Q can be factored out and thus the bias can be applied
    337    // directly on the quantized coefficient.
    338    biases[k] =
    339        0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma));
    340  }
    341 }
    342 
    343 constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8,  16, 9,
    344                                                2, 3, 10, 17, 24};
    345 
    346 bool is_nonzero_quantizers(const JQUANT_TBL* qtable) {
    347  return std::all_of(Q_POS.begin(), Q_POS.end(),
    348                     [&](int pos) { return qtable->quantval[pos] != 0; });
    349 }
    350 
    351 // Determine whether smoothing should be applied during decompression
    352 bool do_smoothing(j_decompress_ptr cinfo) {
    353  jpeg_decomp_master* m = cinfo->master;
    354  bool smoothing_useful = false;
    355 
    356  if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) {
    357    return false;
    358  }
    359  auto* coef_bits_latch = m->coef_bits_latch;
    360  auto* prev_coef_bits_latch = m->prev_coef_bits_latch;
    361 
    362  for (int ci = 0; ci < cinfo->num_components; ci++) {
    363    jpeg_component_info* compptr = &cinfo->comp_info[ci];
    364    JQUANT_TBL* qtable = compptr->quant_table;
    365    int* coef_bits = cinfo->coef_bits[ci];
    366    int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
    367 
    368    // Return early if conditions for smoothing are not met
    369    if (qtable == nullptr || !is_nonzero_quantizers(qtable) ||
    370        coef_bits[0] < 0) {
    371      return false;
    372    }
    373 
    374    coef_bits_latch[ci][0] = coef_bits[0];
    375 
    376    for (int coefi = 1; coefi < SAVED_COEFS; coefi++) {
    377      prev_coef_bits_latch[ci][coefi] =
    378          cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1;
    379      if (coef_bits[coefi] != 0) {
    380        smoothing_useful = true;
    381      }
    382      coef_bits_latch[ci][coefi] = coef_bits[coefi];
    383    }
    384  }
    385 
    386  return smoothing_useful;
    387 }
    388 
    389 void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component,
    390                   size_t bx, int iy) {
    391  const size_t imcu_row = cinfo->output_iMCU_row;
    392  int16_t* scratch = cinfo->master->smoothing_scratch_;
    393  std::vector<int> Q_VAL(SAVED_COEFS);
    394  int* coef_bits;
    395 
    396  std::array<std::array<int, 5>, 5> dc_values;
    397  auto& compinfo = cinfo->comp_info[component];
    398  const size_t by0 = imcu_row * compinfo.v_samp_factor;
    399  const size_t by = by0 + iy;
    400 
    401  int prev_iy = by > 0 ? iy - 1 : 0;
    402  int prev_prev_iy = by > 1 ? iy - 2 : prev_iy;
    403  int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy;
    404  int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy;
    405 
    406  const int16_t* cur_row = blocks[iy][bx];
    407  const int16_t* prev_row = blocks[prev_iy][bx];
    408  const int16_t* prev_prev_row = blocks[prev_prev_iy][bx];
    409  const int16_t* next_row = blocks[next_iy][bx];
    410  const int16_t* next_next_row = blocks[next_next_iy][bx];
    411 
    412  int prev_block_ind = bx ? -DCTSIZE2 : 0;
    413  int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind;
    414  int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0;
    415  int next_next_block_ind =
    416      bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind;
    417 
    418  std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row,
    419                                            next_row, next_next_row};
    420  std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0,
    421                                   next_block_ind, next_next_block_ind};
    422 
    423  memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0]));
    424 
    425  for (int r = 0; r < 5; ++r) {
    426    for (int c = 0; c < 5; ++c) {
    427      dc_values[r][c] = row_ptrs[r][block_inds[c]];
    428    }
    429  }
    430  // Get the correct coef_bits: In case of an incomplete scan, we use the
    431  // prev coefficients.
    432  if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) {
    433    coef_bits = cinfo->master->prev_coef_bits_latch[component];
    434  } else {
    435    coef_bits = cinfo->master->coef_bits_latch[component];
    436  }
    437 
    438  bool change_dc = true;
    439  for (int i = 1; i < SAVED_COEFS; i++) {
    440    if (coef_bits[i] != -1) {
    441      change_dc = false;
    442      break;
    443    }
    444  }
    445 
    446  JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no];
    447  for (size_t i = 0; i < 6; ++i) {
    448    Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
    449  }
    450  if (change_dc) {
    451    for (size_t i = 6; i < SAVED_COEFS; ++i) {
    452      Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
    453    }
    454  }
    455  auto calculate_dct_value = [&](int coef_index) {
    456    int64_t num = 0;
    457    int pred;
    458    int Al;
    459    // we use the symmetry of the smoothing matrices by transposing the 5x5 dc
    460    // matrix in that case.
    461    bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 ||
    462                        coef_index == 9;
    463    auto dc = [&](int i, int j) {
    464      return swap_indices ? dc_values[j][i] : dc_values[i][j];
    465    };
    466    JPEGLI_CHECK(coef_index >= 0 && coef_index < 10);
    467    Al = coef_bits[coef_index];
    468    switch (coef_index) {
    469      case 0:
    470        // set the DC
    471        num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) -
    472               2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) +
    473               6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) +
    474               152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) +
    475               6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) -
    476               2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) -
    477               2 * dc(4, 4));
    478        // special case: for the DC the dequantization is different
    479        Al = 0;
    480        break;
    481      case 1:
    482      case 2:
    483        // set Q01 or Q10
    484        num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) -
    485                            3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) +
    486                            3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) -
    487                            38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) +
    488                            13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) -
    489                            dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4))
    490                         : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) +
    491                            7 * dc(2, 4)));
    492        break;
    493      case 3:
    494      case 5:
    495        // set Q02 or Q20
    496        num = (change_dc
    497                   ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) -
    498                         5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) +
    499                         2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2)
    500                   : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) +
    501                      13 * dc(3, 2) - dc(4, 2)));
    502        break;
    503      case 4:
    504        // set Q11
    505        num =
    506            (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) -
    507                             9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4)
    508                       : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) -
    509                          dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) -
    510                          dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3)));
    511        break;
    512      case 6:
    513      case 9:
    514        // set Q03 or Q30
    515        num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) -
    516               dc(3, 3));
    517        break;
    518      case 7:
    519      case 8:
    520      default:
    521        // set Q12 and Q21
    522        num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) -
    523               dc(3, 3));
    524        break;
    525    }
    526    num = Q_VAL[0] * num;
    527    if (num >= 0) {
    528      pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8);
    529      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
    530    } else {
    531      pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8);
    532      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
    533      pred = -pred;
    534    }
    535    return static_cast<int16_t>(pred);
    536  };
    537 
    538  int loop_end = change_dc ? SAVED_COEFS : 6;
    539  for (int i = 1; i < loop_end; ++i) {
    540    if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) {
    541      scratch[Q_POS[i]] = calculate_dct_value(i);
    542    }
    543  }
    544  if (change_dc) {
    545    scratch[0] = calculate_dct_value(0);
    546  }
    547 }
    548 
    549 void PrepareForOutput(j_decompress_ptr cinfo) {
    550  jpeg_decomp_master* m = cinfo->master;
    551  bool smoothing = do_smoothing(cinfo);
    552  m->apply_smoothing = smoothing && FROM_JXL_BOOL(cinfo->do_block_smoothing);
    553  size_t coeffs_per_block = cinfo->num_components * DCTSIZE2;
    554  memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0]));
    555  memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0]));
    556  memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_));
    557  memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0]));
    558  cinfo->output_iMCU_row = 0;
    559  cinfo->output_scanline = 0;
    560  const float kDequantScale = 1.0f / (8 * 255);
    561  for (int c = 0; c < cinfo->num_components; c++) {
    562    const auto& comp = cinfo->comp_info[c];
    563    JQUANT_TBL* table = comp.quant_table;
    564    if (table == nullptr) continue;
    565    for (size_t k = 0; k < DCTSIZE2; ++k) {
    566      m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale;
    567    }
    568  }
    569  JPEGLI_CHECK(ChooseInverseTransform(cinfo));
    570  ChooseColorTransform(cinfo);
    571 }
    572 
    573 void DecodeCurrentiMCURow(j_decompress_ptr cinfo) {
    574  jpeg_decomp_master* m = cinfo->master;
    575  const size_t imcu_row = cinfo->output_iMCU_row;
    576  JBLOCKARRAY blocks[kMaxComponents];
    577  for (int c = 0; c < cinfo->num_components; ++c) {
    578    const jpeg_component_info* comp = &cinfo->comp_info[c];
    579    int by0 = imcu_row * comp->v_samp_factor;
    580    int block_rows_left = comp->height_in_blocks - by0;
    581    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
    582    int offset = m->streaming_mode_ ? 0 : by0;
    583    blocks[c] = (*cinfo->mem->access_virt_barray)(
    584        reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
    585        max_block_rows, FALSE);
    586  }
    587  for (int c = 0; c < cinfo->num_components; ++c) {
    588    size_t k0 = c * DCTSIZE2;
    589    auto& compinfo = cinfo->comp_info[c];
    590    size_t block_row = imcu_row * compinfo.v_samp_factor;
    591    if (ShouldApplyDequantBiases(cinfo, c)) {
    592      // Update statistics for this iMCU row.
    593      for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
    594        size_t by = block_row + iy;
    595        if (by >= compinfo.height_in_blocks) {
    596          continue;
    597        }
    598        int16_t* JXL_RESTRICT coeffs = &blocks[c][iy][0][0];
    599        size_t num = compinfo.width_in_blocks * DCTSIZE2;
    600        GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]);
    601        m->num_processed_blocks_[c] += compinfo.width_in_blocks;
    602      }
    603      if (imcu_row % 4 == 3) {
    604        // Re-compute optimal biases every few iMCU-rows.
    605        ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c],
    606                                      &m->nonzeros_[k0], &m->sumabs_[k0],
    607                                      &m->biases_[k0]);
    608      }
    609    }
    610    RowBuffer<float>* raw_out = &m->raw_output_[c];
    611    for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
    612      size_t by = block_row + iy;
    613      if (by >= compinfo.height_in_blocks) {
    614        continue;
    615      }
    616      size_t dctsize = m->scaled_dct_size[c];
    617      int16_t* JXL_RESTRICT row_in = &blocks[c][iy][0][0];
    618      float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize);
    619      for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) {
    620        if (m->apply_smoothing) {
    621          PredictSmooth(cinfo, blocks[c], c, bx, iy);
    622          (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0],
    623                                     &m->biases_[k0], m->idct_scratch_,
    624                                     &row_out[bx * dctsize], raw_out->stride(),
    625                                     dctsize);
    626        } else {
    627          (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0],
    628                                     &m->biases_[k0], m->idct_scratch_,
    629                                     &row_out[bx * dctsize], raw_out->stride(),
    630                                     dctsize);
    631        }
    632      }
    633      if (m->streaming_mode_) {
    634        memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK));
    635      }
    636    }
    637  }
    638 }
    639 
    640 void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) {
    641  jpegli::DecodeCurrentiMCURow(cinfo);
    642  jpeg_decomp_master* m = cinfo->master;
    643  for (int c = 0; c < cinfo->num_components; ++c) {
    644    const auto& compinfo = cinfo->comp_info[c];
    645    size_t comp_width = compinfo.width_in_blocks * DCTSIZE;
    646    size_t comp_height = compinfo.height_in_blocks * DCTSIZE;
    647    size_t comp_nrows = compinfo.v_samp_factor * DCTSIZE;
    648    size_t y0 = cinfo->output_iMCU_row * compinfo.v_samp_factor * DCTSIZE;
    649    size_t y1 = std::min(y0 + comp_nrows, comp_height);
    650    for (size_t y = y0; y < y1; ++y) {
    651      float* rows[1] = {m->raw_output_[c].Row(y)};
    652      uint8_t* output = data[c][y - y0];
    653      DecenterRow(rows[0], comp_width);
    654      WriteToOutput(cinfo, rows, 0, comp_width, 1, output);
    655    }
    656  }
    657  ++cinfo->output_iMCU_row;
    658  cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE;
    659  if (cinfo->output_scanline >= cinfo->output_height) {
    660    ++m->output_passes_done_;
    661  }
    662 }
    663 
    664 void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
    665                   JSAMPARRAY scanlines, size_t max_output_rows) {
    666  jpeg_decomp_master* m = cinfo->master;
    667  const int vfactor = cinfo->max_v_samp_factor;
    668  const int hfactor = cinfo->max_h_samp_factor;
    669  const size_t context = m->need_context_rows_ ? 1 : 0;
    670  const size_t imcu_row = cinfo->output_iMCU_row;
    671  const size_t imcu_height = vfactor * m->min_scaled_dct_size;
    672  const size_t imcu_width = hfactor * m->min_scaled_dct_size;
    673  const size_t output_width = m->iMCU_cols_ * imcu_width;
    674  if (imcu_row == cinfo->total_iMCU_rows ||
    675      (imcu_row > context &&
    676       cinfo->output_scanline < (imcu_row - context) * imcu_height)) {
    677    // We are ready to output some scanlines.
    678    size_t ybegin = cinfo->output_scanline;
    679    size_t yend = (imcu_row == cinfo->total_iMCU_rows
    680                       ? cinfo->output_height
    681                       : (imcu_row - context) * imcu_height);
    682    yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows);
    683    size_t yb = (ybegin / vfactor) * vfactor;
    684    size_t ye = DivCeil(yend, vfactor) * vfactor;
    685    for (size_t y = yb; y < ye; y += vfactor) {
    686      for (int c = 0; c < cinfo->num_components; ++c) {
    687        RowBuffer<float>* raw_out = &m->raw_output_[c];
    688        RowBuffer<float>* render_out = &m->render_output_[c];
    689        int line_groups = vfactor / m->v_factor[c];
    690        int downsampled_width = output_width / m->h_factor[c];
    691        size_t yc = y / m->v_factor[c];
    692        for (int dy = 0; dy < line_groups; ++dy) {
    693          size_t ymid = yc + dy;
    694          const float* JXL_RESTRICT row_mid = raw_out->Row(ymid);
    695          if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
    696            const float* JXL_RESTRICT row_top =
    697                ymid == 0 ? row_mid : raw_out->Row(ymid - 1);
    698            const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c]
    699                                                    ? row_mid
    700                                                    : raw_out->Row(ymid + 1);
    701            Upsample2Vertical(row_top, row_mid, row_bot,
    702                              render_out->Row(2 * dy),
    703                              render_out->Row(2 * dy + 1), downsampled_width);
    704          } else {
    705            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
    706              memcpy(render_out->Row(m->v_factor[c] * dy + yix), row_mid,
    707                     downsampled_width * sizeof(float));
    708            }
    709          }
    710          if (m->h_factor[c] > 1) {
    711            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
    712              int row_ix = m->v_factor[c] * dy + yix;
    713              float* JXL_RESTRICT row = render_out->Row(row_ix);
    714              float* JXL_RESTRICT tmp = m->upsample_scratch_;
    715              if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) {
    716                Upsample2Horizontal(row, tmp, output_width);
    717              } else {
    718                // TODO(szabadka) SIMDify this.
    719                for (size_t x = 0; x < output_width; ++x) {
    720                  tmp[x] = row[x / m->h_factor[c]];
    721                }
    722                memcpy(row, tmp, output_width * sizeof(tmp[0]));
    723              }
    724            }
    725          }
    726        }
    727      }
    728      for (int yix = 0; yix < vfactor; ++yix) {
    729        if (y + yix < ybegin || y + yix >= yend) continue;
    730        float* rows[kMaxComponents];
    731        int num_all_components =
    732            std::max(cinfo->out_color_components, cinfo->num_components);
    733        for (int c = 0; c < num_all_components; ++c) {
    734          rows[c] = m->render_output_[c].Row(yix);
    735        }
    736        (*m->color_transform)(rows, output_width);
    737        for (int c = 0; c < cinfo->out_color_components; ++c) {
    738          // Undo the centering of the sample values around zero.
    739          DecenterRow(rows[c], output_width);
    740        }
    741        if (scanlines) {
    742          uint8_t* output = scanlines[*num_output_rows];
    743          WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width,
    744                        cinfo->out_color_components, output);
    745        }
    746        JPEGLI_CHECK(cinfo->output_scanline == y + yix);
    747        ++cinfo->output_scanline;
    748        ++(*num_output_rows);
    749        if (cinfo->output_scanline == cinfo->output_height) {
    750          ++m->output_passes_done_;
    751        }
    752      }
    753    }
    754  } else {
    755    DecodeCurrentiMCURow(cinfo);
    756    ++cinfo->output_iMCU_row;
    757  }
    758 }
    759 
    760 }  // namespace jpegli
    761 #endif  // HWY_ONCE