tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

enc_group.cc (21347B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/enc_group.h"
      7 
      8 #include <jxl/memory_manager.h>
      9 
     10 #include "lib/jxl/base/status.h"
     11 #include "lib/jxl/memory_manager_internal.h"
     12 
     13 #undef HWY_TARGET_INCLUDE
     14 #define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
     15 #include <hwy/foreach_target.h>
     16 #include <hwy/highway.h>
     17 
     18 #include "lib/jxl/ac_strategy.h"
     19 #include "lib/jxl/base/bits.h"
     20 #include "lib/jxl/base/compiler_specific.h"
     21 #include "lib/jxl/base/rect.h"
     22 #include "lib/jxl/common.h"  // kMaxNumPasses
     23 #include "lib/jxl/dct_util.h"
     24 #include "lib/jxl/dec_transforms-inl.h"
     25 #include "lib/jxl/enc_aux_out.h"
     26 #include "lib/jxl/enc_cache.h"
     27 #include "lib/jxl/enc_params.h"
     28 #include "lib/jxl/enc_transforms-inl.h"
     29 #include "lib/jxl/image.h"
     30 #include "lib/jxl/quantizer-inl.h"
     31 #include "lib/jxl/quantizer.h"
     32 #include "lib/jxl/simd_util.h"
     33 HWY_BEFORE_NAMESPACE();
     34 namespace jxl {
     35 namespace HWY_NAMESPACE {
     36 
     37 // These templates are not found via ADL.
     38 using hwy::HWY_NAMESPACE::Abs;
     39 using hwy::HWY_NAMESPACE::Ge;
     40 using hwy::HWY_NAMESPACE::IfThenElse;
     41 using hwy::HWY_NAMESPACE::IfThenElseZero;
     42 using hwy::HWY_NAMESPACE::MaskFromVec;
     43 using hwy::HWY_NAMESPACE::Round;
     44 
// NOTE: caller takes care of extracting quant from rect of RawQuantField.
//
// Quantizes one AC block (xsize x ysize 8x8 sub-blocks) of channel `c`:
// block_out[i] = round(block_in[i] * qm[i] * qac * qm_multiplier), with a
// per-quadrant dead zone — coefficients whose scaled magnitude is below the
// matching `thresholds` entry are forced to zero.
// `thresholds` holds 4 dead-zone limits indexed by (row-half * 2 +
// column-half) of the block; this function may lower them in place.
// NOTE(review): `error_diffusion` is accepted but never read in this body —
// presumably kept for interface symmetry; confirm against callers.
void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
                     size_t c, float qm_multiplier, AcStrategyType quant_kind,
                     size_t xsize, size_t ysize, float* thresholds,
                     const float* JXL_RESTRICT block_in, const int32_t* quant,
                     int32_t* JXL_RESTRICT block_out) {
  // Inverse dequantization matrix: multiplying by it performs quantization.
  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
  // Global quantizer scale times this block's quant index.
  float qac = quantizer.Scale() * (*quant);
  // Not SIMD-ified for now.
  // For chroma (c != 1) on blocks of at least 4 sub-blocks, shrink the dead
  // zone proportionally to block area, but never below 0.5 (plain rounding).
  if (c != 1 && xsize * ysize >= 4) {
    for (int i = 0; i < 4; ++i) {
      thresholds[i] -= 0.00744f * xsize * ysize;
      if (thresholds[i] < 0.5) {
        thresholds[i] = 0.5;
      }
    }
  }
  HWY_CAPPED(float, kBlockDim) df;
  HWY_CAPPED(int32_t, kBlockDim) di;
  HWY_CAPPED(uint32_t, kBlockDim) du;
  const auto quantv = Set(df, qac * qm_multiplier);
  for (size_t y = 0; y < ysize * kBlockDim; y++) {
    // 0 for the top half of the block, 2 for the bottom half.
    size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
    const size_t off = y * kBlockDim * xsize;
    for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
      auto threshold = Zero(df);
      if (xsize == 1) {
        // A single 8-wide row spans both column halves, so pick the
        // threshold per lane: columns 0-3 use thresholds[yfix], columns
        // 4-7 use thresholds[yfix + 1].
        HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
        const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
        threshold = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
                               Set(df, thresholds[yfix]));
      } else {
        // Same for all lanes in the vector.
        threshold = Set(
            df,
            thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
      }
      const auto q = Mul(Load(df, qm + off + x), quantv);
      const auto in = Load(df, block_in + off + x);
      const auto val = Mul(q, in);
      // Dead zone: values below the threshold quantize to zero.
      const auto nzero_mask = Ge(Abs(val), threshold);
      const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
      Store(v, di, block_out + off + x);
    }
  }
}
     91 
// Simulates quantization of one block and, based on the resulting error
// statistics, adjusts the per-block quant index `*quant` and the dead-zone
// `thresholds` in place before the real quantization runs. `block_in` is
// only read. Heuristics target 8x8 blockiness, sparse/flat blocks, and
// high-frequency patterns without medium-frequency masking.
void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c,
                        float qm_multiplier, AcStrategyType quant_kind,
                        size_t xsize, size_t ysize, float* thresholds,
                        const float* JXL_RESTRICT block_in, int32_t* quant) {
  // No quantization adjusting for these small blocks.
  // Quantization adjusting attempts to fix some known issues
  // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness
  // when there are not many non-zeros.
  constexpr size_t kPartialBlockKinds =
      (1 << static_cast<size_t>(AcStrategyType::IDENTITY)) |
      (1 << static_cast<size_t>(AcStrategyType::DCT2X2)) |
      (1 << static_cast<size_t>(AcStrategyType::DCT4X4)) |
      (1 << static_cast<size_t>(AcStrategyType::DCT4X8)) |
      (1 << static_cast<size_t>(AcStrategyType::DCT8X4)) |
      (1 << static_cast<size_t>(AcStrategyType::AFV0)) |
      (1 << static_cast<size_t>(AcStrategyType::AFV1)) |
      (1 << static_cast<size_t>(AcStrategyType::AFV2)) |
      (1 << static_cast<size_t>(AcStrategyType::AFV3));
  // Bitmask membership test for the strategy kinds listed above.
  if ((1 << static_cast<size_t>(quant_kind)) & kPartialBlockKinds) {
    return;
  }

  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
  float qac = quantizer.Scale() * (*quant);
  // Shrink the dead zone slightly with block area (capped at 0.08), with a
  // floor of 0.54.
  if (xsize > 1 || ysize > 1) {
    for (int i = 0; i < 4; ++i) {
      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
      if (thresholds[i] < 0.54) {
        thresholds[i] = 0.54;
      }
    }
  }
  // Statistics gathered from the simulated quantization below.
  float sum_of_highest_freq_row_and_column = 0;
  float sum_of_error = 0;
  float sum_of_vals = 0;
  float hfNonZeros[4] = {};  // per-quadrant sum of |quantized value|
  float hfMaxError[4] = {};  // per-quadrant max error of zeroed Y coeffs

  for (size_t y = 0; y < ysize * kBlockDim; y++) {
    for (size_t x = 0; x < xsize * kBlockDim; x++) {
      const size_t pos = y * kBlockDim * xsize + x;
      // Skip the xsize*ysize lowest-frequency coefficients.
      if (x < xsize && y < ysize) {
        continue;
      }
      // Quadrant index: row-half * 2 + column-half.
      const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
                           static_cast<size_t>(x >= xsize * kBlockDim / 2));
      // Simulate what QuantizeBlockAC would produce for this coefficient.
      const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
      const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
      const float error = std::abs(val - v);
      sum_of_error += error;
      sum_of_vals += std::abs(v);
      // For Y, remember the worst error among coefficients zeroed by the
      // dead zone in each quadrant.
      if (c == 1 && v == 0) {
        if (hfMaxError[hfix] < error) {
          hfMaxError[hfix] = error;
        }
      }
      if (v != 0.0f) {
        hfNonZeros[hfix] += std::abs(v);
        // Accumulate energy in the highest-frequency corner/border region.
        bool in_corner = y >= 7 * ysize && x >= 7 * xsize;
        bool on_border =
            y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1;
        bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize;
        if (in_corner || (on_border && in_larger_corner)) {
          sum_of_highest_freq_row_and_column += std::abs(val);
        }
      }
    }
  }
  // Sparse Y block: if any upper-frequency quadrant quantizes to all zeros
  // while its max dead-zoned error exceeds kLimit, bump quant by one and
  // re-tune the affected thresholds to just below that error (rescaled by
  // the quant change) so the zeroing stays consistent.
  if (c == 1 && sum_of_vals * 8 < xsize * ysize) {
    static const double kLimit[4] = {
        0.46,
        0.46,
        0.46,
        0.46,
    };
    static const double kMul[4] = {
        0.9999,
        0.9999,
        0.9999,
        0.9999,
    };
    const int32_t orig_quant = *quant;
    int32_t new_quant = *quant;
    for (int i = 1; i < 4; ++i) {
      if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) {
        new_quant = orig_quant + 1;
        break;
      }
    }
    *quant = new_quant;
    if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) {
      thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant;
    } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) ||
               (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) {
      thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) *
                      new_quant / orig_quant;
      thresholds[2] = thresholds[1];
    } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) {
      thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant;
    }
  }
  // Heuristic for improving accuracy of high-frequency patterns
  // occurring in an environment with no medium-frequency masking
  // patterns.
  {
    float all =
        hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1;
    float mul[3] = {70, 30, 60};  // per-channel (X, Y, B) weight
    if (mul[c] * sum_of_highest_freq_row_and_column >= all) {
      *quant += mul[c] * sum_of_highest_freq_row_and_column / all;
      if (*quant >= Quantizer::kQuantMax) {
        *quant = Quantizer::kQuantMax - 1;
      }
    }
  }
  if (quant_kind == AcStrategyType::DCT) {
    // If this 8x8 block is too flat, increase the adaptive quantization level
    // a bit to reduce visible block boundaries and requantize the block.
    if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
      *quant += 1;
      if (*quant >= Quantizer::kQuantMax) {
        *quant = Quantizer::kQuantMax - 1;
      }
    }
  }
  // For large DCTs, raise quant by up to 2 steps when the accumulated
  // quantization error is high relative to the coefficient magnitudes.
  // kMul1/kMul2 are per-size (rows) and per-channel (columns) weights.
  {
    static const double kMul1[4][3] = {
        {
            0.22080615753848404,
            0.45797479824262011,
            0.29859235095977965,
        },
        {
            0.70109486510286834,
            0.16185281305512639,
            0.14387691730035473,
        },
        {
            0.114985964456218638,
            0.44656840441027695,
            0.10587658215149048,
        },
        {
            0.46849665264409396,
            0.41239077937781954,
            0.088667407767185444,
        },
    };
    static const double kMul2[4][3] = {
        {
            0.27450281941822197,
            1.1255766549984996,
            0.98950459134128388,
        },
        {
            0.4652168675598285,
            0.40945807983455818,
            0.36581899811751367,
        },
        {
            0.28034972424715715,
            0.9182653201929738,
            1.5581531543057416,
        },
        {
            0.26873118114033728,
            0.68863712390392484,
            1.2082185408666786,
        },
    };
    static const double kQuantNormalizer = 2.2942708343284721;
    sum_of_error *= kQuantNormalizer;
    sum_of_vals *= kQuantNormalizer;
    if (quant_kind >= AcStrategyType::DCT16X16) {
      // Select the weight row for this transform size (3 = other large DCTs).
      int ix = 3;
      if (quant_kind == AcStrategyType::DCT32X16 ||
          quant_kind == AcStrategyType::DCT16X32) {
        ix = 1;
      } else if (quant_kind == AcStrategyType::DCT16X16) {
        ix = 0;
      } else if (quant_kind == AcStrategyType::DCT32X32) {
        ix = 2;
      }
      int step =
          sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
                          kMul2[ix][c] * sum_of_vals);
      if (step >= 2) {
        step = 2;
      }
      if (step < 0) {
        step = 0;
      }
      if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
                             kMul2[ix][c] * sum_of_vals) {
        *quant += step;
        if (*quant >= Quantizer::kQuantMax) {
          *quant = Quantizer::kQuantMax - 1;
        }
      }
    }
  }
  {
    // Reduce quant in highly active areas.
    // "Activity" is the smallest per-quadrant rounded average of non-zero
    // magnitudes; it lowers quant (down to a floor of max(4, quant/2)).
    int32_t div = (xsize * ysize);
    int32_t activity = (static_cast<int32_t>(hfNonZeros[0]) + div / 2) / div;
    int32_t orig_qp_limit = std::max(4, *quant / 2);
    for (int i = 1; i < 4; ++i) {
      activity = std::min(
          activity, (static_cast<int32_t>(hfNonZeros[i]) + div / 2) / div);
    }
    if (activity >= 15) {
      activity = 15;
    }
    int32_t qp = *quant - activity;
    // For Y, widen the upper-frequency dead zones with activity.
    if (c == 1) {
      for (int i = 1; i < 4; ++i) {
        thresholds[i] += 0.01 * activity;
      }
    }
    if (qp < orig_qp_limit) {
      qp = orig_qp_limit;
    }
    *quant = qp;
  }
}
    317 
// NOTE: caller takes care of extracting quant from rect of RawQuantField.
//
// Quantizes the Y channel of one block and immediately dequantizes it back
// into `inout + size`, so that chroma can later be decorrelated against the
// *reconstructed* Y. On slower speed tiers (<= kHare) the block's quant is
// first tuned per channel via AdjustQuantBlockAC (keeping the maximum over
// X, Y, B); otherwise fixed dead-zone thresholds are used. The integer Y
// coefficients land in `quantized + size`; `*quant` is updated in place.
void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
                               const Quantizer& quantizer,
                               const bool error_diffusion,
                               AcStrategyType quant_kind, size_t xsize,
                               size_t ysize, const float* JXL_RESTRICT biases,
                               int32_t* quant, float* JXL_RESTRICT inout,
                               int32_t* JXL_RESTRICT quantized) {
  float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};
  if (enc_state->cparams.speed_tier <= SpeedTier::kHare) {
    int32_t max_quant = 0;
    int quant_orig = *quant;
    // Per-channel quant-matrix multipliers, indexed by c (X, Y, B).
    float val[3] = {enc_state->x_qm_multiplier, 1.0f,
                    enc_state->b_qm_multiplier};
    // Each channel starts from the same original quant; keep the largest
    // adjusted value.
    for (int c : {1, 0, 2}) {
      float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
      *quant = quant_orig;
      AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
                         &thres[0], inout + c * size, quant);
      // Dead zone adjustment
      // Keep the Y channel's tuned thresholds for the quantization below.
      if (c == 1) {
        for (int k = 0; k < 4; ++k) {
          thres_y[k] = thres[k];
        }
      }
      max_quant = std::max(*quant, max_quant);
    }
    *quant = max_quant;
  } else {
    // Fast path: fixed, slightly tighter dead zones.
    thres_y[0] = 0.56;
    thres_y[1] = 0.62;
    thres_y[2] = 0.62;
    thres_y[3] = 0.62;
  }

  // Quantize Y (c == 1, unit qm multiplier).
  QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
                  &thres_y[0], inout + size, quant, quantized + size);

  const float* JXL_RESTRICT dequant_matrix =
      quantizer.DequantMatrix(quant_kind, 1);

  HWY_CAPPED(float, kDCTBlockSize) df;
  HWY_CAPPED(int32_t, kDCTBlockSize) di;
  const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
  // Dequantize back into `inout` with bias correction, producing the
  // reconstructed Y values the decoder will see.
  for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
    // Note: this local `quant` intentionally shadows the parameter.
    const auto quant = Load(di, quantized + size + k);
    const auto adj_quant = AdjustQuantBias(di, 1, quant, biases);
    const auto dequantm = Load(df, dequant_matrix + k);
    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
  }
}
    369 
// Computes the quantized AC coefficients and the DC image for one group:
// forward-transforms each varblock of `opsin`, roundtrip-quantizes Y,
// removes the Y->X / Y->B color correlation from the chroma planes,
// quantizes X and B, and splits the quantized coefficients across the
// progressive passes. `rect` translates the group rect into `opsin`;
// results go to enc_state->coeffs, the raw quant field, and `dc`.
Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
                           const Image3F& opsin, const Rect& rect,
                           Image3F* dc) {
  JxlMemoryManager* memory_manager = opsin.memory_manager();
  const Rect block_group_rect =
      enc_state->shared.frame_dim.BlockGroupRect(group_idx);
  // The same group rect expressed in color-correlation tile units.
  const Rect cmap_rect(
      block_group_rect.x0() / kColorTileDimInBlocks,
      block_group_rect.y0() / kColorTileDimInBlocks,
      DivCeil(block_group_rect.xsize(), kColorTileDimInBlocks),
      DivCeil(block_group_rect.ysize(), kColorTileDimInBlocks));
  const Rect group_rect =
      enc_state->shared.frame_dim.GroupRect(group_idx).Translate(rect.x0(),
                                                                 rect.y0());

  const size_t xsize_blocks = block_group_rect.xsize();
  const size_t ysize_blocks = block_group_rect.ysize();

  const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
  const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());

  ImageI& full_quant_field = enc_state->shared.raw_quant_field;
  const CompressParams& cparams = enc_state->cparams;

  const size_t dct_scratch_size =
      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;

  // TODO(veluca): consider strategies to reduce this memory.
  // `mem`: int32 buffer for the quantized coefficients of one varblock
  // (3 channels); `fmem`: float coefficients (3 channels) plus DCT scratch.
  size_t mem_bytes = 3 * AcStrategy::kMaxCoeffArea * sizeof(int32_t);
  JXL_ASSIGN_OR_RETURN(auto mem,
                       AlignedMemory::Create(memory_manager, mem_bytes));
  size_t fmem_bytes =
      (5 * AcStrategy::kMaxCoeffArea + dct_scratch_size) * sizeof(float);
  JXL_ASSIGN_OR_RETURN(auto fmem,
                       AlignedMemory::Create(memory_manager, fmem_bytes));
  // Scratch starts right after the three per-channel coefficient planes.
  float* JXL_RESTRICT scratch_space =
      fmem.address<float>() + 3 * AcStrategy::kMaxCoeffArea;
  {
    // Only use error diffusion in Squirrel mode or slower.
    const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
    constexpr HWY_CAPPED(float, kDCTBlockSize) d;

    // Output cursors into the per-pass coefficient buffers, advanced as
    // blocks are emitted.
    int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
    size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
    JXL_ENSURE(num_passes > 0);
    for (size_t i = 0; i < num_passes; i++) {
      // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
      JXL_ENSURE(enc_state->coeffs[i]->Type() == ACType::k32);
      for (size_t c = 0; c < 3; c++) {
        coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
      }
    }

    HWY_ALIGN float* coeffs_in = fmem.address<float>();
    HWY_ALIGN int32_t* quantized = mem.address<int32_t>();

    for (size_t by = 0; by < ysize_blocks; ++by) {
      int32_t* JXL_RESTRICT row_quant_ac =
          block_group_rect.Row(&full_quant_field, by);
      // Color-correlation tile row for this block row.
      size_t ty = by / kColorTileDimInBlocks;
      const int8_t* JXL_RESTRICT row_cmap[3] = {
          cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
          nullptr,
          cmap_rect.ConstRow(enc_state->shared.cmap.ytob_map, ty),
      };
      const float* JXL_RESTRICT opsin_rows[3] = {
          group_rect.ConstPlaneRow(opsin, 0, by * kBlockDim),
          group_rect.ConstPlaneRow(opsin, 1, by * kBlockDim),
          group_rect.ConstPlaneRow(opsin, 2, by * kBlockDim),
      };
      float* JXL_RESTRICT dc_rows[3] = {
          block_group_rect.PlaneRow(dc, 0, by),
          block_group_rect.PlaneRow(dc, 1, by),
          block_group_rect.PlaneRow(dc, 2, by),
      };
      AcStrategyRow ac_strategy_row =
          enc_state->shared.ac_strategy.ConstRow(block_group_rect, by);
      // Iterate color tiles; the Y->X/Y->B factors are constant per tile.
      for (size_t tx = 0; tx < DivCeil(xsize_blocks, kColorTileDimInBlocks);
           tx++) {
        const auto x_factor =
            Set(d, enc_state->shared.cmap.base().YtoXRatio(row_cmap[0][tx]));
        const auto b_factor =
            Set(d, enc_state->shared.cmap.base().YtoBRatio(row_cmap[2][tx]));
        for (size_t bx = tx * kColorTileDimInBlocks;
             bx < xsize_blocks && bx < (tx + 1) * kColorTileDimInBlocks; ++bx) {
          const AcStrategy acs = ac_strategy_row[bx];
          // Varblocks are processed once, at their top-left 8x8 block.
          if (!acs.IsFirstBlock()) continue;

          size_t xblocks = acs.covered_blocks_x();
          size_t yblocks = acs.covered_blocks_y();

          CoefficientLayout(&yblocks, &xblocks);

          // Number of coefficients per channel for this varblock.
          size_t size = kDCTBlockSize * xblocks * yblocks;

          // DCT Y channel, roundtrip-quantize it and set DC.
          int32_t quant_ac = row_quant_ac[bx];
          for (size_t c : {0, 1, 2}) {
            TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
                                opsin_stride, coeffs_in + c * size,
                                scratch_space);
          }
          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
                                  dc_rows[1] + bx, dc_stride);

          // Leaves reconstructed Y in coeffs_in + size and integer Y
          // coefficients in quantized + size; may raise quant_ac.
          QuantizeRoundtripYBlockAC(
              enc_state, size, enc_state->shared.quantizer, error_diffusion,
              acs.Strategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac,
              coeffs_in, quantized);

          // Unapply color correlation
          for (size_t k = 0; k < size; k += Lanes(d)) {
            const auto in_x = Load(d, coeffs_in + k);
            const auto in_y = Load(d, coeffs_in + size + k);
            const auto in_b = Load(d, coeffs_in + 2 * size + k);
            const auto out_x = NegMulAdd(x_factor, in_y, in_x);
            const auto out_b = NegMulAdd(b_factor, in_y, in_b);
            Store(out_x, d, coeffs_in + k);
            Store(out_b, d, coeffs_in + 2 * size + k);
          }

          // Quantize X and B channels and set DC.
          for (size_t c : {0, 2}) {
            float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
            QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
                            c == 0 ? enc_state->x_qm_multiplier
                                   : enc_state->b_qm_multiplier,
                            acs.Strategy(), xblocks, yblocks, &thres[0],
                            coeffs_in + c * size, &quant_ac,
                            quantized + c * size);
            DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
                                    dc_rows[c] + bx, dc_stride);
          }
          // Write back the (possibly adjusted) quant index and distribute
          // this varblock's coefficients across the progressive passes.
          row_quant_ac[bx] = quant_ac;
          for (size_t c = 0; c < 3; c++) {
            enc_state->progressive_splitter.SplitACCoefficients(
                quantized + c * size, acs, bx, by, coeffs[c]);
            for (size_t p = 0; p < num_passes; p++) {
              coeffs[c][p] += size;
            }
          }
        }
      }
    }
  }
  return true;
}
    517 
    518 // NOLINTNEXTLINE(google-readability-namespace-comments)
    519 }  // namespace HWY_NAMESPACE
    520 }  // namespace jxl
    521 HWY_AFTER_NAMESPACE();
    522 
    523 #if HWY_ONCE
    524 namespace jxl {
    525 HWY_EXPORT(ComputeCoefficients);
// Public entry point: forwards to the implementation compiled for the best
// SIMD target available at runtime (Highway dynamic dispatch).
Status ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
                           const Image3F& opsin, const Rect& rect,
                           Image3F* dc) {
  return HWY_DYNAMIC_DISPATCH(ComputeCoefficients)(group_idx, enc_state, opsin,
                                                   rect, dc);
}
    532 
    533 Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
    534                                        size_t histogram_idx,
    535                                        const PassesEncoderState& enc_state,
    536                                        BitWriter* writer, AuxOut* aux_out) {
    537  // Select which histogram to use among those of the current pass.
    538  const size_t num_histograms = enc_state.shared.num_histograms;
    539  // num_histograms is 0 only for lossless.
    540  JXL_ENSURE(num_histograms == 0 || histogram_idx < num_histograms);
    541  size_t histo_selector_bits = CeilLog2Nonzero(num_histograms);
    542 
    543  if (histo_selector_bits != 0) {
    544    JXL_RETURN_IF_ERROR(
    545        writer->WithMaxBits(histo_selector_bits, LayerType::Ac, aux_out, [&] {
    546          writer->Write(histo_selector_bits, histogram_idx);
    547          return true;
    548        }));
    549  }
    550  size_t context_offset =
    551      histogram_idx * enc_state.shared.block_ctx_map.NumACContexts();
    552  JXL_RETURN_IF_ERROR(WriteTokens(
    553      enc_state.passes[pass_idx].ac_tokens[group_idx],
    554      enc_state.passes[pass_idx].codes, enc_state.passes[pass_idx].context_map,
    555      context_offset, writer, LayerType::AcTokens, aux_out));
    556 
    557  return true;
    558 }
    559 
    560 }  // namespace jxl
    561 #endif  // HWY_ONCE