tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

quant_enc.c (50400B)


      1 // Copyright 2011 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 //   Quantization
     11 //
     12 // Author: Skal (pascal.massimino@gmail.com)
     13 
     14 #include <assert.h>
     15 #include <math.h>
     16 #include <stdlib.h>  // for abs()
     17 #include <string.h>
     18 
     19 #include "src/dec/common_dec.h"
     20 #include "src/dsp/dsp.h"
     21 #include "src/dsp/quant.h"
     22 #include "src/enc/cost_enc.h"
     23 #include "src/enc/vp8i_enc.h"
     24 #include "src/webp/types.h"
     25 
     26 #define DO_TRELLIS_I4  1
     27 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
     28 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
     29 #define USE_TDISTO 1
     30 
     31 #define MID_ALPHA 64      // neutral value for susceptibility
     32 #define MIN_ALPHA 30      // lowest usable value for susceptibility
     33 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
     34 
     35 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
     36                          // power-law modulation. Must be strictly less than 1.
     37 
     38 // number of non-zero coeffs below which we consider the block very flat
     39 // (and apply a penalty to complex predictions)
     40 #define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
     41 #define FLATNESS_LIMIT_I4  3       // I4 mode
     42 #define FLATNESS_LIMIT_UV  2       // UV mode
     43 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
     44 
     45 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
     46 
     47 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
     48 
     49 // #define DEBUG_BLOCK
     50 
     51 //------------------------------------------------------------------------------
     52 
     53 #if defined(DEBUG_BLOCK)
     54 
     55 #include <stdio.h>
     56 #include <stdlib.h>
     57 
// Debug-only helper (compiled under DEBUG_BLOCK): dumps the current
// macroblock's source samples, reconstructed samples and their absolute
// differences, followed by the quantized coefficient levels and the
// rate-distortion figures stored in 'rd'.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb->type == 1);   // type 1 <=> Intra16x16 coding
  const uint8_t* const y_in = it->yuv_in + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma: input rows, reconstructed rows, per-pixel |delta|.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // 8x8 chroma: U occupies columns [0,8), V columns [8,16) of the cache rows.
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Summary line: distortion D, spectral distortion SD, rate R, header bits H,
  // non-zero-coefficient bitmap nz, and the combined RD score.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  // For i16, the DC coefficient is stored separately in y_dc_levels,
  // so the AC dump starts at index 1.
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
    125 
    126 #endif   // DEBUG_BLOCK
    127 
    128 //------------------------------------------------------------------------------
    129 
    130 static WEBP_INLINE int clip(int v, int m, int M) {
    131  return v < m ? m : v > M ? M : v;
    132 }
    133 
// Zigzag scan order: maps scan position -> raster index within a 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer step, indexed by the clipped quantizer index [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer step, indexed by the clipped quantizer index [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// Alternate AC table; SetupMatrices() uses it for the y2 (luma-DC transform)
// AC coefficient.
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Quantization rounding biases, fed to BIAS() by ExpandMatrix().
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening amount (raster order), growing with frequency.
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
    208 
    209 //------------------------------------------------------------------------------
    210 // Initialize quantization parameters in VP8Matrix
    211 
    212 // Returns the average quantizer
// Expands the two base quantizer steps q[0] (DC) and q[1] (AC) of 'm' into
// full per-coefficient parameters: fixed-point reciprocal iq[], rounding
// bias[], zero-threshold zthresh[] and sharpen[]. 'type' selects the bias
// row in kBiasMatrices (0 = y1/luma-ac, 1 = y2/luma-dc, 2 = chroma; see
// the callers in SetupMatrices). Returns the rounded average quantizer.
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    // iq[] is the reciprocal in QFIX-bit fixed point, used by QUANTDIV.
    m->iq[i] = (1 << QFIX) / m->q[i];
    m->bias[i] = BIAS(bias);
    // zthresh is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh[i] = ((1 << QFIX) - 1 - m->bias[i]) / m->iq[i];
  }
  // Coefficients 2..15 all reuse the AC (index 1) parameters.
  for (i = 2; i < 16; ++i) {
    m->q[i] = m->q[1];
    m->iq[i] = m->iq[1];
    m->bias[i] = m->bias[1];
    m->zthresh[i] = m->zthresh[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen[i] = (kFreqSharpening[i] * m->q[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen[i] = 0;
    }
    sum += m->q[i];
  }
  return (sum + 8) >> 4;   // rounded average of the 16 quantizer steps
}
    241 
    242 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
    243 
    244 static void SetupMatrices(VP8Encoder* enc) {
    245  int i;
    246  const int tlambda_scale =
    247    (enc->method >= 4) ? enc->config->sns_strength
    248                        : 0;
    249  const int num_segments = enc->segment_hdr.num_segments;
    250  for (i = 0; i < num_segments; ++i) {
    251    VP8SegmentInfo* const m = &enc->dqm[i];
    252    const int q = m->quant;
    253    int q_i4, q_i16, q_uv;
    254    m->y1.q[0] = kDcTable[clip(q + enc->dq_y1_dc, 0, 127)];
    255    m->y1.q[1] = kAcTable[clip(q,                  0, 127)];
    256 
    257    m->y2.q[0] = kDcTable[ clip(q + enc->dq_y2_dc, 0, 127)] * 2;
    258    m->y2.q[1] = kAcTable2[clip(q + enc->dq_y2_ac, 0, 127)];
    259 
    260    m->uv.q[0] = kDcTable[clip(q + enc->dq_uv_dc, 0, 117)];
    261    m->uv.q[1] = kAcTable[clip(q + enc->dq_uv_ac, 0, 127)];
    262 
    263    q_i4  = ExpandMatrix(&m->y1, 0);
    264    q_i16 = ExpandMatrix(&m->y2, 1);
    265    q_uv  = ExpandMatrix(&m->uv, 2);
    266 
    267    m->lambda_i4          = (3 * q_i4 * q_i4) >> 7;
    268    m->lambda_i16         = (3 * q_i16 * q_i16);
    269    m->lambda_uv          = (3 * q_uv * q_uv) >> 6;
    270    m->lambda_mode        = (1 * q_i4 * q_i4) >> 7;
    271    m->lambda_trellis_i4  = (7 * q_i4 * q_i4) >> 3;
    272    m->lambda_trellis_i16 = (q_i16 * q_i16) >> 2;
    273    m->lambda_trellis_uv  = (q_uv * q_uv) << 1;
    274    m->tlambda            = (tlambda_scale * q_i4) >> 5;
    275 
    276    // none of these constants should be < 1
    277    CheckLambdaValue(&m->lambda_i4);
    278    CheckLambdaValue(&m->lambda_i16);
    279    CheckLambdaValue(&m->lambda_uv);
    280    CheckLambdaValue(&m->lambda_mode);
    281    CheckLambdaValue(&m->lambda_trellis_i4);
    282    CheckLambdaValue(&m->lambda_trellis_i16);
    283    CheckLambdaValue(&m->lambda_trellis_uv);
    284    CheckLambdaValue(&m->tlambda);
    285 
    286    m->min_disto = 20 * m->y1.q[0];   // quantization-aware min disto
    287    m->max_edge  = 0;
    288 
    289    m->i4_penalty = 1000 * q_i4 * q_i4;
    290  }
    291 }
    292 
    293 //------------------------------------------------------------------------------
    294 // Initialize filtering parameters
    295 
    296 // Very small filter-strength values have close to no visual effect. So we can
    297 // save a little decoding-CPU by turning filtering off for these.
    298 #define FSTRENGTH_CUTOFF 2
    299 
// Derives each segment's loop-filter strength from the configured
// filter_strength, the segment's AC quantizer step and its complexity
// ('beta'), then seeds the filter header from segment 0 and the config.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr.sharpness, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta);
    // Drop near-invisible strengths entirely (saves decoding CPU) and cap
    // at the syntax maximum of 63.
    m->fstrength = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr.level = enc->dqm[0].fstrength;
  enc->filter_hdr.simple = (enc->config->filter_type == 0);
  enc->filter_hdr.sharpness = enc->config->filter_sharpness;
}
    319 
    320 //------------------------------------------------------------------------------
    321 
    322 // Note: if you change the values below, remember that the max range
    323 // allowed by the syntax for DQ_UV is [-16,16].
    324 #define MAX_DQ_UV (6)
    325 #define MIN_DQ_UV (-4)
    326 
    327 // We want to emulate jpeg-like behaviour where the expected "good" quality
    328 // is around q=75. Internally, our "good" middle is around c=50. So we
    329 // map accordingly using linear piece-wise function
    330 static double QualityToCompression(double c) {
    331  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
    332  // The file size roughly scales as pow(quantizer, 3.). Actually, the
    333  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
    334  // in the mid-quant range. So we scale the compressibility inversely to
    335  // this power-law: quant ~= compression ^ 1/3. This law holds well for
    336  // low quant. Finer modeling for high-quant would make use of kAcTable[]
    337  // more explicitly.
    338  const double v = pow(linear_c, 1 / 3.);
    339  return v;
    340 }
    341 
    342 static double QualityToJPEGCompression(double c, double alpha) {
    343  // We map the complexity 'alpha' and quality setting 'c' to a compression
    344  // exponent empirically matched to the compression curve of libjpeg6b.
    345  // On average, the WebP output size will be roughly similar to that of a
    346  // JPEG file compressed with same quality factor.
    347  const double amin = 0.30;
    348  const double amax = 0.85;
    349  const double exp_min = 0.4;
    350  const double exp_max = 0.9;
    351  const double slope = (exp_min - exp_max) / (amax - amin);
    352  // Linearly interpolate 'expn' from exp_min to exp_max
    353  // in the [amin, amax] range.
    354  const double expn = (alpha > amax) ? exp_min
    355                    : (alpha < amin) ? exp_max
    356                    : exp_max + slope * (alpha - amin);
    357  const double v = pow(c, expn);
    358  return v;
    359 }
    360 
    361 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
    362                                 const VP8SegmentInfo* const S2) {
    363  return (S1->quant == S2->quant) && (S1->fstrength == S2->fstrength);
    364 }
    365 
// Merges segments that ended up with equivalent parameters (same quantizer
// and filter strength), remapping the macroblocks' segment ids and
// shrinking 'segment_hdr.num_segments' accordingly.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr.num_segments < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr.num_segments
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // On loop exit, 's2' is either the index of the matching segment or
    // the next free slot — both are the correct remap target.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm[num_final_segments] = enc->dqm[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w * enc->mb_h;
    while (i-- > 0) enc->mb_info[i].segment = map[enc->mb_info[i].segment];
    enc->segment_hdr.num_segments = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm[i] = enc->dqm[num_final_segments - 1];
    }
  }
}
    405 
// Sets the per-segment quantizers from the target 'quality' (0..100),
// modulated by each segment's susceptibility ('alpha'); then derives the
// U/V delta-quants, the filter strengths and the final matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr.num_segments;
  const double amp = SNS_TO_DQ * enc->config->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  // Optionally match libjpeg's size/quality curve instead of the native one.
  const double c_base = enc->config->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm[i].alpha;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm[i].quant = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant = enc->dqm[0].quant;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm[i].quant = enc->base_quant;
  }

  // uv_alpha is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                         / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc = 0;
  enc->dq_y2_ac = 0;
  enc->dq_uv_dc = dq_uv_dc;
  enc->dq_uv_ac = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
    460 
    461 //------------------------------------------------------------------------------
    462 // Form the predictions in cache
    463 
// Offsets of each prediction mode's samples (the I16*/C8*/I4* constants are
// declared in the encoder headers — presumably offsets into the 'yuv_p'
// prediction cache filled by VP8MakeLuma16Preds/VP8MakeChroma8Preds below).
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
    472 
    473 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
    474  const uint8_t* const left = it->x ? it->y_left : NULL;
    475  const uint8_t* const top = it->y ? it->y_top : NULL;
    476  VP8EncPredLuma16(it->yuv_p, left, top);
    477 }
    478 
    479 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
    480  const uint8_t* const left = it->x ? it->u_left : NULL;
    481  const uint8_t* const top = it->y ? it->uv_top : NULL;
    482  VP8EncPredChroma8(it->yuv_p, left, top);
    483 }
    484 
// Form all the ten Intra4x4 predictions in the 'yuv_p' cache
// for the 4x4 block it->i4
static void MakeIntra4Preds(const VP8EncIterator* const it) {
  // 'i4_top' carries the context samples for the current sub-block
  // (maintained by the iterator as it scans the 16 sub-blocks).
  VP8EncPredLuma4(it->yuv_p, it->i4_top);
}
    490 
    491 //------------------------------------------------------------------------------
    492 // Quantize
    493 
    494 // Layout:
    495 // +----+----+
    496 // |YYYY|UUVV| 0
    497 // |YYYY|UUVV| 4
    498 // |YYYY|....| 8
    499 // |YYYY|....| 12
    500 // +----+----+
    501 
// Offsets of the 16 luma 4x4 sub-blocks within the work cache, in raster
// order (BPS is the cache stride).
const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Offsets of the 4 U then 4 V sub-blocks: V sits 8 pixels to the right of U
// in the cache (see the layout diagram above).
static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
    513 
    514 //------------------------------------------------------------------------------
    515 // Distortion measurement
    516 
// Per-coefficient distortion weights for luma (raster order), decreasing
// towards the higher-frequency positions.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient weights for the trellis distortion term; with USE_TDISTO
// disabled every coefficient gets the same flat weight (16).
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
    531 
    532 // Init/Copy the common fields in score.
    533 static void InitScore(VP8ModeScore* const rd) {
    534  rd->D  = 0;
    535  rd->SD = 0;
    536  rd->R  = 0;
    537  rd->H  = 0;
    538  rd->nz = 0;
    539  rd->score = MAX_COST;
    540 }
    541 
    542 static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
    543                      const VP8ModeScore* WEBP_RESTRICT const src) {
    544  dst->D  = src->D;
    545  dst->SD = src->SD;
    546  dst->R  = src->R;
    547  dst->H  = src->H;
    548  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
    549  dst->score = src->score;
    550 }
    551 
    552 static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
    553                     const VP8ModeScore* WEBP_RESTRICT const src) {
    554  dst->D  += src->D;
    555  dst->SD += src->SD;
    556  dst->R  += src->R;
    557  dst->H  += src->H;
    558  dst->nz |= src->nz;     // here, new nz bits are accumulated.
    559  dst->score += src->score;
    560 }
    561 
    562 //------------------------------------------------------------------------------
    563 // Performs trellis-optimized quantization.
    564 
// Trellis node
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
// Accessors shifted by MIN_DELTA so that indices in [-MIN_DELTA, MAX_DELTA]
// are valid for the node array and the double-buffered score states.
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

// Combined RD score: rate (R + H) weighted by 'lambda', distortion (D + SD)
// by the fixed RD_DISTO_MULT multiplier.
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

// Same weighting, for the scalar rate/distortion pairs used by the trellis.
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
    598 
// Trellis-optimized quantization of one 4x4 coefficient block.
// 'in' holds the transform coefficients and is overwritten with their
// dequantized reconstruction; 'out' receives the chosen levels in zigzag
// order. 'ctx0' is the entropy context, 'coeff_type' one of the TYPE_*
// constants above, 'lambda' the rate weight. Returns non-zero iff any
// non-zero level was kept.
static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba.coeffs[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba.remapped_costs[coeff_type];
  // For I16 AC blocks, coefficient 0 (DC) is coded separately: start at 1.
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q[1] * mtx->q[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q[j];
    const uint32_t iQ = mtx->iq[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    // 'thresh_level' (quantized with a strong bias) caps the candidate
    // levels tested below.
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // The base_score is added to all scores so it is only added for the final
      // value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q[j];   // write back the dequantized coefficient
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE
    772 
    773 //------------------------------------------------------------------------------
    774 // Performs: difference, transform, quantize, back-transform, add
    775 // all at once. Output is the reconstructed block in *yuv_out, and the
    776 // quantized levels in *levels.
    777 
// Reconstructs one intra16 luma macroblock for predictor 'mode': forward
// transform of (src - ref), quantization of the 16 AC sub-blocks plus the
// separate DC (WHT) block, then inverse transform added back onto the
// predictor into 'yuv_out'. Quantized levels are stored in rd->y_dc_levels
// and rd->y_ac_levels. Returns a bitfield of non-zero blocks: bit n for
// luma sub-block n, bit 24 for the DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform the 16 4x4 residual blocks, two at a time.
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // Gather the 16 DC coefficients into a Walsh-Hadamard block and quantize
  // it with the dedicated 'y2' matrix; its non-zero flag lands on bit 24.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis) {
    // Trellis (RD-optimizing) quantization needs the left/top non-zero
    // context of each sub-block, updated as we scan in raster order.
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz[x] + it->left_nz[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1,
            dqm->lambda_trellis_i16);
        it->top_nz[x] = it->left_nz[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is carried by the WHT block above
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back: restore the DCs first, then inverse-transform and add
  // the residual back onto the predictor.
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
    829 
// Reconstructs one 4x4 luma block for intra4 predictor 'mode': forward
// transform of (src - ref), quantization (trellis-optimized when enabled),
// then inverse transform added back onto the predictor into 'yuv_out'.
// Quantized coefficients go to 'levels'. Returns non-zero if any quantized
// coefficient survives.
static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis) {
    // Trellis context = sum of the left/top non-zero flags for this
    // sub-block position (it->i4 indexes the 4x4 grid row-major).
    const int x = it->i4 & 3, y = it->i4 >> 2;
    const int ctx = it->top_nz[x] + it->left_nz[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1,
                              dqm->lambda_trellis_i4);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
    853 
    854 //------------------------------------------------------------------------------
    855 // DC-error diffusion
    856 
    857 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
    858 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
    859 #define C1 7    // fraction of error sent to the 4x4 block below
    860 #define C2 8    // fraction of error sent to the 4x4 block on the right
    861 #define DSHIFT 4
    862 #define DSCALE 1   // storage descaling, needed to make the error fit int8_t
    863 
    864 // Quantize as usual, but also compute and return the quantization error.
    865 // Error is already divided by DSHIFT.
    866 static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
    867                          const VP8Matrix* WEBP_RESTRICT const mtx) {
    868  int V = *v;
    869  const int sign = (V < 0);
    870  if (sign) V = -V;
    871  if (V > (int)mtx->zthresh[0]) {
    872    const int qV = QUANTDIV(V, mtx->iq[0], mtx->bias[0]) * mtx->q[0];
    873    const int err = (V - qV);
    874    *v = sign ? -qV : qV;
    875    return (sign ? -err : err) >> DSCALE;
    876  }
    877  *v = 0;
    878  return (sign ? -V : V) >> DSCALE;
    879 }
    880 
// Applies DC error-diffusion to the chroma DC coefficients before
// quantization: each of the four DCs in a channel's 2x2 block arrangement
// absorbs a C1/C2-weighted fraction of its top/left neighbors' quantization
// errors, is quantized in place, and propagates its own error onward. The
// final errors {err1, err2, err3} are saved in rd->derr[] so the winning
// mode can later seed the next macroblock via StoreDiffusionErrors().
// NOTE: the statement order below implements the dependency graph in the
// diagram and must not be reordered.
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {   // ch = 0: U plane, ch = 1: V plane
    const int8_t* const top = it->top_derr[it->x][ch];
    const int8_t* const left = it->left_derr[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
    914 
    915 static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
    916                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
    917  int ch;
    918  for (ch = 0; ch <= 1; ++ch) {
    919    int8_t* const top = it->top_derr[it->x][ch];
    920    int8_t* const left = it->left_derr[ch];
    921    left[0] = rd->derr[ch][0];            // restore err1
    922    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    923    top[0]  = rd->derr[ch][1];            //     ... err2
    924    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
    925  }
    926 }
    927 
    928 #undef C1
    929 #undef C2
    930 #undef DSHIFT
    931 #undef DSCALE
    932 
    933 //------------------------------------------------------------------------------
    934 
// Reconstructs the 16x8 chroma pair (U then V) for predictor 'mode':
// forward transform of (src - ref), optional DC error-diffusion, then
// quantization and inverse transform into 'yuv_out'. Quantized levels go to
// rd->uv_levels. Returns the 8 chroma non-zero bits shifted left by 16,
// i.e. the chroma section of the macroblock's nz bitfield.
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // DC error-diffusion is only active when the encoder allocated the
  // top_derr buffer (quality-dependent).
  if (it->top_derr != NULL) CorrectDCValues(it, &dqm->uv, tmp, rd);

  // NOTE: this branch is compiled out by default (DO_TRELLIS_UV == 0).
  if (DO_TRELLIS_UV && it->do_trellis) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {   // ch = 0: U, ch = 2: V
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz[4 + ch + x] + it->left_nz[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv,
              dqm->lambda_trellis_uv);
          it->top_nz[4 + ch + x] = it->left_nz[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}
    976 
    977 //------------------------------------------------------------------------------
    978 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
    979 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
    980 
    981 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
    982  // We look at the first three AC coefficients to determine what is the average
    983  // delta between each sub-4x4 block.
    984  const int v0 = abs(DCs[1]);
    985  const int v1 = abs(DCs[2]);
    986  const int v2 = abs(DCs[4]);
    987  int max_v = (v1 > v0) ? v1 : v0;
    988  max_v = (v2 > max_v) ? v2 : max_v;
    989  if (max_v > dqm->max_edge) dqm->max_edge = max_v;
    990 }
    991 
    992 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
    993  VP8ModeScore* const tmp = *a;
    994  *a = *b;
    995  *b = tmp;
    996 }
    997 
    998 static void SwapPtr(uint8_t** a, uint8_t** b) {
    999  uint8_t* const tmp = *a;
   1000  *a = *b;
   1001  *b = tmp;
   1002 }
   1003 
// Flips the iterator's main and scratch reconstruction buffers, so the
// candidate held in yuv_out2 becomes the current output.
static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out, &it->yuv_out2);
}
   1007 
// Exhaustive intra16 mode decision: reconstructs every 16x16 predictor into
// the scratch buffer (yuv_out2), evaluates its RD score (distortion D,
// texture distortion SD, mode-header cost H, residual cost R), and keeps
// the best in *rd. On exit, the best reconstruction sits in it->yuv_out and
// the chosen mode is committed via VP8SetIntra16Mode().
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i16;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // Keep this candidate: swap score structs and output buffers so the
      // winner always ends up in it->yuv_out.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
   1065 
   1066 //------------------------------------------------------------------------------
   1067 
   1068 // return the cost array corresponding to the surrounding prediction modes.
   1069 static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
   1070                                     const uint8_t modes[16]) {
   1071  const int preds_w = it->enc->preds_w;
   1072  const int x = (it->i4 & 3), y = it->i4 >> 2;
   1073  const int left = (x == 0) ? it->preds[y * preds_w - 1] : modes[it->i4 - 1];
   1074  const int top = (y == 0) ? it->preds[-preds_w + x] : modes[it->i4 - 4];
   1075  return VP8FixedCostsI4[top][left];
   1076 }
   1077 
// Tries intra4 coding for the whole macroblock: for each of the 16 4x4
// sub-blocks, tests every intra4 mode, keeps the per-block RD winner in
// the yuv_out2 scratch area and accumulates the total in rd_best.
// Returns 1 (and commits state, swapping yuv_out/yuv_out2) if intra4 beats
// the intra16 score already stored in *rd; returns 0 as soon as the running
// score exceeds intra16's, or when the header-bit budget is blown.
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i4;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src0 = it->yuv_in + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2 + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  // max_i4_header_bits == 0 disables intra4 evaluation altogether.
  if (enc->max_i4_header_bits == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4];
    uint8_t* tmp_dst = it->yuv_p + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check: skip the expensive residual-cost evaluation when
      // the partial score already can't win.
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);   // keep winner, recycle loser buffer
        memcpy(rd_best.y_ac_levels[it->i4], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4]));
      }
    }
    SetRDScore(dqm->lambda_mode, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    // Running total already worse than intra16: give up on intra4.
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4]);
    }
    rd->modes_i4[it->i4] = best_mode;
    it->top_nz[it->i4 & 3] = it->left_nz[it->i4 >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);   // reconstruction was built in yuv_out2; make it current
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
   1172 
   1173 //------------------------------------------------------------------------------
   1174 
// Exhaustive chroma mode decision: reconstructs each 16x8 UV predictor into
// a scratch buffer, scores it (SSE distortion + header + residual costs,
// with a flatness penalty for complex modes on flat blocks), and keeps the
// best. Commits the winning mode, its levels and (when error-diffusion is
// active) its diffusion errors, and accumulates the score into *rd.
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_uv;
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2 + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);   // 'dst' tracks the buffer holding the winner
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
   1224 
   1225 //------------------------------------------------------------------------------
   1226 // Final reconstruction and quantization.
   1227 
// Re-quantizes and reconstructs the macroblock with the already-decided
// intra modes (no mode search). Used to finish off with trellis
// quantization after the RD_OPT_TRELLIS mode decision. Stores the resulting
// non-zero bitfield in rd->nz.
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const int is_i16 = (it->mb->type == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  } else {
    // Walk the 16 sub-blocks, reusing the per-block mode stored in the
    // prediction-mode plane.
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds[(it->i4 & 3) + (it->i4 >> 2) * enc->preds_w];
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      uint8_t* const dst = it->yuv_out + Y_OFF_ENC + VP8Scan[it->i4];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                              src, dst, mode) << it->i4;
    } while (VP8IteratorRotateI4(it, it->yuv_out + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);
  rd->nz = nz;
}
   1252 
   1253 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
// Fast mode decision for the non-RD path: picks intra16/intra4 (and
// optionally UV) sub-modes by comparing prediction SSE plus a small
// mode-cost term, without evaluating the actual residual rate.
// 'try_both_modes' enables the i16-vs-i4 comparison (otherwise the type
// already set in it->mb is kept); 'refine_uv_mode' enables the UV search.
// Fills rd->nz and rd->score and reconstructs into it->yuv_out.
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb->type == 1);

  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty;   // constant rate penalty for i4 mode
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc->mb_header_limit
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      // Modes above the header-bit budget are only allowed for mode 0.
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x == 0 || it->y == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2 buffer
        uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC + VP8Scan[it->i4];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                                src, tmp_dst, best_i4_mode) << it->i4;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2 + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // i4 reconstruction was built in yuv_out2; make it current
    best_score = score_i4;
  } else {
    // Intra16 path: redo the full reconstruction with the winning mode.
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);

  rd->nz = nz;
  rd->score = best_score;
}
   1366 
   1367 //------------------------------------------------------------------------------
   1368 // Entry point
   1369 
// Entry point for per-macroblock mode decision + quantization +
// reconstruction. Depending on 'rd_opt', runs either the full RD search
// (PickBestIntra16/Intra4/UV, optionally followed by trellis re-quantization)
// or the fast distortion-only refinement. Returns 1 if the macroblock ends
// up fully skipped (no non-zero coefficients).
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc->method;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    // Trellis during the search only at the highest RD level; otherwise a
    // single trellis pass is applied afterwards (RD_OPT_TRELLIS).
    it->do_trellis = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}