// quant_enc.c (50400B)
1 // Copyright 2011 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // Quantization 11 // 12 // Author: Skal (pascal.massimino@gmail.com) 13 14 #include <assert.h> 15 #include <math.h> 16 #include <stdlib.h> // for abs() 17 #include <string.h> 18 19 #include "src/dec/common_dec.h" 20 #include "src/dsp/dsp.h" 21 #include "src/dsp/quant.h" 22 #include "src/enc/cost_enc.h" 23 #include "src/enc/vp8i_enc.h" 24 #include "src/webp/types.h" 25 26 #define DO_TRELLIS_I4 1 27 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate. 28 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth. 29 #define USE_TDISTO 1 30 31 #define MID_ALPHA 64 // neutral value for susceptibility 32 #define MIN_ALPHA 30 // lowest usable value for susceptibility 33 #define MAX_ALPHA 100 // higher meaningful value for susceptibility 34 35 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP 36 // power-law modulation. Must be strictly less than 1. 
37 38 // number of non-zero coeffs below which we consider the block very flat 39 // (and apply a penalty to complex predictions) 40 #define FLATNESS_LIMIT_I16 0 // I16 mode (special case) 41 #define FLATNESS_LIMIT_I4 3 // I4 mode 42 #define FLATNESS_LIMIT_UV 2 // UV mode 43 #define FLATNESS_PENALTY 140 // roughly ~1bit per block 44 45 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8) 46 47 #define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda) 48 49 // #define DEBUG_BLOCK 50 51 //------------------------------------------------------------------------------ 52 53 #if defined(DEBUG_BLOCK) 54 55 #include <stdio.h> 56 #include <stdlib.h> 57 58 static void PrintBlockInfo(const VP8EncIterator* const it, 59 const VP8ModeScore* const rd) { 60 int i, j; 61 const int is_i16 = (it->mb->type == 1); 62 const uint8_t* const y_in = it->yuv_in + Y_OFF_ENC; 63 const uint8_t* const y_out = it->yuv_out + Y_OFF_ENC; 64 const uint8_t* const uv_in = it->yuv_in + U_OFF_ENC; 65 const uint8_t* const uv_out = it->yuv_out + U_OFF_ENC; 66 printf("SOURCE / OUTPUT / ABS DELTA\n"); 67 for (j = 0; j < 16; ++j) { 68 for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]); 69 printf(" "); 70 for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]); 71 printf(" "); 72 for (i = 0; i < 16; ++i) { 73 printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS])); 74 } 75 printf("\n"); 76 } 77 printf("\n"); // newline before the U/V block 78 for (j = 0; j < 8; ++j) { 79 for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]); 80 printf(" "); 81 for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]); 82 printf(" "); 83 for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]); 84 printf(" "); 85 for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]); 86 printf(" "); 87 for (i = 0; i < 8; ++i) { 88 printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS])); 89 } 90 printf(" "); 91 for (i = 8; i < 16; ++i) { 92 printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * 
BPS])); 93 } 94 printf("\n"); 95 } 96 printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n", 97 (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz, 98 (int)rd->score); 99 if (is_i16) { 100 printf("Mode: %d\n", rd->mode_i16); 101 printf("y_dc_levels:"); 102 for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]); 103 printf("\n"); 104 } else { 105 printf("Modes[16]: "); 106 for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]); 107 printf("\n"); 108 } 109 printf("y_ac_levels:\n"); 110 for (j = 0; j < 16; ++j) { 111 for (i = is_i16 ? 1 : 0; i < 16; ++i) { 112 printf("%4d ", rd->y_ac_levels[j][i]); 113 } 114 printf("\n"); 115 } 116 printf("\n"); 117 printf("uv_levels (mode=%d):\n", rd->mode_uv); 118 for (j = 0; j < 8; ++j) { 119 for (i = 0; i < 16; ++i) { 120 printf("%4d ", rd->uv_levels[j][i]); 121 } 122 printf("\n"); 123 } 124 } 125 126 #endif // DEBUG_BLOCK 127 128 //------------------------------------------------------------------------------ 129 130 static WEBP_INLINE int clip(int v, int m, int M) { 131 return v < m ? m : v > M ? 
M : v; 132 } 133 134 static const uint8_t kZigzag[16] = { 135 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 136 }; 137 138 static const uint8_t kDcTable[128] = { 139 4, 5, 6, 7, 8, 9, 10, 10, 140 11, 12, 13, 14, 15, 16, 17, 17, 141 18, 19, 20, 20, 21, 21, 22, 22, 142 23, 23, 24, 25, 25, 26, 27, 28, 143 29, 30, 31, 32, 33, 34, 35, 36, 144 37, 37, 38, 39, 40, 41, 42, 43, 145 44, 45, 46, 46, 47, 48, 49, 50, 146 51, 52, 53, 54, 55, 56, 57, 58, 147 59, 60, 61, 62, 63, 64, 65, 66, 148 67, 68, 69, 70, 71, 72, 73, 74, 149 75, 76, 76, 77, 78, 79, 80, 81, 150 82, 83, 84, 85, 86, 87, 88, 89, 151 91, 93, 95, 96, 98, 100, 101, 102, 152 104, 106, 108, 110, 112, 114, 116, 118, 153 122, 124, 126, 128, 130, 132, 134, 136, 154 138, 140, 143, 145, 148, 151, 154, 157 155 }; 156 157 static const uint16_t kAcTable[128] = { 158 4, 5, 6, 7, 8, 9, 10, 11, 159 12, 13, 14, 15, 16, 17, 18, 19, 160 20, 21, 22, 23, 24, 25, 26, 27, 161 28, 29, 30, 31, 32, 33, 34, 35, 162 36, 37, 38, 39, 40, 41, 42, 43, 163 44, 45, 46, 47, 48, 49, 50, 51, 164 52, 53, 54, 55, 56, 57, 58, 60, 165 62, 64, 66, 68, 70, 72, 74, 76, 166 78, 80, 82, 84, 86, 88, 90, 92, 167 94, 96, 98, 100, 102, 104, 106, 108, 168 110, 112, 114, 116, 119, 122, 125, 128, 169 131, 134, 137, 140, 143, 146, 149, 152, 170 155, 158, 161, 164, 167, 170, 173, 177, 171 181, 185, 189, 193, 197, 201, 205, 209, 172 213, 217, 221, 225, 229, 234, 239, 245, 173 249, 254, 259, 264, 269, 274, 279, 284 174 }; 175 176 static const uint16_t kAcTable2[128] = { 177 8, 8, 9, 10, 12, 13, 15, 17, 178 18, 20, 21, 23, 24, 26, 27, 29, 179 31, 32, 34, 35, 37, 38, 40, 41, 180 43, 44, 46, 48, 49, 51, 52, 54, 181 55, 57, 58, 60, 62, 63, 65, 66, 182 68, 69, 71, 72, 74, 75, 77, 79, 183 80, 82, 83, 85, 86, 88, 89, 93, 184 96, 99, 102, 105, 108, 111, 114, 117, 185 120, 124, 127, 130, 133, 136, 139, 142, 186 145, 148, 151, 155, 158, 161, 164, 167, 187 170, 173, 176, 179, 184, 189, 193, 198, 188 203, 207, 212, 217, 221, 226, 230, 235, 189 240, 244, 249, 254, 258, 263, 
268, 274, 190 280, 286, 292, 299, 305, 311, 317, 323, 191 330, 336, 342, 348, 354, 362, 370, 379, 192 385, 393, 401, 409, 416, 424, 432, 440 193 }; 194 195 static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac] 196 { 96, 110 }, { 96, 108 }, { 110, 115 } 197 }; 198 199 // Sharpening by (slightly) raising the hi-frequency coeffs. 200 // Hack-ish but helpful for mid-bitrate range. Use with care. 201 #define SHARPEN_BITS 11 // number of descaling bits for sharpening bias 202 static const uint8_t kFreqSharpening[16] = { 203 0, 30, 60, 90, 204 30, 60, 90, 90, 205 60, 90, 90, 90, 206 90, 90, 90, 90 207 }; 208 209 //------------------------------------------------------------------------------ 210 // Initialize quantization parameters in VP8Matrix 211 212 // Returns the average quantizer 213 static int ExpandMatrix(VP8Matrix* const m, int type) { 214 int i, sum; 215 for (i = 0; i < 2; ++i) { 216 const int is_ac_coeff = (i > 0); 217 const int bias = kBiasMatrices[type][is_ac_coeff]; 218 m->iq[i] = (1 << QFIX) / m->q[i]; 219 m->bias[i] = BIAS(bias); 220 // zthresh is the exact value such that QUANTDIV(coeff, iQ, B) is: 221 // * zero if coeff <= zthresh 222 // * non-zero if coeff > zthresh 223 m->zthresh[i] = ((1 << QFIX) - 1 - m->bias[i]) / m->iq[i]; 224 } 225 for (i = 2; i < 16; ++i) { 226 m->q[i] = m->q[1]; 227 m->iq[i] = m->iq[1]; 228 m->bias[i] = m->bias[1]; 229 m->zthresh[i] = m->zthresh[1]; 230 } 231 for (sum = 0, i = 0; i < 16; ++i) { 232 if (type == 0) { // we only use sharpening for AC luma coeffs 233 m->sharpen[i] = (kFreqSharpening[i] * m->q[i]) >> SHARPEN_BITS; 234 } else { 235 m->sharpen[i] = 0; 236 } 237 sum += m->q[i]; 238 } 239 return (sum + 8) >> 4; 240 } 241 242 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; } 243 244 static void SetupMatrices(VP8Encoder* enc) { 245 int i; 246 const int tlambda_scale = 247 (enc->method >= 4) ? 
enc->config->sns_strength 248 : 0; 249 const int num_segments = enc->segment_hdr.num_segments; 250 for (i = 0; i < num_segments; ++i) { 251 VP8SegmentInfo* const m = &enc->dqm[i]; 252 const int q = m->quant; 253 int q_i4, q_i16, q_uv; 254 m->y1.q[0] = kDcTable[clip(q + enc->dq_y1_dc, 0, 127)]; 255 m->y1.q[1] = kAcTable[clip(q, 0, 127)]; 256 257 m->y2.q[0] = kDcTable[ clip(q + enc->dq_y2_dc, 0, 127)] * 2; 258 m->y2.q[1] = kAcTable2[clip(q + enc->dq_y2_ac, 0, 127)]; 259 260 m->uv.q[0] = kDcTable[clip(q + enc->dq_uv_dc, 0, 117)]; 261 m->uv.q[1] = kAcTable[clip(q + enc->dq_uv_ac, 0, 127)]; 262 263 q_i4 = ExpandMatrix(&m->y1, 0); 264 q_i16 = ExpandMatrix(&m->y2, 1); 265 q_uv = ExpandMatrix(&m->uv, 2); 266 267 m->lambda_i4 = (3 * q_i4 * q_i4) >> 7; 268 m->lambda_i16 = (3 * q_i16 * q_i16); 269 m->lambda_uv = (3 * q_uv * q_uv) >> 6; 270 m->lambda_mode = (1 * q_i4 * q_i4) >> 7; 271 m->lambda_trellis_i4 = (7 * q_i4 * q_i4) >> 3; 272 m->lambda_trellis_i16 = (q_i16 * q_i16) >> 2; 273 m->lambda_trellis_uv = (q_uv * q_uv) << 1; 274 m->tlambda = (tlambda_scale * q_i4) >> 5; 275 276 // none of these constants should be < 1 277 CheckLambdaValue(&m->lambda_i4); 278 CheckLambdaValue(&m->lambda_i16); 279 CheckLambdaValue(&m->lambda_uv); 280 CheckLambdaValue(&m->lambda_mode); 281 CheckLambdaValue(&m->lambda_trellis_i4); 282 CheckLambdaValue(&m->lambda_trellis_i16); 283 CheckLambdaValue(&m->lambda_trellis_uv); 284 CheckLambdaValue(&m->tlambda); 285 286 m->min_disto = 20 * m->y1.q[0]; // quantization-aware min disto 287 m->max_edge = 0; 288 289 m->i4_penalty = 1000 * q_i4 * q_i4; 290 } 291 } 292 293 //------------------------------------------------------------------------------ 294 // Initialize filtering parameters 295 296 // Very small filter-strength values have close to no visual effect. So we can 297 // save a little decoding-CPU by turning filtering off for these. 
298 #define FSTRENGTH_CUTOFF 2 299 300 static void SetupFilterStrength(VP8Encoder* const enc) { 301 int i; 302 // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering. 303 const int level0 = 5 * enc->config->filter_strength; 304 for (i = 0; i < NUM_MB_SEGMENTS; ++i) { 305 VP8SegmentInfo* const m = &enc->dqm[i]; 306 // We focus on the quantization of AC coeffs. 307 const int qstep = kAcTable[clip(m->quant, 0, 127)] >> 2; 308 const int base_strength = 309 VP8FilterStrengthFromDelta(enc->filter_hdr.sharpness, qstep); 310 // Segments with lower complexity ('beta') will be less filtered. 311 const int f = base_strength * level0 / (256 + m->beta); 312 m->fstrength = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f; 313 } 314 // We record the initial strength (mainly for the case of 1-segment only). 315 enc->filter_hdr.level = enc->dqm[0].fstrength; 316 enc->filter_hdr.simple = (enc->config->filter_type == 0); 317 enc->filter_hdr.sharpness = enc->config->filter_sharpness; 318 } 319 320 //------------------------------------------------------------------------------ 321 322 // Note: if you change the values below, remember that the max range 323 // allowed by the syntax for DQ_UV is [-16,16]. 324 #define MAX_DQ_UV (6) 325 #define MIN_DQ_UV (-4) 326 327 // We want to emulate jpeg-like behaviour where the expected "good" quality 328 // is around q=75. Internally, our "good" middle is around c=50. So we 329 // map accordingly using linear piece-wise function 330 static double QualityToCompression(double c) { 331 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.; 332 // The file size roughly scales as pow(quantizer, 3.). Actually, the 333 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested 334 // in the mid-quant range. So we scale the compressibility inversely to 335 // this power-law: quant ~= compression ^ 1/3. This law holds well for 336 // low quant. 
Finer modeling for high-quant would make use of kAcTable[] 337 // more explicitly. 338 const double v = pow(linear_c, 1 / 3.); 339 return v; 340 } 341 342 static double QualityToJPEGCompression(double c, double alpha) { 343 // We map the complexity 'alpha' and quality setting 'c' to a compression 344 // exponent empirically matched to the compression curve of libjpeg6b. 345 // On average, the WebP output size will be roughly similar to that of a 346 // JPEG file compressed with same quality factor. 347 const double amin = 0.30; 348 const double amax = 0.85; 349 const double exp_min = 0.4; 350 const double exp_max = 0.9; 351 const double slope = (exp_min - exp_max) / (amax - amin); 352 // Linearly interpolate 'expn' from exp_min to exp_max 353 // in the [amin, amax] range. 354 const double expn = (alpha > amax) ? exp_min 355 : (alpha < amin) ? exp_max 356 : exp_max + slope * (alpha - amin); 357 const double v = pow(c, expn); 358 return v; 359 } 360 361 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1, 362 const VP8SegmentInfo* const S2) { 363 return (S1->quant == S2->quant) && (S1->fstrength == S2->fstrength); 364 } 365 366 static void SimplifySegments(VP8Encoder* const enc) { 367 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 }; 368 // 'num_segments' is previously validated and <= NUM_MB_SEGMENTS, but an 369 // explicit check is needed to avoid a spurious warning about 'i' exceeding 370 // array bounds of 'dqm' with some compilers (noticed with gcc-4.9). 371 const int num_segments = (enc->segment_hdr.num_segments < NUM_MB_SEGMENTS) 372 ? 
enc->segment_hdr.num_segments 373 : NUM_MB_SEGMENTS; 374 int num_final_segments = 1; 375 int s1, s2; 376 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments 377 const VP8SegmentInfo* const S1 = &enc->dqm[s1]; 378 int found = 0; 379 // check if we already have similar segment 380 for (s2 = 0; s2 < num_final_segments; ++s2) { 381 const VP8SegmentInfo* const S2 = &enc->dqm[s2]; 382 if (SegmentsAreEquivalent(S1, S2)) { 383 found = 1; 384 break; 385 } 386 } 387 map[s1] = s2; 388 if (!found) { 389 if (num_final_segments != s1) { 390 enc->dqm[num_final_segments] = enc->dqm[s1]; 391 } 392 ++num_final_segments; 393 } 394 } 395 if (num_final_segments < num_segments) { // Remap 396 int i = enc->mb_w * enc->mb_h; 397 while (i-- > 0) enc->mb_info[i].segment = map[enc->mb_info[i].segment]; 398 enc->segment_hdr.num_segments = num_final_segments; 399 // Replicate the trailing segment infos (it's mostly cosmetics) 400 for (i = num_final_segments; i < num_segments; ++i) { 401 enc->dqm[i] = enc->dqm[num_final_segments - 1]; 402 } 403 } 404 } 405 406 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) { 407 int i; 408 int dq_uv_ac, dq_uv_dc; 409 const int num_segments = enc->segment_hdr.num_segments; 410 const double amp = SNS_TO_DQ * enc->config->sns_strength / 100. / 128.; 411 const double Q = quality / 100.; 412 const double c_base = enc->config->emulate_jpeg_size ? 413 QualityToJPEGCompression(Q, enc->alpha / 255.) : 414 QualityToCompression(Q); 415 for (i = 0; i < num_segments; ++i) { 416 // We modulate the base coefficient to accommodate for the quantization 417 // susceptibility and allow denser segments to be quantized more. 418 const double expn = 1. - amp * enc->dqm[i].alpha; 419 const double c = pow(c_base, expn); 420 const int q = (int)(127. * (1. 
- c)); 421 assert(expn > 0.); 422 enc->dqm[i].quant = clip(q, 0, 127); 423 } 424 425 // purely indicative in the bitstream (except for the 1-segment case) 426 enc->base_quant = enc->dqm[0].quant; 427 428 // fill-in values for the unused segments (required by the syntax) 429 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) { 430 enc->dqm[i].quant = enc->base_quant; 431 } 432 433 // uv_alpha is normally spread around ~60. The useful range is 434 // typically ~30 (quite bad) to ~100 (ok to decimate UV more). 435 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv. 436 dq_uv_ac = (enc->uv_alpha - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV) 437 / (MAX_ALPHA - MIN_ALPHA); 438 // we rescale by the user-defined strength of adaptation 439 dq_uv_ac = dq_uv_ac * enc->config->sns_strength / 100; 440 // and make it safe. 441 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV); 442 // We also boost the dc-uv-quant a little, based on sns-strength, since 443 // U/V channels are quite more reactive to high quants (flat DC-blocks 444 // tend to appear, and are unpleasant). 
445 dq_uv_dc = -4 * enc->config->sns_strength / 100; 446 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed 447 448 enc->dq_y1_dc = 0; // TODO(skal): dq-lum 449 enc->dq_y2_dc = 0; 450 enc->dq_y2_ac = 0; 451 enc->dq_uv_dc = dq_uv_dc; 452 enc->dq_uv_ac = dq_uv_ac; 453 454 SetupFilterStrength(enc); // initialize segments' filtering, eventually 455 456 if (num_segments > 1) SimplifySegments(enc); 457 458 SetupMatrices(enc); // finalize quantization matrices 459 } 460 461 //------------------------------------------------------------------------------ 462 // Form the predictions in cache 463 464 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index 465 const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 }; 466 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 }; 467 468 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index 469 static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = { 470 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 471 }; 472 473 void VP8MakeLuma16Preds(const VP8EncIterator* const it) { 474 const uint8_t* const left = it->x ? it->y_left : NULL; 475 const uint8_t* const top = it->y ? it->y_top : NULL; 476 VP8EncPredLuma16(it->yuv_p, left, top); 477 } 478 479 void VP8MakeChroma8Preds(const VP8EncIterator* const it) { 480 const uint8_t* const left = it->x ? it->u_left : NULL; 481 const uint8_t* const top = it->y ? 
it->uv_top : NULL; 482 VP8EncPredChroma8(it->yuv_p, left, top); 483 } 484 485 // Form all the ten Intra4x4 predictions in the 'yuv_p' cache 486 // for the 4x4 block it->i4 487 static void MakeIntra4Preds(const VP8EncIterator* const it) { 488 VP8EncPredLuma4(it->yuv_p, it->i4_top); 489 } 490 491 //------------------------------------------------------------------------------ 492 // Quantize 493 494 // Layout: 495 // +----+----+ 496 // |YYYY|UUVV| 0 497 // |YYYY|UUVV| 4 498 // |YYYY|....| 8 499 // |YYYY|....| 12 500 // +----+----+ 501 502 const uint16_t VP8Scan[16] = { // Luma 503 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, 504 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, 505 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, 506 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, 507 }; 508 509 static const uint16_t VP8ScanUV[4 + 4] = { 510 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U 511 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V 512 }; 513 514 //------------------------------------------------------------------------------ 515 // Distortion measurement 516 517 static const uint16_t kWeightY[16] = { 518 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 519 }; 520 521 static const uint16_t kWeightTrellis[16] = { 522 #if USE_TDISTO == 0 523 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 524 #else 525 30, 27, 19, 11, 526 27, 24, 17, 10, 527 19, 17, 12, 8, 528 11, 10, 8, 6 529 #endif 530 }; 531 532 // Init/Copy the common fields in score. 533 static void InitScore(VP8ModeScore* const rd) { 534 rd->D = 0; 535 rd->SD = 0; 536 rd->R = 0; 537 rd->H = 0; 538 rd->nz = 0; 539 rd->score = MAX_COST; 540 } 541 542 static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst, 543 const VP8ModeScore* WEBP_RESTRICT const src) { 544 dst->D = src->D; 545 dst->SD = src->SD; 546 dst->R = src->R; 547 dst->H = src->H; 548 dst->nz = src->nz; // note that nz is not accumulated, but just copied. 
549 dst->score = src->score; 550 } 551 552 static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst, 553 const VP8ModeScore* WEBP_RESTRICT const src) { 554 dst->D += src->D; 555 dst->SD += src->SD; 556 dst->R += src->R; 557 dst->H += src->H; 558 dst->nz |= src->nz; // here, new nz bits are accumulated. 559 dst->score += src->score; 560 } 561 562 //------------------------------------------------------------------------------ 563 // Performs trellis-optimized quantization. 564 565 // Trellis node 566 typedef struct { 567 int8_t prev; // best previous node 568 int8_t sign; // sign of coeff_i 569 int16_t level; // level 570 } Node; 571 572 // Score state 573 typedef struct { 574 score_t score; // partial RD score 575 const uint16_t* costs; // shortcut to cost tables 576 } ScoreState; 577 578 // If a coefficient was quantized to a value Q (using a neutral bias), 579 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA] 580 // We don't test negative values though. 581 #define MIN_DELTA 0 // how much lower level to try 582 #define MAX_DELTA 1 // how much higher 583 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA) 584 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA]) 585 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA]) 586 587 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) { 588 rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD); 589 } 590 591 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate, 592 score_t distortion) { 593 return rate * lambda + RD_DISTO_MULT * distortion; 594 } 595 596 // Coefficient type. 
597 enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 }; 598 599 static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc, 600 int16_t in[16], int16_t out[16], 601 int ctx0, int coeff_type, 602 const VP8Matrix* WEBP_RESTRICT const mtx, 603 int lambda) { 604 const ProbaArray* const probas = enc->proba.coeffs[coeff_type]; 605 CostArrayPtr const costs = 606 (CostArrayPtr)enc->proba.remapped_costs[coeff_type]; 607 const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0; 608 Node nodes[16][NUM_NODES]; 609 ScoreState score_states[2][NUM_NODES]; 610 ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA); 611 ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA); 612 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous 613 score_t best_score; 614 int n, m, p, last; 615 616 { 617 score_t cost; 618 const int thresh = mtx->q[1] * mtx->q[1] / 4; 619 const int last_proba = probas[VP8EncBands[first]][ctx0][0]; 620 621 // compute the position of the last interesting coefficient 622 last = first - 1; 623 for (n = 15; n >= first; --n) { 624 const int j = kZigzag[n]; 625 const int err = in[j] * in[j]; 626 if (err > thresh) { 627 last = n; 628 break; 629 } 630 } 631 // we don't need to go inspect up to n = 16 coeffs. We can just go up 632 // to last + 1 (inclusive) without losing much. 633 if (last < 15) ++last; 634 635 // compute 'skip' score. This is the max score one can do. 636 cost = VP8BitCost(0, last_proba); 637 best_score = RDScoreTrellis(lambda, cost, 0); 638 639 // initialize source node. 640 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { 641 const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0; 642 ss_cur[m].score = RDScoreTrellis(lambda, rate, 0); 643 ss_cur[m].costs = costs[first][ctx0]; 644 } 645 } 646 647 // traverse trellis. 
648 for (n = first; n <= last; ++n) { 649 const int j = kZigzag[n]; 650 const uint32_t Q = mtx->q[j]; 651 const uint32_t iQ = mtx->iq[j]; 652 const uint32_t B = BIAS(0x00); // neutral bias 653 // note: it's important to take sign of the _original_ coeff, 654 // so we don't have to consider level < 0 afterward. 655 const int sign = (in[j] < 0); 656 const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen[j]; 657 int level0 = QUANTDIV(coeff0, iQ, B); 658 int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80)); 659 if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL; 660 if (level0 > MAX_LEVEL) level0 = MAX_LEVEL; 661 662 { // Swap current and previous score states 663 ScoreState* const tmp = ss_cur; 664 ss_cur = ss_prev; 665 ss_prev = tmp; 666 } 667 668 // test all alternate level values around level0. 669 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { 670 Node* const cur = &NODE(n, m); 671 const int level = level0 + m; 672 const int ctx = (level > 2) ? 2 : level; 673 const int band = VP8EncBands[n + 1]; 674 score_t base_score; 675 score_t best_cur_score; 676 int best_prev; 677 score_t cost, score; 678 679 ss_cur[m].costs = costs[n + 1][ctx]; 680 if (level < 0 || level > thresh_level) { 681 ss_cur[m].score = MAX_COST; 682 // Node is dead. 683 continue; 684 } 685 686 { 687 // Compute delta_error = how much coding this level will 688 // subtract to max_error as distortion. 689 // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2 690 const int new_error = coeff0 - level * Q; 691 const int delta_error = 692 kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0); 693 base_score = RDScoreTrellis(lambda, 0, delta_error); 694 } 695 696 // Inspect all possible non-dead predecessors. Retain only the best one. 697 // The base_score is added to all scores so it is only added for the final 698 // value after the loop. 
699 cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level); 700 best_cur_score = 701 ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0); 702 best_prev = -MIN_DELTA; 703 for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) { 704 // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically 705 // eliminated since their score can't be better than the current best. 706 cost = VP8LevelCost(ss_prev[p].costs, level); 707 // Examine node assuming it's a non-terminal one. 708 score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0); 709 if (score < best_cur_score) { 710 best_cur_score = score; 711 best_prev = p; 712 } 713 } 714 best_cur_score += base_score; 715 // Store best finding in current node. 716 cur->sign = sign; 717 cur->level = level; 718 cur->prev = best_prev; 719 ss_cur[m].score = best_cur_score; 720 721 // Now, record best terminal node (and thus best entry in the graph). 722 if (level != 0 && best_cur_score < best_score) { 723 const score_t last_pos_cost = 724 (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0; 725 const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0); 726 score = best_cur_score + last_pos_score; 727 if (score < best_score) { 728 best_score = score; 729 best_path[0] = n; // best eob position 730 best_path[1] = m; // best node index 731 best_path[2] = best_prev; // best predecessor 732 } 733 } 734 } 735 } 736 737 // Fresh start 738 // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case. 739 if (coeff_type == TYPE_I16_AC) { 740 memset(in + 1, 0, 15 * sizeof(*in)); 741 memset(out + 1, 0, 15 * sizeof(*out)); 742 } else { 743 memset(in, 0, 16 * sizeof(*in)); 744 memset(out, 0, 16 * sizeof(*out)); 745 } 746 if (best_path[0] == -1) { 747 return 0; // skip! 748 } 749 750 { 751 // Unwind the best path. 752 // Note: best-prev on terminal node is not necessarily equal to the 753 // best_prev for non-terminal. So we patch best_path[2] in. 
754 int nz = 0; 755 int best_node = best_path[1]; 756 n = best_path[0]; 757 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal 758 759 for (; n >= first; --n) { 760 const Node* const node = &NODE(n, best_node); 761 const int j = kZigzag[n]; 762 out[n] = node->sign ? -node->level : node->level; 763 nz |= node->level; 764 in[j] = out[n] * mtx->q[j]; 765 best_node = node->prev; 766 } 767 return (nz != 0); 768 } 769 } 770 771 #undef NODE 772 773 //------------------------------------------------------------------------------ 774 // Performs: difference, transform, quantize, back-transform, add 775 // all at once. Output is the reconstructed block in *yuv_out, and the 776 // quantized levels in *levels. 777 778 static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it, 779 VP8ModeScore* WEBP_RESTRICT const rd, 780 uint8_t* WEBP_RESTRICT const yuv_out, 781 int mode) { 782 const VP8Encoder* const enc = it->enc; 783 const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode]; 784 const uint8_t* const src = it->yuv_in + Y_OFF_ENC; 785 const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment]; 786 int nz = 0; 787 int n; 788 int16_t tmp[16][16], dc_tmp[16]; 789 790 for (n = 0; n < 16; n += 2) { 791 VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); 792 } 793 VP8FTransformWHT(tmp[0], dc_tmp); 794 nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2) << 24; 795 796 if (DO_TRELLIS_I16 && it->do_trellis) { 797 int x, y; 798 VP8IteratorNzToBytes(it); 799 for (y = 0, n = 0; y < 4; ++y) { 800 for (x = 0; x < 4; ++x, ++n) { 801 const int ctx = it->top_nz[x] + it->left_nz[y]; 802 const int non_zero = TrellisQuantizeBlock( 803 enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1, 804 dqm->lambda_trellis_i16); 805 it->top_nz[x] = it->left_nz[y] = non_zero; 806 rd->y_ac_levels[n][0] = 0; 807 nz |= non_zero << n; 808 } 809 } 810 } else { 811 for (n = 0; n < 16; n += 2) { 812 // Zero-out the first coeff, so that: a) nz is 
correct below, and 813 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified. 814 tmp[n][0] = tmp[n + 1][0] = 0; 815 nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1) << n; 816 assert(rd->y_ac_levels[n + 0][0] == 0); 817 assert(rd->y_ac_levels[n + 1][0] == 0); 818 } 819 } 820 821 // Transform back 822 VP8TransformWHT(dc_tmp, tmp[0]); 823 for (n = 0; n < 16; n += 2) { 824 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1); 825 } 826 827 return nz; 828 } 829 830 static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it, 831 int16_t levels[16], 832 const uint8_t* WEBP_RESTRICT const src, 833 uint8_t* WEBP_RESTRICT const yuv_out, 834 int mode) { 835 const VP8Encoder* const enc = it->enc; 836 const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode]; 837 const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment]; 838 int nz = 0; 839 int16_t tmp[16]; 840 841 VP8FTransform(src, ref, tmp); 842 if (DO_TRELLIS_I4 && it->do_trellis) { 843 const int x = it->i4 & 3, y = it->i4 >> 2; 844 const int ctx = it->top_nz[x] + it->left_nz[y]; 845 nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1, 846 dqm->lambda_trellis_i4); 847 } else { 848 nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1); 849 } 850 VP8ITransform(ref, tmp, yuv_out, 0); 851 return nz; 852 } 853 854 //------------------------------------------------------------------------------ 855 // DC-error diffusion 856 857 // Diffusion weights. We under-correct a bit (15/16th of the error is actually 858 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0. 859 #define C1 7 // fraction of error sent to the 4x4 block below 860 #define C2 8 // fraction of error sent to the 4x4 block on the right 861 #define DSHIFT 4 862 #define DSCALE 1 // storage descaling, needed to make the error fit int8_t 863 864 // Quantize as usual, but also compute and return the quantization error. 865 // Error is already divided by DSHIFT. 
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;  // work on the magnitude, re-apply the sign at the end
  if (V > (int)mtx->zthresh[0]) {
    const int qV = QUANTDIV(V, mtx->iq[0], mtx->bias[0]) * mtx->q[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  // below the zero-threshold: coefficient is dropped, whole value is the error.
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

// Propagates previously stored quantization errors (from the top and left
// neighbors) into the DC coefficients of the 2x2 chroma sub-blocks of each
// channel, quantizes those DCs, and records the outgoing errors in rd->derr
// so they can later be stored via StoreDiffusionErrors().
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {  // ch = 0: U plane, ch = 1: V plane
    const int8_t* const top = it->top_derr[it->x][ch];
    const int8_t* const left = it->left_derr[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];  // 4 sub-blocks of this channel
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

// Saves the diffusion errors of the chosen mode (rd->derr) into the
// iterator's left/top error arrays, where CorrectDCValues() will pick them
// up when processing the neighboring macroblocks.
static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr[it->x][ch];
    int8_t* const left = it->left_derr[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0] = rd->derr[ch][1];             //     ... err2
    top[1] = rd->derr[ch][2] - left[1];   //     ... 1/4th of err3.
  }
}

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

// Quantizes and reconstructs the eight chroma 4x4 blocks (U then V) against
// prediction 'mode', with optional DC error diffusion and optional trellis
// quantization. Returns the non-zero bitmap shifted into the chroma bit
// positions (<< 16).
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc;
  const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr != NULL) CorrectDCValues(it, &dqm->uv, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz[4 + ch + x] + it->left_nz[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv,
              dqm->lambda_trellis_uv);
          it->top_nz[4 + ch + x] = it->left_nz[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n <
8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv) << n;
    }
  }

  // Transform back
  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);   // chroma non-zero bits live in the upper half
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine what is the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge) dqm->max_edge = max_v;
}

// Swaps two VP8ModeScore pointers (used to ping-pong 'current' vs 'best').
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

// Swaps the iterator's main and scratch output buffers, so the best
// reconstruction so far always ends up in it->yuv_out.
static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out, &it->yuv_out2);
}

// Evaluates all intra16 prediction modes and records the winner in *rd.
// The reconstructed samples of the winning mode are left in it->yuv_out.
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i16;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);  // keep the best reconstruction in it->yuv_out
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc->preds_w;
  const int x = (it->i4 & 3), y = it->i4 >> 2;
  // neighbors inside this macroblock come from modes[]; neighbors in the
  // adjacent macroblock come from the it->preds[] map.
  const int left = (x == 0) ? it->preds[y * preds_w - 1] : modes[it->i4 - 1];
  const int top = (y == 0) ?
it->preds[-preds_w + x] : modes[it->i4 - 4];
  return VP8FixedCostsI4[top][left];
}

// Tries intra4x4 coding for the whole macroblock. Returns 1 if the
// accumulated intra4 score beats the intra16 score already stored in *rd
// (in which case *rd is updated), 0 otherwise (including the early-outs on
// score or on the header-bits budget).
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const VP8SegmentInfo* const dqm = &enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_i4;
  const int tlambda = dqm->tlambda;
  const uint8_t* const src0 = it->yuv_in + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2 + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits == 0) {
    return 0;   // intra4 disabled by configuration
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4];
    uint8_t* tmp_dst = it->yuv_p + I4TMP;   // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);  // keep the winning reconstruction
        memcpy(rd_best.y_ac_levels[it->i4], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4]));
      }
    }
    SetRDScore(dqm->lambda_mode, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // already worse than intra16: bail out
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits) {
      return 0;   // header-bits budget exceeded
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4]);
    }
    rd->modes_i4[it->i4] = best_mode;
    it->top_nz[it->i4 & 3] = it->left_nz[it->i4 >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

// Evaluates all chroma (UV) prediction modes and records the winner in *rd.
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  const int lambda = dqm->lambda_uv;
  const uint8_t* const src = it->yuv_in + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2 + U_OFF_ENC;   // scratch buffer
  uint8_t* dst0 = it->yuv_out + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);  // track which buffer holds the best samples
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

// Reconstructs the macroblock with the modes already selected (no mode
// search): intra16 as a whole, or intra4 sub-block by sub-block, then chroma.
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc;
  const int is_i16 = (it->mb->type == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds[(it->i4 & 3) + (it->i4 >> 2) * enc->preds_w];
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      uint8_t* const dst = it->yuv_out + Y_OFF_ENC + VP8Scan[it->i4];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                              src, dst, mode) << it->i4;
    } while (VP8IteratorRotateI4(it, it->yuv_out + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb->type == 1);

  const VP8SegmentInfo* const dqm = &it->enc->dqm[it->mb->segment];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc->mb_header_limit
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;   // mode too expensive for the header-bits budget
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x == 0 || it->y == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x == 0) ? 0 : 2;
        try_both_modes = 0;   // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in + Y_OFF_ENC + VP8Scan[it->i4];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {   // reconstruct partial block inside yuv_out2 buffer
        uint8_t* const tmp_dst = it->yuv_out2 + Y_OFF_ENC + VP8Scan[it->i4];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4],
                                src, tmp_dst, best_i4_mode) << it->i4;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2 + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // intra4 samples were built in yuv_out2
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out + Y_OFF_ENC, it->preds[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out + U_OFF_ENC, it->mb->uv_mode);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

// Runs mode decision + quantization/reconstruction for the current
// macroblock, using full RD-optimization when 'rd_opt' asks for it and the
// faster distortion-based refinement otherwise. Returns non-zero if the
// macroblock has no non-zero coefficients at all (i.e. can be skipped).
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc->method;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}