// frame_dec.c
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Frame-reconstruction function. Memory allocation.
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "src/dec/common_dec.h"
#include "src/dec/vp8_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/dsp/dsp.h"
#include "src/utils/random_utils.h"
#include "src/utils/thread_utils.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h"
#include "src/webp/types.h"

//------------------------------------------------------------------------------
// Main reconstruction function.

// Offsets (within the yuv_b luma scratch area) of the sixteen 4x4 sub-blocks
// of a macroblock, in raster order.
static const uint16_t kScan[16] = {
  0 +  0 * BPS,  4 +  0 * BPS,  8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS,  8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS,  8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS,  8 + 12 * BPS, 12 + 12 * BPS
};

// Remap a DC prediction mode to the border-aware variant when top and/or left
// context samples are unavailable (macroblock on the first row/column).
// Non-DC modes are returned unchanged.
static int CheckMode(int mb_x, int mb_y, int mode) {
  if (mode == B_DC_PRED) {
    if (mb_x == 0) {
      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
    } else {
      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
    }
  }
  return mode;
}

// Copy four bytes (one 4-pixel run); memcpy sidesteps alignment issues.
static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
  memcpy(dst, src, 4);
}

// Apply the inverse transform for one 4x4 luma block, dispatching on the two
// top bits of 'bits': 3 -> full transform, 2 -> DC + first three AC coeffs,
// 1 -> DC only, 0 -> all coefficients zero (nothing to add).
static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
                                    uint8_t* const dst) {
  switch (bits >> 30) {
    case 3:
      VP8Transform(src, dst, 0);
      break;
    case 2:
      VP8TransformAC3(src, dst);
      break;
    case 1:
      VP8TransformDC(src, dst);
      break;
    default:
      break;
  }
}

// Apply the inverse transform to the four 4x4 blocks of one chroma plane,
// using the low 8 bits of 'bits' as the non-zero-coefficient map.
static void DoUVTransform(uint32_t bits, const int16_t* const src,
                          uint8_t* const dst) {
  if (bits & 0xff) {    // any non-zero coeff at all?
    if (bits & 0xaa) {  // any non-zero AC coefficient?
      VP8TransformUV(src, dst);  // note we don't use the AC3 variant for U/V
    } else {
      VP8TransformDCUV(src, dst);
    }
  }
}

// Reconstruct one macroblock row: intra-predict and add residuals into the
// 'yuv_b' scratch buffer, then copy the finished samples into cache line
// 'ctx->id' of the row cache (dec->cache_y/u/v).
static void ReconstructRow(const VP8Decoder* const dec,
                           const VP8ThreadContext* ctx) {
  int j;
  int mb_x;
  const int mb_y = ctx->mb_y;
  const int cache_id = ctx->id;
  uint8_t* const y_dst = dec->yuv_b + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b + U_OFF;
  uint8_t* const v_dst = dec->yuv_b + V_OFF;

  // Initialize left-most block (129 is the VP8 default left-border sample).
  for (j = 0; j < 16; ++j) {
    y_dst[j * BPS - 1] = 129;
  }
  for (j = 0; j < 8; ++j) {
    u_dst[j * BPS - 1] = 129;
    v_dst[j * BPS - 1] = 129;
  }

  // Init top-left sample on left column too.
  if (mb_y > 0) {
    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
  } else {
    // we only need to do this init once at block (0,0).
    // Afterward, it remains valid for the whole topmost row.
    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
    memset(u_dst - BPS - 1, 127, 8 + 1);
    memset(v_dst - BPS - 1, 127, 8 + 1);
  }

  // Reconstruct one row.
  for (mb_x = 0; mb_x < dec->mb_w; ++mb_x) {
    const VP8MBData* const block = ctx->mb_data + mb_x;

    // Rotate in the left samples from previously decoded block. We move four
    // pixels at a time for alignment reason, and because of in-loop filter.
    if (mb_x > 0) {
      for (j = -1; j < 16; ++j) {
        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
      for (j = -1; j < 8; ++j) {
        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
      }
    }
    {
      // bring top samples into the cache
      VP8TopSamples* const top_yuv = dec->yuv_t + mb_x;
      const int16_t* const coeffs = block->coeffs;
      uint32_t bits = block->non_zero_y;
      int n;

      if (mb_y > 0) {
        memcpy(y_dst - BPS, top_yuv[0].y, 16);
        memcpy(u_dst - BPS, top_yuv[0].u, 8);
        memcpy(v_dst - BPS, top_yuv[0].v, 8);
      }

      // predict and add residuals
      if (block->is_i4x4) {  // 4x4
        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);

        if (mb_y > 0) {
          if (mb_x >= dec->mb_w - 1) {  // on rightmost border
            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
          } else {
            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
          }
        }
        // replicate the top-right pixels below
        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];

        // predict and add residuals for all 4x4 blocks in turn.
        for (n = 0; n < 16; ++n, bits <<= 2) {
          uint8_t* const dst = y_dst + kScan[n];
          VP8PredLuma4[block->imodes[n]](dst);
          DoTransform(bits, coeffs + n * 16, dst);
        }
      } else {  // 16x16
        const int pred_func = CheckMode(mb_x, mb_y, block->imodes[0]);
        VP8PredLuma16[pred_func](y_dst);
        if (bits != 0) {
          for (n = 0; n < 16; ++n, bits <<= 2) {
            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
          }
        }
      }
      {
        // Chroma
        const uint32_t bits_uv = block->non_zero_uv;
        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode);
        VP8PredChroma8[pred_func](u_dst);
        VP8PredChroma8[pred_func](v_dst);
        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
      }

      // stash away top samples for next block
      if (mb_y < dec->mb_h - 1) {
        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
        memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8);
        memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8);
      }
    }
    // Transfer reconstructed samples from yuv_b cache to final destination.
    {
      const int y_offset = cache_id * 16 * dec->cache_y_stride;
      const int uv_offset = cache_id * 8 * dec->cache_uv_stride;
      uint8_t* const y_out = dec->cache_y + mb_x * 16 + y_offset;
      uint8_t* const u_out = dec->cache_u + mb_x * 8 + uv_offset;
      uint8_t* const v_out = dec->cache_v + mb_x * 8 + uv_offset;
      for (j = 0; j < 16; ++j) {
        memcpy(y_out + j * dec->cache_y_stride, y_dst + j * BPS, 16);
      }
      for (j = 0; j < 8; ++j) {
        memcpy(u_out + j * dec->cache_uv_stride, u_dst + j * BPS, 8);
        memcpy(v_out + j * dec->cache_uv_stride, v_dst + j * BPS, 8);
      }
    }
  }
}

//------------------------------------------------------------------------------
// Filtering

// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
// for caching, given a filtering level.
// Simple filter: up to 2 luma samples are read and 1 is written.
// Complex filter: up to 4 luma samples are read and 3 are written. Same for
// U/V, so it's 8 samples total (because of the 2x upsampling).
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

// In-loop filter one macroblock, using the per-MB strength precomputed into
// 'ctx->f_info'. Handles both the simple (luma-only) and the complex
// (luma + chroma) loop filters; edge filters (mb_x > 0 / mb_y > 0) use a
// looser limit (+4) than the inner ones.
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx;
  const int cache_id = ctx->id;
  const int y_bps = dec->cache_y_stride;
  const VP8FInfo* const f_info = ctx->f_info + mb_x;
  uint8_t* const y_dst = dec->cache_y + cache_id * 16 * y_bps + mb_x * 16;
  const int ilevel = f_info->f_ilevel;
  const int limit = f_info->f_limit;
  if (limit == 0) {
    return;  // filtering is disabled for this macroblock
  }
  assert(limit >= 3);
  if (dec->filter_type == 1) {  // simple
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner) {
      VP8SimpleHFilter16i(y_dst, y_bps, limit);
    }
    if (mb_y > 0) {
      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner) {
      VP8SimpleVFilter16i(y_dst, y_bps, limit);
    }
  } else {  // complex
    const int uv_bps = dec->cache_uv_stride;
    uint8_t* const u_dst = dec->cache_u + cache_id * 8 * uv_bps + mb_x * 8;
    uint8_t* const v_dst = dec->cache_v + cache_id * 8 * uv_bps + mb_x * 8;
    const int hev_thresh = f_info->hev_thresh;
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner) {
      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
    if (mb_y > 0) {
      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner) {
      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
  }
}

// Filter the decoded macroblock row (if needed)
static void FilterRow(const VP8Decoder* const dec) {
  int mb_x;
  const int mb_y = dec->thread_ctx.mb_y;
  assert(dec->thread_ctx.filter_row);
  // only the [tl_mb_x, br_mb_x) range needs filtering (see VP8EnterCritical)
  for (mb_x = dec->tl_mb_x; mb_x < dec->br_mb_x; ++mb_x) {
    DoFilter(dec, mb_x, mb_y);
  }
}

//------------------------------------------------------------------------------
// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.

static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  if (dec->filter_type > 0) {
    int s;
    const VP8FilterHeader* const hdr = &dec->filter_hdr;
    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
      int i4x4;
      // First, compute the initial level
      int base_level;
      if (dec->segment_hdr.use_segment) {
        base_level = dec->segment_hdr.filter_strength[s];
        if (!dec->segment_hdr.absolute_delta) {
          base_level += hdr->level;
        }
      } else {
        base_level = hdr->level;
      }
      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
        VP8FInfo* const info = &dec->fstrengths[s][i4x4];
        int level = base_level;
        if (hdr->use_lf_delta) {
          // ref_lf_delta[0] / mode_lf_delta[0] adjust the strength for this
          // reference/mode combination.
          level += hdr->ref_lf_delta[0];
          if (i4x4) {
            level += hdr->mode_lf_delta[0];
          }
        }
        level = (level < 0) ? 0 : (level > 63) ? 63 : level;  // clamp to [0,63]
        if (level > 0) {
          int ilevel = level;
          // derive the inner-edge level from 'sharpness'
          if (hdr->sharpness > 0) {
            if (hdr->sharpness > 4) {
              ilevel >>= 2;
            } else {
              ilevel >>= 1;
            }
            if (ilevel > 9 - hdr->sharpness) {
              ilevel = 9 - hdr->sharpness;
            }
          }
          if (ilevel < 1) ilevel = 1;
          info->f_ilevel = ilevel;
          info->f_limit = 2 * level + ilevel;
          info->hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
        } else {
          info->f_limit = 0;  // no filtering
        }
        info->f_inner = i4x4;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Dithering

// minimal amp that will provide a non-zero dithering effect
#define MIN_DITHER_AMP 4

#define DITHER_AMP_TAB_SIZE 12
static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};

// Set up chroma dithering (and clamp the alpha-dithering strength) from the
// user-supplied decoder options. A NULL 'options' disables everything.
void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec) {
  assert(dec != NULL);
  if (options != NULL) {
    const int d = options->dithering_strength;
    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
    // map dithering_strength [0..100] to a fixed-point amplitude [0..max_amp]
    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
    if (f > 0) {
      int s;
      int all_amp = 0;
      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
        VP8QuantMatrix* const dqm = &dec->dqm[s];
        if (dqm->uv_quant < DITHER_AMP_TAB_SIZE) {
          const int idx = (dqm->uv_quant < 0) ? 0 : dqm->uv_quant;
          dqm->dither = (f * kQuantToDitherAmp[idx]) >> 3;
        }
        all_amp |= dqm->dither;
      }
      if (all_amp != 0) {
        VP8InitRandom(&dec->dithering_rg, 1.0f);
        dec->dither = 1;
      }
    }
    // potentially allow alpha dithering
    dec->alpha_dithering = options->alpha_dithering_strength;
    if (dec->alpha_dithering > 100) {
      dec->alpha_dithering = 100;
    } else if (dec->alpha_dithering < 0) {
      dec->alpha_dithering = 0;
    }
  }
}

// Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
  uint8_t dither[64];
  int i;
  for (i = 0; i < 8 * 8; ++i) {
    dither[i] = VP8RandomBits2(rg, VP8_DITHER_AMP_BITS + 1, amp);
  }
  VP8DitherCombine8x8(dither, dst, bps);
}

// Dither the U/V planes of each macroblock in the current row whose
// per-block amplitude reaches MIN_DITHER_AMP.
static void DitherRow(VP8Decoder* const dec) {
  int mb_x;
  assert(dec->dither);
  for (mb_x = dec->tl_mb_x; mb_x < dec->br_mb_x; ++mb_x) {
    const VP8ThreadContext* const ctx = &dec->thread_ctx;
    const VP8MBData* const data = ctx->mb_data + mb_x;
    const int cache_id = ctx->id;
    const int uv_bps = dec->cache_uv_stride;
    if (data->dither >= MIN_DITHER_AMP) {
      uint8_t* const u_dst = dec->cache_u + cache_id * 8 * uv_bps + mb_x * 8;
      uint8_t* const v_dst = dec->cache_v + cache_id * 8 * uv_bps + mb_x * 8;
      Dither8x8(&dec->dithering_rg, u_dst, uv_bps, data->dither);
      Dither8x8(&dec->dithering_rg, v_dst, uv_bps, data->dither);
    }
  }
}

//------------------------------------------------------------------------------
// This function is called after a row of macroblocks is finished decoding.
// It also takes into account the following restrictions:
// * In case of in-loop filtering, we must hold off sending some of the bottom
//   pixels as they are yet unfiltered. They will be when the next macroblock
//   row is decoded.
//   Meanwhile, we must preserve them by rotating them in the
//   cache area. This doesn't hold for the very bottom row of the uncropped
//   picture of course.
// * we must clip the remaining pixels against the cropping area. The VP8Io
//   struct must have the following fields set correctly before calling put():

#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16)  // vertical position of a MB

// Finalize and transmit a complete row. Return false in case of user-abort.
// (worker-hook signature: arg1 is the VP8Decoder, arg2 the VP8Io)
static int FinishRow(void* arg1, void* arg2) {
  VP8Decoder* const dec = (VP8Decoder*)arg1;
  VP8Io* const io = (VP8Io*)arg2;
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx;
  const int cache_id = ctx->id;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type];
  const int ysize = extra_y_rows * dec->cache_y_stride;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride;
  const int y_offset = cache_id * 16 * dec->cache_y_stride;
  const int uv_offset = cache_id * 8 * dec->cache_uv_stride;
  // destinations start 'extra' rows above the cache line, where the held-back
  // (previously unfiltered) bottom rows of the previous row were rotated to
  uint8_t* const ydst = dec->cache_y - ysize + y_offset;
  uint8_t* const udst = dec->cache_u - uvsize + uv_offset;
  uint8_t* const vdst = dec->cache_v - uvsize + uv_offset;
  const int mb_y = ctx->mb_y;
  const int is_first_row = (mb_y == 0);
  const int is_last_row = (mb_y >= dec->br_mb_y - 1);

  if (dec->mt_method == 2) {
    // reconstruction was deferred to this worker thread
    ReconstructRow(dec, ctx);
  }

  if (ctx->filter_row) {
    FilterRow(dec);
  }

  if (dec->dither) {
    DitherRow(dec);
  }

  if (io->put != NULL) {
    int y_start = MACROBLOCK_VPOS(mb_y);
    int y_end = MACROBLOCK_VPOS(mb_y + 1);
    if (!is_first_row) {
      // include the held-back rows from the previous macroblock row
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
      io->v = vdst;
    } else {
      io->y = dec->cache_y + y_offset;
      io->u = dec->cache_u + uv_offset;
      io->v = dec->cache_v + uv_offset;
    }

    if (!is_last_row) {
      // hold back the bottom rows; they are not fully filtered yet
      y_end -= extra_y_rows;
    }
    if (y_end > io->crop_bottom) {
      y_end = io->crop_bottom;  // make sure we don't overflow on last row.
    }
    // If dec->alpha_data is not NULL, we have some alpha plane present.
    io->a = NULL;
    if (dec->alpha_data != NULL && y_start < y_end) {
      io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                           "Could not decode alpha data.");
      }
    }
    // clip against the top cropping boundary
    if (y_start < io->crop_top) {
      const int delta_y = io->crop_top - y_start;
      y_start = io->crop_top;
      assert(!(delta_y & 1));  // crop_top is even, so chroma stays aligned
      io->y += dec->cache_y_stride * delta_y;
      io->u += dec->cache_uv_stride * (delta_y >> 1);
      io->v += dec->cache_uv_stride * (delta_y >> 1);
      if (io->a != NULL) {
        io->a += io->width * delta_y;
      }
    }
    if (y_start < y_end) {
      // apply the left cropping offset and emit the rows
      io->y += io->crop_left;
      io->u += io->crop_left >> 1;
      io->v += io->crop_left >> 1;
      if (io->a != NULL) {
        io->a += io->crop_left;
      }
      io->mb_y = y_start - io->crop_top;
      io->mb_w = io->crop_right - io->crop_left;
      io->mb_h = y_end - y_start;
      ok = io->put(io);
    }
  }
  // rotate top samples if needed
  if (cache_id + 1 == dec->num_caches) {
    if (!is_last_row) {
      // preserve the not-yet-filtered bottom rows for the next pass
      memcpy(dec->cache_y - ysize, ydst + 16 * dec->cache_y_stride, ysize);
      memcpy(dec->cache_u - uvsize, udst + 8 * dec->cache_uv_stride, uvsize);
      memcpy(dec->cache_v - uvsize, vdst + 8 * dec->cache_uv_stride, uvsize);
    }
  }

  return ok;
}

#undef MACROBLOCK_VPOS

//------------------------------------------------------------------------------

int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  VP8ThreadContext* const ctx = &dec->thread_ctx;
  const int filter_row =
      (dec->filter_type > 0) &&
      (dec->mb_y >= dec->tl_mb_y) && (dec->mb_y <= dec->br_mb_y);
  if (dec->mt_method == 0) {
    // single-threaded: reconstruct, filter and output synchronously.
    // ctx->id and ctx->f_info are already set
    ctx->mb_y = dec->mb_y;
    ctx->filter_row = filter_row;
    ReconstructRow(dec, ctx);
    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker;
    // Finish previous job *before* updating context
    ok &= WebPGetWorkerInterface()->Sync(worker);
    assert(worker->status == OK);
    if (ok) {  // spawn a new deblocking/output job
      ctx->io = *io;
      ctx->id = dec->cache_id;
      ctx->mb_y = dec->mb_y;
      ctx->filter_row = filter_row;
      if (dec->mt_method == 2) {  // swap macroblock data
        VP8MBData* const tmp = ctx->mb_data;
        ctx->mb_data = dec->mb_data;
        dec->mb_data = tmp;
      } else {
        // perform reconstruction directly in main thread
        ReconstructRow(dec, ctx);
      }
      if (filter_row) {  // swap filter info
        VP8FInfo* const tmp = ctx->f_info;
        ctx->f_info = dec->f_info;
        dec->f_info = tmp;
      }
      // (reconstruct)+filter in parallel
      WebPGetWorkerInterface()->Launch(worker);
      if (++dec->cache_id == dec->num_caches) {
        dec->cache_id = 0;
      }
    }
  }
  return ok;
}

//------------------------------------------------------------------------------
// Finish setting up the decoding parameter once user's setup() is called.

VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Call setup() first. This may trigger additional decoding features on 'io'.
  // Note: Afterward, we must call teardown() no matter what.
  if (io->setup != NULL && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status;
  }

  // Disable filtering per user request
  if (io->bypass_filtering) {
    dec->filter_type = 0;
  }

  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
  // 'Simple' filter reads two luma samples outside of the macroblock
  // and filters one. It doesn't filter the chroma samples. Hence, we can
  // avoid doing the in-loop filtering before crop_top/crop_left position.
  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
  // Means: there's a dependency chain that goes all the way up to the
  // top-left corner of the picture (MB #0). We must filter all the previous
  // macroblocks.
  {
    const int extra_pixels = kFilterExtraRows[dec->filter_type];
    if (dec->filter_type == 2) {
      // For complex filter, we need to preserve the dependency chain.
      dec->tl_mb_x = 0;
      dec->tl_mb_y = 0;
    } else {
      // For simple filter, we can filter only the cropped region.
      // We include 'extra_pixels' on the other side of the boundary, since
      // vertical or horizontal filtering of the previous macroblock can
      // modify some abutting pixels.
      dec->tl_mb_x = (io->crop_left - extra_pixels) >> 4;
      dec->tl_mb_y = (io->crop_top - extra_pixels) >> 4;
      if (dec->tl_mb_x < 0) dec->tl_mb_x = 0;
      if (dec->tl_mb_y < 0) dec->tl_mb_y = 0;
    }
    // We need some 'extra' pixels on the right/bottom.
    dec->br_mb_y = (io->crop_bottom + 15 + extra_pixels) >> 4;
    dec->br_mb_x = (io->crop_right + 15 + extra_pixels) >> 4;
    if (dec->br_mb_x > dec->mb_w) {
      dec->br_mb_x = dec->mb_w;
    }
    if (dec->br_mb_y > dec->mb_h) {
      dec->br_mb_y = dec->mb_h;
    }
  }
  PrecomputeFilterStrengths(dec);
  return VP8_STATUS_OK;
}

int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  if (dec->mt_method > 0) {
    // wait for the last in-flight row job before tearing down
    ok = WebPGetWorkerInterface()->Sync(&dec->worker);
  }

  if (io->teardown != NULL) {
    io->teardown(io);
  }
  return ok;
}

//------------------------------------------------------------------------------
// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
//
// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
// immediately, and needs to wait for first few rows of the next macroblock to
// be decoded.
// Hence, deblocking is lagging behind by 4 or 8 pixels (depending
// on strength).
// With two threads, the vertical positions of the rows being decoded are:
// Decode:   [ 0..15][16..31][32..47][48..63][64..79][...
// Deblock:          [ 0..11][12..27][28..43][44..59][...
// If we use two threads and two caches of 16 pixels, the sequence would be:
// Decode:   [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
// Deblock:          [ 0..11][12..27!!][-4..11][12..27][...
// The problem occurs during row [12..15!!] that both the decoding and
// deblocking threads are writing simultaneously.
// With 3 cache lines, one get a safe write pattern:
// Decode:   [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
// Deblock:          [ 0..11][12..27][28..43][-4..11][12..27][28...
// Note that multi-threaded output _without_ deblocking can make use of two
// cache lines of 16 pixels only, since there's no lagging behind. The decoding
// and output process have non-concurrent writing:
// Decode:   [ 0..15][16..31][ 0..15][16..31][...
// io->put:  [ 0..15][16..31][ 0..15][...

#define MT_CACHE_LINES 3
#define ST_CACHE_LINES 1  // 1 cache row only for single-threaded case

// Initialize multi/single-thread worker
static int InitThreadContext(VP8Decoder* const dec) {
  dec->cache_id = 0;
  if (dec->mt_method > 0) {
    WebPWorker* const worker = &dec->worker;
    if (!WebPGetWorkerInterface()->Reset(worker)) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                         "thread initialization failed.");
    }
    worker->data1 = dec;
    worker->data2 = (void*)&dec->thread_ctx.io;
    worker->hook = FinishRow;
    // 3 cache lines when deblocking lags behind, 2 otherwise (see above)
    dec->num_caches =
        (dec->filter_type > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
  } else {
    dec->num_caches = ST_CACHE_LINES;
  }
  return 1;
}

// Pick the multi-threading method: 0 = off, 2 = reconstruction+filtering
// split across threads. Returns 0 unless threads are enabled in 'options'
// and the picture is wide enough to amortize the threading overhead.
int VP8GetThreadMethod(const WebPDecoderOptions* const options,
                       const WebPHeaderStructure* const headers,
                       int width, int height) {
  if (options == NULL || options->use_threads == 0) {
    return 0;
  }
  (void)headers;
  (void)width;
  (void)height;
  assert(headers == NULL || !headers->is_lossless);
#if defined(WEBP_USE_THREAD)
  if (width >= MIN_WIDTH_FOR_THREADS) return 2;
#endif
  return 0;
}

#undef MT_CACHE_LINES
#undef ST_CACHE_LINES

//------------------------------------------------------------------------------
// Memory setup

// Allocate (or reuse) the single memory arena 'dec->mem' and carve it into
// all per-frame buffers: intra modes, top samples, MB info, filter info,
// yuv scratch, MB data, the row cache, and the optional alpha plane.
// Returns 0 on overflow or allocation failure (error set on 'dec').
static int AllocateMemory(VP8Decoder* const dec) {
  const int num_caches = dec->num_caches;
  const int mb_w = dec->mb_w;
  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
  // two f_info rows in multi-threaded mode (they get pointer-swapped)
  const size_t f_info_size =
      (dec->filter_type > 0) ?
          mb_w * (dec->mt_method > 0 ? 2 : 1) * sizeof(VP8FInfo)
        : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b);
  const size_t mb_data_size =
      (dec->mt_method == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data);
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type]) * 3 / 2;
  const size_t cache_size = top_size * cache_height;
  // alpha_size is the only one that scales as width x height.
  const uint64_t alpha_size = (dec->alpha_data != NULL) ?
      (uint64_t)dec->pic_hdr.width * dec->pic_hdr.height : 0ULL;
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
                        + yuv_size + mb_data_size
                        + cache_size + alpha_size + WEBP_ALIGN_CST;
  uint8_t* mem;

  if (!CheckSizeOverflow(needed)) return 0;  // check for overflow
  if (needed > dec->mem_size) {
    WebPSafeFree(dec->mem);
    dec->mem_size = 0;
    dec->mem = WebPSafeMalloc(needed, sizeof(uint8_t));
    if (dec->mem == NULL) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                         "no memory during frame initialization.");
    }
    // down-cast is ok, thanks to WebPSafeMalloc() above.
    dec->mem_size = (size_t)needed;
  }

  mem = (uint8_t*)dec->mem;
  dec->intra_t = mem;
  mem += intra_pred_mode_size;

  dec->yuv_t = (VP8TopSamples*)mem;
  mem += top_size;

  // mb_info[-1] is valid: it holds the left-context macroblock
  dec->mb_info = ((VP8MB*)mem) + 1;
  mem += mb_info_size;

  dec->f_info = f_info_size ? (VP8FInfo*)mem : NULL;
  mem += f_info_size;
  dec->thread_ctx.id = 0;
  dec->thread_ctx.f_info = dec->f_info;
  if (dec->filter_type > 0 && dec->mt_method > 0) {
    // secondary cache line. The deblocking process need to make use of the
    // filtering strength from previous macroblock row, while the new ones
    // are being decoded in parallel. We'll just swap the pointers.
    dec->thread_ctx.f_info += mb_w;
  }

  mem = (uint8_t*)WEBP_ALIGN(mem);
  assert((yuv_size & WEBP_ALIGN_CST) == 0);
  dec->yuv_b = mem;
  mem += yuv_size;

  dec->mb_data = (VP8MBData*)mem;
  dec->thread_ctx.mb_data = (VP8MBData*)mem;
  if (dec->mt_method == 2) {
    // second mb_data row for the worker thread (pointer-swapped per row)
    dec->thread_ctx.mb_data += mb_w;
  }
  mem += mb_data_size;

  dec->cache_y_stride = 16 * mb_w;
  dec->cache_uv_stride = 8 * mb_w;
  {
    // the cache pointers are offset past the 'extra' rows reserved for the
    // held-back (not-yet-filtered) samples of the previous row
    const int extra_rows = kFilterExtraRows[dec->filter_type];
    const int extra_y = extra_rows * dec->cache_y_stride;
    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride;
    dec->cache_y = mem + extra_y;
    dec->cache_u = dec->cache_y
                 + 16 * num_caches * dec->cache_y_stride + extra_uv;
    dec->cache_v = dec->cache_u
                 + 8 * num_caches * dec->cache_uv_stride + extra_uv;
    dec->cache_id = 0;
  }
  mem += cache_size;

  // alpha plane
  dec->alpha_plane = alpha_size ? mem : NULL;
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem + dec->mem_size);

  // note: left/top-info is initialized once for all.
  memset(dec->mb_info - 1, 0, mb_info_size);
  VP8InitScanline(dec);  // initialize left too.

  // initialize top
  memset(dec->intra_t, B_DC_PRED, intra_pred_mode_size);

  return 1;
}

// Point the 'io' output planes/strides at the freshly-allocated row cache.
static void InitIo(VP8Decoder* const dec, VP8Io* io) {
  // prepare 'io'
  io->mb_y = 0;
  io->y = dec->cache_y;
  io->u = dec->cache_u;
  io->v = dec->cache_v;
  io->y_stride = dec->cache_y_stride;
  io->uv_stride = dec->cache_uv_stride;
  io->a = NULL;
}

int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) {
  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches.
  if (!AllocateMemory(dec)) return 0;
  InitIo(dec, io);
  VP8DspInit();  // Init critical function pointers and look-up tables.
  return 1;
}

//------------------------------------------------------------------------------