tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

frame_dec.c (28290B)


      1 // Copyright 2010 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // Frame-reconstruction function. Memory allocation.
     11 //
     12 // Author: Skal (pascal.massimino@gmail.com)
     13 
     14 #include <assert.h>
     15 #include <stdlib.h>
     16 #include <string.h>
     17 
     18 #include "src/dec/common_dec.h"
     19 #include "src/dec/vp8_dec.h"
     20 #include "src/dec/vp8i_dec.h"
     21 #include "src/dec/webpi_dec.h"
     22 #include "src/dsp/dsp.h"
     23 #include "src/utils/random_utils.h"
     24 #include "src/utils/thread_utils.h"
     25 #include "src/utils/utils.h"
     26 #include "src/webp/decode.h"
     27 #include "src/webp/types.h"
     28 
     29 //------------------------------------------------------------------------------
     30 // Main reconstruction function.
     31 
// Offsets (within the yuv_b scratch buffer) of the sixteen 4x4 luma
// sub-blocks of a macroblock, in raster order.
static const uint16_t kScan[16] = {
 0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
 0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
 0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
 0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
};
     38 
     39 static int CheckMode(int mb_x, int mb_y, int mode) {
     40  if (mode == B_DC_PRED) {
     41    if (mb_x == 0) {
     42      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
     43    } else {
     44      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
     45    }
     46  }
     47  return mode;
     48 }
     49 
     50 static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
     51  memcpy(dst, src, 4);
     52 }
     53 
     54 static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
     55                                    uint8_t* const dst) {
     56  switch (bits >> 30) {
     57    case 3:
     58      VP8Transform(src, dst, 0);
     59      break;
     60    case 2:
     61      VP8TransformAC3(src, dst);
     62      break;
     63    case 1:
     64      VP8TransformDC(src, dst);
     65      break;
     66    default:
     67      break;
     68  }
     69 }
     70 
     71 static void DoUVTransform(uint32_t bits, const int16_t* const src,
     72                          uint8_t* const dst) {
     73  if (bits & 0xff) {    // any non-zero coeff at all?
     74    if (bits & 0xaa) {  // any non-zero AC coefficient?
     75      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
     76    } else {
     77      VP8TransformDCUV(src, dst);
     78    }
     79  }
     80 }
     81 
// Reconstruct one full row of macroblocks (intra prediction + residuals),
// working in the small 'dec->yuv_b' scratch buffer whose left/top borders
// are primed with neighboring samples, then copy the finished pixels into
// the 'dec->cache_*' output rows selected by 'ctx->id'.
static void ReconstructRow(const VP8Decoder* const dec,
                           const VP8ThreadContext* ctx) {
  int j;
  int mb_x;
  const int mb_y = ctx->mb_y;
  const int cache_id = ctx->id;
  uint8_t* const y_dst = dec->yuv_b + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b + U_OFF;
  uint8_t* const v_dst = dec->yuv_b + V_OFF;

  // Initialize left-most block. (129 is the filler value used for missing
  // left samples; 127 below for missing top samples — cf. the VP8 spec.)
  for (j = 0; j < 16; ++j) {
    y_dst[j * BPS - 1] = 129;
  }
  for (j = 0; j < 8; ++j) {
    u_dst[j * BPS - 1] = 129;
    v_dst[j * BPS - 1] = 129;
  }

  // Init top-left sample on left column too.
  if (mb_y > 0) {
    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
  } else {
    // we only need to do this init once at block (0,0).
    // Afterward, it remains valid for the whole topmost row.
    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
    memset(u_dst - BPS - 1, 127, 8 + 1);
    memset(v_dst - BPS - 1, 127, 8 + 1);
  }

  // Reconstruct one row.
  for (mb_x = 0; mb_x < dec->mb_w; ++mb_x) {
    const VP8MBData* const block = ctx->mb_data + mb_x;

    // Rotate in the left samples from previously decoded block. We move four
    // pixels at a time for alignment reason, and because of in-loop filter.
    if (mb_x > 0) {
      for (j = -1; j < 16; ++j) {
        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
      for (j = -1; j < 8; ++j) {
        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
      }
    }
    {
      // bring top samples into the cache
      VP8TopSamples* const top_yuv = dec->yuv_t + mb_x;
      const int16_t* const coeffs = block->coeffs;
      uint32_t bits = block->non_zero_y;
      int n;

      if (mb_y > 0) {
        memcpy(y_dst - BPS, top_yuv[0].y, 16);
        memcpy(u_dst - BPS, top_yuv[0].u, 8);
        memcpy(v_dst - BPS, top_yuv[0].v, 8);
      }

      // predict and add residuals
      if (block->is_i4x4) {   // 4x4
        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);

        if (mb_y > 0) {
          if (mb_x >= dec->mb_w - 1) {    // on rightmost border
            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
          } else {
            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
          }
        }
        // replicate the top-right pixels below
        // (note: indexing a uint32_t* by BPS advances 4 * BPS bytes,
        //  i.e. four pixel rows at a time)
        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];

        // predict and add residuals for all 4x4 blocks in turn.
        for (n = 0; n < 16; ++n, bits <<= 2) {
          uint8_t* const dst = y_dst + kScan[n];
          VP8PredLuma4[block->imodes[n]](dst);
          DoTransform(bits, coeffs + n * 16, dst);
        }
      } else {    // 16x16
        const int pred_func = CheckMode(mb_x, mb_y, block->imodes[0]);
        VP8PredLuma16[pred_func](y_dst);
        if (bits != 0) {
          for (n = 0; n < 16; ++n, bits <<= 2) {
            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
          }
        }
      }
      {
        // Chroma: low byte of non_zero_uv covers U, the next byte covers V.
        const uint32_t bits_uv = block->non_zero_uv;
        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode);
        VP8PredChroma8[pred_func](u_dst);
        VP8PredChroma8[pred_func](v_dst);
        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
      }

      // stash away top samples for next block
      if (mb_y < dec->mb_h - 1) {
        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
      }
    }
    // Transfer reconstructed samples from yuv_b cache to final destination.
    {
      const int y_offset = cache_id * 16 * dec->cache_y_stride;
      const int uv_offset = cache_id * 8 * dec->cache_uv_stride;
      uint8_t* const y_out = dec->cache_y + mb_x * 16 + y_offset;
      uint8_t* const u_out = dec->cache_u + mb_x * 8 + uv_offset;
      uint8_t* const v_out = dec->cache_v + mb_x * 8 + uv_offset;
      for (j = 0; j < 16; ++j) {
        memcpy(y_out + j * dec->cache_y_stride, y_dst + j * BPS, 16);
      }
      for (j = 0; j < 8; ++j) {
        memcpy(u_out + j * dec->cache_uv_stride, u_dst + j * BPS, 8);
        memcpy(v_out + j * dec->cache_uv_stride, v_dst + j * BPS, 8);
      }
    }
  }
}
    203 
    204 //------------------------------------------------------------------------------
    205 // Filtering
    206 
// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
// for caching, given a filtering level.
// Simple filter:  up to 2 luma samples are read and 1 is written.
// Complex filter: up to 4 luma samples are read and 3 are written. Same for
//                 U/V, so it's 8 samples total (because of the 2x upsampling).
// Indexed by 'dec->filter_type': 0 = no filter, 1 = simple, 2 = complex.
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
    213 
// Apply the in-loop deblocking filter to macroblock (mb_x, mb_y) in the
// cache rows, using the precomputed strength stored in 'ctx->f_info'.
// A zero 'f_limit' means this macroblock needs no filtering at all.
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx;
  const int cache_id = ctx->id;
  const int y_bps = dec->cache_y_stride;
  const VP8FInfo* const f_info = ctx->f_info + mb_x;
  uint8_t* const y_dst = dec->cache_y + cache_id * 16 * y_bps + mb_x * 16;
  const int ilevel = f_info->f_ilevel;
  const int limit = f_info->f_limit;
  if (limit == 0) {
    return;   // filtering disabled for this macroblock
  }
  assert(limit >= 3);
  if (dec->filter_type == 1) {   // simple filter: luma plane only
    if (mb_x > 0) {   // left macroblock edge exists
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner) {   // inner edges
      VP8SimpleHFilter16i(y_dst, y_bps, limit);
    }
    if (mb_y > 0) {   // top macroblock edge exists
      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner) {
      VP8SimpleVFilter16i(y_dst, y_bps, limit);
    }
  } else {    // complex filter: luma and both chroma planes
    const int uv_bps = dec->cache_uv_stride;
    uint8_t* const u_dst = dec->cache_u + cache_id * 8 * uv_bps + mb_x * 8;
    uint8_t* const v_dst = dec->cache_v + cache_id * 8 * uv_bps + mb_x * 8;
    const int hev_thresh = f_info->hev_thresh;
    if (mb_x > 0) {
      // macroblock edges use a stronger limit (limit + 4)
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner) {
      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
    if (mb_y > 0) {
      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner) {
      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
  }
}
    262 
    263 // Filter the decoded macroblock row (if needed)
    264 static void FilterRow(const VP8Decoder* const dec) {
    265  int mb_x;
    266  const int mb_y = dec->thread_ctx.mb_y;
    267  assert(dec->thread_ctx.filter_row);
    268  for (mb_x = dec->tl_mb_x; mb_x < dec->br_mb_x; ++mb_x) {
    269    DoFilter(dec, mb_x, mb_y);
    270  }
    271 }
    272 
    273 //------------------------------------------------------------------------------
    274 // Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
    275 
// Precompute, for every segment and both block types (i16x16 and i4x4), the
// deblocking parameters: filter limit ('f_limit'), inner level ('f_ilevel')
// and high-edge-variance threshold ('hev_thresh'), all derived from the
// frame's filter header. No-op when filtering is disabled.
static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  if (dec->filter_type > 0) {
    int s;
    const VP8FilterHeader* const hdr = &dec->filter_hdr;
    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
      int i4x4;
      // First, compute the initial level
      int base_level;
      if (dec->segment_hdr.use_segment) {
        base_level = dec->segment_hdr.filter_strength[s];
        if (!dec->segment_hdr.absolute_delta) {
          // segment value is a delta relative to the frame-level strength
          base_level += hdr->level;
        }
      } else {
        base_level = hdr->level;
      }
      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
        VP8FInfo* const info = &dec->fstrengths[s][i4x4];
        int level = base_level;
        if (hdr->use_lf_delta) {
          level += hdr->ref_lf_delta[0];
          if (i4x4) {
            level += hdr->mode_lf_delta[0];
          }
        }
        level = (level < 0) ? 0 : (level > 63) ? 63 : level;  // clamp to [0, 63]
        if (level > 0) {
          int ilevel = level;
          // 'sharpness' attenuates and caps the inner filtering level.
          if (hdr->sharpness > 0) {
            if (hdr->sharpness > 4) {
              ilevel >>= 2;
            } else {
              ilevel >>= 1;
            }
            if (ilevel > 9 - hdr->sharpness) {
              ilevel = 9 - hdr->sharpness;
            }
          }
          if (ilevel < 1) ilevel = 1;
          info->f_ilevel = ilevel;
          info->f_limit = 2 * level + ilevel;
          // stronger levels enable the high-edge-variance check earlier
          info->hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
        } else {
          info->f_limit = 0;  // no filtering
        }
        info->f_inner = i4x4;   // inner-edge filtering is enabled for i4x4
      }
    }
  }
}
    326 
    327 //------------------------------------------------------------------------------
    328 // Dithering
    329 
// minimal amp that will provide a non-zero dithering effect
#define MIN_DITHER_AMP 4

#define DITHER_AMP_TAB_SIZE 12
// Dithering amplitude, indexed by the (clamped) chroma quantizer 'uv_quant'.
static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
 // roughly, it's dqm->uv_mat[1]
 8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};
    338 
    339 void VP8InitDithering(const WebPDecoderOptions* const options,
    340                      VP8Decoder* const dec) {
    341  assert(dec != NULL);
    342  if (options != NULL) {
    343    const int d = options->dithering_strength;
    344    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
    345    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
    346    if (f > 0) {
    347      int s;
    348      int all_amp = 0;
    349      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
    350        VP8QuantMatrix* const dqm = &dec->dqm[s];
    351        if (dqm->uv_quant < DITHER_AMP_TAB_SIZE) {
    352          const int idx = (dqm->uv_quant < 0) ? 0 : dqm->uv_quant;
    353          dqm->dither = (f * kQuantToDitherAmp[idx]) >> 3;
    354        }
    355        all_amp |= dqm->dither;
    356      }
    357      if (all_amp != 0) {
    358        VP8InitRandom(&dec->dithering_rg, 1.0f);
    359        dec->dither = 1;
    360      }
    361    }
    362    // potentially allow alpha dithering
    363    dec->alpha_dithering = options->alpha_dithering_strength;
    364    if (dec->alpha_dithering > 100) {
    365      dec->alpha_dithering = 100;
    366    } else if (dec->alpha_dithering < 0) {
    367      dec->alpha_dithering = 0;
    368    }
    369  }
    370 }
    371 
    372 // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
    373 static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
    374  uint8_t dither[64];
    375  int i;
    376  for (i = 0; i < 8 * 8; ++i) {
    377    dither[i] = VP8RandomBits2(rg, VP8_DITHER_AMP_BITS + 1, amp);
    378  }
    379  VP8DitherCombine8x8(dither, dst, bps);
    380 }
    381 
    382 static void DitherRow(VP8Decoder* const dec) {
    383  int mb_x;
    384  assert(dec->dither);
    385  for (mb_x = dec->tl_mb_x; mb_x < dec->br_mb_x; ++mb_x) {
    386    const VP8ThreadContext* const ctx = &dec->thread_ctx;
    387    const VP8MBData* const data = ctx->mb_data + mb_x;
    388    const int cache_id = ctx->id;
    389    const int uv_bps = dec->cache_uv_stride;
    390    if (data->dither >= MIN_DITHER_AMP) {
    391      uint8_t* const u_dst = dec->cache_u + cache_id * 8 * uv_bps + mb_x * 8;
    392      uint8_t* const v_dst = dec->cache_v + cache_id * 8 * uv_bps + mb_x * 8;
    393      Dither8x8(&dec->dithering_rg, u_dst, uv_bps, data->dither);
    394      Dither8x8(&dec->dithering_rg, v_dst, uv_bps, data->dither);
    395    }
    396  }
    397 }
    398 
    399 //------------------------------------------------------------------------------
    400 // This function is called after a row of macroblocks is finished decoding.
    401 // It also takes into account the following restrictions:
    402 //  * In case of in-loop filtering, we must hold off sending some of the bottom
    403 //    pixels as they are yet unfiltered. They will be when the next macroblock
    404 //    row is decoded. Meanwhile, we must preserve them by rotating them in the
    405 //    cache area. This doesn't hold for the very bottom row of the uncropped
    406 //    picture of course.
    407 //  * we must clip the remaining pixels against the cropping area. The VP8Io
    408 //    struct must have the following fields set correctly before calling put():
    409 
    410 #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
    411 
// Finalize and transmit a complete row. Return false in case of user-abort.
// Worker hook signature: arg1 is the VP8Decoder, arg2 the VP8Io. Depending
// on 'mt_method' this also performs the reconstruction itself, then filters,
// dithers, clips against the cropping area and hands the rows to io->put().
static int FinishRow(void* arg1, void* arg2) {
  VP8Decoder* const dec = (VP8Decoder*)arg1;
  VP8Io* const io = (VP8Io*)arg2;
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx;
  const int cache_id = ctx->id;
  // number of bottom rows held back until the next row is filtered
  const int extra_y_rows = kFilterExtraRows[dec->filter_type];
  const int ysize = extra_y_rows * dec->cache_y_stride;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride;
  const int y_offset = cache_id * 16 * dec->cache_y_stride;
  const int uv_offset = cache_id * 8 * dec->cache_uv_stride;
  // these point at the delayed rows preserved from the previous pass
  uint8_t* const ydst = dec->cache_y - ysize + y_offset;
  uint8_t* const udst = dec->cache_u - uvsize + uv_offset;
  uint8_t* const vdst = dec->cache_v - uvsize + uv_offset;
  const int mb_y = ctx->mb_y;
  const int is_first_row = (mb_y == 0);
  const int is_last_row = (mb_y >= dec->br_mb_y - 1);

  // with mt_method 2, reconstruction happens here, on the worker thread
  if (dec->mt_method == 2) {
    ReconstructRow(dec, ctx);
  }

  if (ctx->filter_row) {
    FilterRow(dec);
  }

  if (dec->dither) {
    DitherRow(dec);
  }

  if (io->put != NULL) {
    int y_start = MACROBLOCK_VPOS(mb_y);
    int y_end = MACROBLOCK_VPOS(mb_y + 1);
    if (!is_first_row) {
      // include the rows held back from the previous macroblock row
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
      io->v = vdst;
    } else {
      io->y = dec->cache_y + y_offset;
      io->u = dec->cache_u + uv_offset;
      io->v = dec->cache_v + uv_offset;
    }

    if (!is_last_row) {
      // hold back the bottom rows: they are not yet filtered
      y_end -= extra_y_rows;
    }
    if (y_end > io->crop_bottom) {
      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
    }
    // If dec->alpha_data is not NULL, we have some alpha plane present.
    io->a = NULL;
    if (dec->alpha_data != NULL && y_start < y_end) {
      io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                           "Could not decode alpha data.");
      }
    }
    // clip against the top of the cropping area
    if (y_start < io->crop_top) {
      const int delta_y = io->crop_top - y_start;
      y_start = io->crop_top;
      assert(!(delta_y & 1));   // must be even so chroma rows stay aligned
      io->y += dec->cache_y_stride * delta_y;
      io->u += dec->cache_uv_stride * (delta_y >> 1);
      io->v += dec->cache_uv_stride * (delta_y >> 1);
      if (io->a != NULL) {
        io->a += io->width * delta_y;
      }
    }
    if (y_start < y_end) {
      // clip against the left of the cropping area and emit the rows
      io->y += io->crop_left;
      io->u += io->crop_left >> 1;
      io->v += io->crop_left >> 1;
      if (io->a != NULL) {
        io->a += io->crop_left;
      }
      io->mb_y = y_start - io->crop_top;
      io->mb_w = io->crop_right - io->crop_left;
      io->mb_h = y_end - y_start;
      ok = io->put(io);
    }
  }
  // rotate top samples if needed
  if (cache_id + 1 == dec->num_caches) {
    if (!is_last_row) {
      // preserve the still-unfiltered bottom rows for the next pass
      memcpy(dec->cache_y - ysize, ydst + 16 * dec->cache_y_stride, ysize);
      memcpy(dec->cache_u - uvsize, udst + 8 * dec->cache_uv_stride, uvsize);
      memcpy(dec->cache_v - uvsize, vdst + 8 * dec->cache_uv_stride, uvsize);
    }
  }

  return ok;
}
    507 
    508 #undef MACROBLOCK_VPOS
    509 
    510 //------------------------------------------------------------------------------
    511 
// Process the current macroblock row: reconstruct, filter, dither and output
// it — either synchronously (mt_method == 0) or by handing the work to the
// worker thread. Returns 0 on user abort or worker failure.
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  VP8ThreadContext* const ctx = &dec->thread_ctx;
  // filtering only applies inside the precomputed [tl_mb_y, br_mb_y] window
  const int filter_row =
      (dec->filter_type > 0) &&
      (dec->mb_y >= dec->tl_mb_y) && (dec->mb_y <= dec->br_mb_y);
  if (dec->mt_method == 0) {
    // ctx->id and ctx->f_info are already set
    ctx->mb_y = dec->mb_y;
    ctx->filter_row = filter_row;
    ReconstructRow(dec, ctx);
    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker;
    // Finish previous job *before* updating context
    ok &= WebPGetWorkerInterface()->Sync(worker);
    assert(worker->status == OK);
    if (ok) {   // spawn a new deblocking/output job
      ctx->io = *io;
      ctx->id = dec->cache_id;
      ctx->mb_y = dec->mb_y;
      ctx->filter_row = filter_row;
      if (dec->mt_method == 2) {  // swap macroblock data
        // the worker reconstructs from ctx->mb_data while the main thread
        // keeps decoding into dec->mb_data
        VP8MBData* const tmp = ctx->mb_data;
        ctx->mb_data = dec->mb_data;
        dec->mb_data = tmp;
      } else {
        // perform reconstruction directly in main thread
        ReconstructRow(dec, ctx);
      }
      if (filter_row) {            // swap filter info
        VP8FInfo* const tmp = ctx->f_info;
        ctx->f_info = dec->f_info;
        dec->f_info = tmp;
      }
      // (reconstruct)+filter in parallel
      WebPGetWorkerInterface()->Launch(worker);
      // advance round-robin to the next cache line
      if (++dec->cache_id == dec->num_caches) {
        dec->cache_id = 0;
      }
    }
  }
  return ok;
}
    556 
    557 //------------------------------------------------------------------------------
    558 // Finish setting up the decoding parameter once user's setup() is called.
    559 
// Finish setting up the decoding parameters once the user's setup() hook has
// run: honor 'bypass_filtering', derive the window of macroblocks that needs
// in-loop filtering from the cropping rectangle, and precompute the
// per-segment filter strengths. Returns VP8_STATUS_OK or the error set when
// setup() fails.
VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Call setup() first. This may trigger additional decoding features on 'io'.
  // Note: Afterward, we must call teardown() no matter what.
  if (io->setup != NULL && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status;
  }

  // Disable filtering per user request
  if (io->bypass_filtering) {
    dec->filter_type = 0;
  }

  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
  // 'Simple' filter reads two luma samples outside of the macroblock
  // and filters one. It doesn't filter the chroma samples. Hence, we can
  // avoid doing the in-loop filtering before crop_top/crop_left position.
  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
  // Means: there's a dependency chain that goes all the way up to the
  // top-left corner of the picture (MB #0). We must filter all the previous
  // macroblocks.
  {
    const int extra_pixels = kFilterExtraRows[dec->filter_type];
    if (dec->filter_type == 2) {
      // For complex filter, we need to preserve the dependency chain.
      dec->tl_mb_x = 0;
      dec->tl_mb_y = 0;
    } else {
      // For simple filter, we can filter only the cropped region.
      // We include 'extra_pixels' on the other side of the boundary, since
      // vertical or horizontal filtering of the previous macroblock can
      // modify some abutting pixels.
      dec->tl_mb_x = (io->crop_left - extra_pixels) >> 4;
      dec->tl_mb_y = (io->crop_top - extra_pixels) >> 4;
      if (dec->tl_mb_x < 0) dec->tl_mb_x = 0;
      if (dec->tl_mb_y < 0) dec->tl_mb_y = 0;
    }
    // We need some 'extra' pixels on the right/bottom.
    dec->br_mb_y = (io->crop_bottom + 15 + extra_pixels) >> 4;
    dec->br_mb_x = (io->crop_right + 15 + extra_pixels) >> 4;
    if (dec->br_mb_x > dec->mb_w) {
      dec->br_mb_x = dec->mb_w;
    }
    if (dec->br_mb_y > dec->mb_h) {
      dec->br_mb_y = dec->mb_h;
    }
  }
  PrecomputeFilterStrengths(dec);
  return VP8_STATUS_OK;
}
    611 
    612 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
    613  int ok = 1;
    614  if (dec->mt_method > 0) {
    615    ok = WebPGetWorkerInterface()->Sync(&dec->worker);
    616  }
    617 
    618  if (io->teardown != NULL) {
    619    io->teardown(io);
    620  }
    621  return ok;
    622 }
    623 
    624 //------------------------------------------------------------------------------
    625 // For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
    626 //
    627 // Reason is: the deblocking filter cannot deblock the bottom horizontal edges
    628 // immediately, and needs to wait for first few rows of the next macroblock to
    629 // be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
    630 // on strength).
    631 // With two threads, the vertical positions of the rows being decoded are:
    632 // Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
    633 // Deblock:         [ 0..11][12..27][28..43][44..59][...
    634 // If we use two threads and two caches of 16 pixels, the sequence would be:
    635 // Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
    636 // Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
    637 // The problem occurs during row [12..15!!] that both the decoding and
    638 // deblocking threads are writing simultaneously.
    639 // With 3 cache lines, one get a safe write pattern:
    640 // Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
    641 // Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
    642 // Note that multi-threaded output _without_ deblocking can make use of two
    643 // cache lines of 16 pixels only, since there's no lagging behind. The decoding
    644 // and output process have non-concurrent writing:
    645 // Decode:  [ 0..15][16..31][ 0..15][16..31][...
    646 // io->put:         [ 0..15][16..31][ 0..15][...
    647 
    648 #define MT_CACHE_LINES 3
    649 #define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
    650 
    651 // Initialize multi/single-thread worker
    652 static int InitThreadContext(VP8Decoder* const dec) {
    653  dec->cache_id = 0;
    654  if (dec->mt_method > 0) {
    655    WebPWorker* const worker = &dec->worker;
    656    if (!WebPGetWorkerInterface()->Reset(worker)) {
    657      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
    658                         "thread initialization failed.");
    659    }
    660    worker->data1 = dec;
    661    worker->data2 = (void*)&dec->thread_ctx.io;
    662    worker->hook = FinishRow;
    663    dec->num_caches =
    664        (dec->filter_type > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
    665  } else {
    666    dec->num_caches = ST_CACHE_LINES;
    667  }
    668  return 1;
    669 }
    670 
// Select the multi-threading method: returns 0 (no threading) or 2 (the
// worker performs reconstruction + filtering). Returns 0 whenever threading
// is not requested, unavailable at build time, or the picture is too narrow
// to benefit.
int VP8GetThreadMethod(const WebPDecoderOptions* const options,
                       const WebPHeaderStructure* const headers,
                       int width, int height) {
  if (options == NULL || options->use_threads == 0) {
    return 0;
  }
  (void)headers;
  (void)width;
  (void)height;
  assert(headers == NULL || !headers->is_lossless);
#if defined(WEBP_USE_THREAD)
  if (width >= MIN_WIDTH_FOR_THREADS) return 2;
#endif
  return 0;
}
    686 
    687 #undef MT_CACHE_LINES
    688 #undef ST_CACHE_LINES
    689 
    690 //------------------------------------------------------------------------------
    691 // Memory setup
    692 
// Allocate (or reuse, if already large enough) the single memory block
// holding all the decoder's working buffers — intra modes, top samples,
// macroblock info, filter info, the yuv scratch buffer, macroblock data,
// the output row caches and the alpha plane — then carve it up and wire all
// the 'dec' pointers. Returns 0 on overflow or allocation failure.
static int AllocateMemory(VP8Decoder* const dec) {
  const int num_caches = dec->num_caches;
  const int mb_w = dec->mb_w;
  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
  const size_t f_info_size =
      (dec->filter_type > 0) ?
          mb_w * (dec->mt_method > 0 ? 2 : 1) * sizeof(VP8FInfo)
        : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b);
  const size_t mb_data_size =
      (dec->mt_method == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data);
  // 16 luma rows per cache line plus the filter-delay rows; the 3/2 factor
  // accounts for the two half-height chroma planes.
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type]) * 3 / 2;
  // NOTE(review): 'top_size' is reused here as the byte width of one cache
  // row — assumes sizeof(VP8TopSamples) == 32 == 16+8+8; confirm in vp8i_dec.h.
  const size_t cache_size = top_size * cache_height;
  // alpha_size is the only one that scales as width x height.
  const uint64_t alpha_size = (dec->alpha_data != NULL) ?
      (uint64_t)dec->pic_hdr.width * dec->pic_hdr.height : 0ULL;
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
                        + yuv_size + mb_data_size
                        + cache_size + alpha_size + WEBP_ALIGN_CST;
  uint8_t* mem;

  if (!CheckSizeOverflow(needed)) return 0;  // check for overflow
  if (needed > dec->mem_size) {
    WebPSafeFree(dec->mem);
    dec->mem_size = 0;
    dec->mem = WebPSafeMalloc(needed, sizeof(uint8_t));
    if (dec->mem == NULL) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                         "no memory during frame initialization.");
    }
    // down-cast is ok, thanks to WebPSafeMalloc() above.
    dec->mem_size = (size_t)needed;
  }

  // Carve the block into the individual buffers, in declaration order.
  mem = (uint8_t*)dec->mem;
  dec->intra_t = mem;
  mem += intra_pred_mode_size;

  dec->yuv_t = (VP8TopSamples*)mem;
  mem += top_size;

  // offset by one VP8MB so that index -1 is addressable (used for the
  // left-neighbor info, zeroed below)
  dec->mb_info = ((VP8MB*)mem) + 1;
  mem += mb_info_size;

  dec->f_info = f_info_size ? (VP8FInfo*)mem : NULL;
  mem += f_info_size;
  dec->thread_ctx.id = 0;
  dec->thread_ctx.f_info = dec->f_info;
  if (dec->filter_type > 0 && dec->mt_method > 0) {
    // secondary cache line. The deblocking process need to make use of the
    // filtering strength from previous macroblock row, while the new ones
    // are being decoded in parallel. We'll just swap the pointers.
    dec->thread_ctx.f_info += mb_w;
  }

  mem = (uint8_t*)WEBP_ALIGN(mem);
  assert((yuv_size & WEBP_ALIGN_CST) == 0);
  dec->yuv_b = mem;
  mem += yuv_size;

  dec->mb_data = (VP8MBData*)mem;
  dec->thread_ctx.mb_data = (VP8MBData*)mem;
  if (dec->mt_method == 2) {
    // second half of the buffer is the worker's private copy
    dec->thread_ctx.mb_data += mb_w;
  }
  mem += mb_data_size;

  dec->cache_y_stride = 16 * mb_w;
  dec->cache_uv_stride = 8 * mb_w;
  {
    const int extra_rows = kFilterExtraRows[dec->filter_type];
    const int extra_y = extra_rows * dec->cache_y_stride;
    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride;
    // the cache_* pointers point past the filter-delay rows
    dec->cache_y = mem + extra_y;
    dec->cache_u = dec->cache_y
                 + 16 * num_caches * dec->cache_y_stride + extra_uv;
    dec->cache_v = dec->cache_u
                 + 8 * num_caches * dec->cache_uv_stride + extra_uv;
    dec->cache_id = 0;
  }
  mem += cache_size;

  // alpha plane
  dec->alpha_plane = alpha_size ? mem : NULL;
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem + dec->mem_size);

  // note: left/top-info is initialized once for all.
  memset(dec->mb_info - 1, 0, mb_info_size);
  VP8InitScanline(dec);   // initialize left too.

  // initialize top
  memset(dec->intra_t, B_DC_PRED, intra_pred_mode_size);

  return 1;
}
    794 
    795 static void InitIo(VP8Decoder* const dec, VP8Io* io) {
    796  // prepare 'io'
    797  io->mb_y = 0;
    798  io->y = dec->cache_y;
    799  io->u = dec->cache_u;
    800  io->v = dec->cache_v;
    801  io->y_stride = dec->cache_y_stride;
    802  io->uv_stride = dec->cache_uv_stride;
    803  io->a = NULL;
    804 }
    805 
// Set up a frame for decoding. Order matters: InitThreadContext() sets
// 'dec->num_caches', which AllocateMemory() reads to size the row caches.
// Returns 1 on success, 0 on failure (error already set on 'dec').
int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) {
  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches.
  if (!AllocateMemory(dec)) return 0;
  InitIo(dec, io);
  VP8DspInit();  // Init critical function pointers and look-up tables.
  return 1;
}
    813 
    814 //------------------------------------------------------------------------------