tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp8.c (109758B)


      1 /*
      2 * VP7/VP8 compatible video decoder
      3 *
      4 * Copyright (C) 2010 David Conrad
      5 * Copyright (C) 2010 Ronald S. Bultje
      6 * Copyright (C) 2010 Fiona Glaser
      7 * Copyright (C) 2012 Daniel Kang
      8 * Copyright (C) 2014 Peter Ross
      9 *
     10 * This file is part of FFmpeg.
     11 *
     12 * FFmpeg is free software; you can redistribute it and/or
     13 * modify it under the terms of the GNU Lesser General Public
     14 * License as published by the Free Software Foundation; either
     15 * version 2.1 of the License, or (at your option) any later version.
     16 *
     17 * FFmpeg is distributed in the hope that it will be useful,
     18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20 * Lesser General Public License for more details.
     21 *
     22 * You should have received a copy of the GNU Lesser General Public
     23 * License along with FFmpeg; if not, write to the Free Software
     24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     25 */
     26 
     27 #include "config_components.h"
     28 
     29 #include "libavutil/mem.h"
     30 #include "libavutil/mem_internal.h"
     31 
     32 #include "avcodec.h"
     33 #include "codec_internal.h"
     34 #include "decode.h"
     35 #include "hwaccel_internal.h"
     36 #include "hwconfig.h"
     37 #include "mathops.h"
     38 #include "progressframe.h"
     39 #include "libavutil/refstruct.h"
     40 #include "thread.h"
     41 #include "vp8.h"
     42 #include "vp89_rac.h"
     43 #include "vp8data.h"
     44 #include "vpx_rac.h"
     45 
     46 #if ARCH_ARM
     47 #   include "arm/vp8.h"
     48 #endif
     49 
     50 // fixme: add 1 bit to all the calls to this?
     51 static int vp8_rac_get_sint(VPXRangeCoder *c, int bits)
     52 {
     53    int v;
     54 
     55    if (!vp89_rac_get(c))
     56        return 0;
     57 
     58    v = vp89_rac_get_uint(c, bits);
     59 
     60    if (vp89_rac_get(c))
     61        v = -v;
     62 
     63    return v;
     64 }
     65 
     66 static int vp8_rac_get_nn(VPXRangeCoder *c)
     67 {
     68    int v = vp89_rac_get_uint(c, 7) << 1;
     69    return v + !v;
     70 }
     71 
     72 // DCTextra
     73 static int vp8_rac_get_coeff(VPXRangeCoder *c, const uint8_t *prob)
     74 {
     75    int v = 0;
     76 
     77    do {
     78        v = (v<<1) + vpx_rac_get_prob(c, *prob++);
     79    } while (*prob);
     80 
     81    return v;
     82 }
     83 
     84 static void free_buffers(VP8Context *s)
     85 {
     86    int i;
     87    if (s->thread_data)
     88        for (i = 0; i < MAX_THREADS; i++) {
     89 #if HAVE_THREADS
     90            pthread_cond_destroy(&s->thread_data[i].cond);
     91            pthread_mutex_destroy(&s->thread_data[i].lock);
     92 #endif
     93            av_freep(&s->thread_data[i].filter_strength);
     94        }
     95    av_freep(&s->thread_data);
     96    av_freep(&s->macroblocks_base);
     97    av_freep(&s->intra4x4_pred_mode_top);
     98    av_freep(&s->top_nnz);
     99    av_freep(&s->top_border);
    100 
    101    s->macroblocks = NULL;
    102 }
    103 
    104 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
    105 {
    106    int ret = ff_progress_frame_get_buffer(s->avctx, &f->tf,
    107                                           ref ? AV_GET_BUFFER_FLAG_REF : 0);
    108    if (ret < 0)
    109        return ret;
    110    f->seg_map = av_refstruct_allocz(s->mb_width * s->mb_height);
    111    if (!f->seg_map) {
    112        ret = AVERROR(ENOMEM);
    113        goto fail;
    114    }
    115    ret = ff_hwaccel_frame_priv_alloc(s->avctx, &f->hwaccel_picture_private);
    116    if (ret < 0)
    117        goto fail;
    118 
    119    return 0;
    120 
    121 fail:
    122    av_refstruct_unref(&f->seg_map);
    123    ff_progress_frame_unref(&f->tf);
    124    return ret;
    125 }
    126 
/* Release all references held by a VP8Frame: the segmentation map, the
 * hwaccel private data and the underlying progress frame buffer. */
static void vp8_release_frame(VP8Frame *f)
{
    av_refstruct_unref(&f->seg_map);
    av_refstruct_unref(&f->hwaccel_picture_private);
    ff_progress_frame_unref(&f->tf);
}
    133 
/* Release every reference frame and clear the frame-pointer table.
 * When free_mem is nonzero, additionally free all per-context buffers
 * (used for full teardown and on dimension changes). */
static av_cold void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        vp8_release_frame(&s->frames[i]);
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem)
        free_buffers(s);

    /* Let the hwaccel flush its own state, if it provides a callback. */
    if (FF_HW_HAS_CB(avctx, flush))
        FF_HW_SIMPLE_CALL(avctx, flush);
}
    149 
/* Codec flush callback: drop all reference frames but keep allocations. */
static av_cold void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
    154 
    155 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
    156 {
    157    VP8Frame *frame = NULL;
    158    int i;
    159 
    160    // find a free buffer
    161    for (i = 0; i < 5; i++)
    162        if (&s->frames[i] != s->framep[VP8_FRAME_CURRENT]  &&
    163            &s->frames[i] != s->framep[VP8_FRAME_PREVIOUS] &&
    164            &s->frames[i] != s->framep[VP8_FRAME_GOLDEN]   &&
    165            &s->frames[i] != s->framep[VP8_FRAME_ALTREF]) {
    166            frame = &s->frames[i];
    167            break;
    168        }
    169    if (i == 5) {
    170        av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
    171        abort();
    172    }
    173    if (frame->tf.f)
    174        vp8_release_frame(frame);
    175 
    176    return frame;
    177 }
    178 
    179 static enum AVPixelFormat get_pixel_format(VP8Context *s)
    180 {
    181    enum AVPixelFormat pix_fmts[] = {
    182 #if CONFIG_VP8_VAAPI_HWACCEL
    183        AV_PIX_FMT_VAAPI,
    184 #endif
    185 #if CONFIG_VP8_NVDEC_HWACCEL
    186        AV_PIX_FMT_CUDA,
    187 #endif
    188        AV_PIX_FMT_YUV420P,
    189        AV_PIX_FMT_NONE,
    190    };
    191 
    192    return ff_get_format(s->avctx, pix_fmts);
    193 }
    194 
/* (Re)configure the decoder for a new picture size and thread layout.
 * Frees and reallocates the macroblock array, top-row caches and
 * per-thread data, and renegotiates the pixel format when needed.
 * Returns 0 on success or a negative AVERROR code. */
static av_always_inline
int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
{
    AVCodecContext *avctx = s->avctx;
    int i, ret, dim_reset = 0;

    /* Tear down and resize when the coded size changed, or when the
     * macroblock grid no longer matches an already-allocated layout. */
    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
        height != s->avctx->height) {
        vp8_decode_flush_impl(s->avctx, 1);

        ret = ff_set_dimensions(s->avctx, width, height);
        if (ret < 0)
            return ret;

        /* Remember whether buffers existed before: a reset of existing
         * dimensions requires re-running pixel format negotiation. */
        dim_reset = (s->macroblocks_base != NULL);
    }

    /* Pixel format negotiation is skipped for WebP (caller sets it) and
     * VP7 (software only). */
    if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) &&
         !s->actually_webp && !is_vp7) {
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0)
            return AVERROR(EINVAL);
        avctx->pix_fmt = s->pix_fmt;
    }

    s->mb_width  = (s->avctx->coded_width  + 15) / 16;
    s->mb_height = (s->avctx->coded_height + 15) / 16;

    /* VP7 and slice threading use the per-row macroblock layout. */
    s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
                   avctx->thread_count > 1;
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                               sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
    } else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
                                         sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));

    /* intra4x4_pred_mode_top is only allocated (and required) in the
     * non-sliced layout. */
    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
        free_buffers(s);
        return AVERROR(ENOMEM);
    }

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength =
            av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
        if (!s->thread_data[i].filter_strength) {
            free_buffers(s);
            return AVERROR(ENOMEM);
        }
#if HAVE_THREADS
        ret = pthread_mutex_init(&s->thread_data[i].lock, NULL);
        if (ret) {
            free_buffers(s);
            return AVERROR(ret);
        }
        ret = pthread_cond_init(&s->thread_data[i].cond, NULL);
        if (ret) {
            free_buffers(s);
            return AVERROR(ret);
        }
#endif
    }

    /* Offset by one so index -1 (left neighbour of column 0) is valid. */
    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}
    267 
/* VP7 entry point for (re)allocating per-frame state; see update_dimensions(). */
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}
    272 
/* VP8 entry point for (re)allocating per-frame state; see update_dimensions(). */
static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
    277 
    278 
    279 static void parse_segment_info(VP8Context *s)
    280 {
    281    VPXRangeCoder *c = &s->c;
    282    int i;
    283 
    284    s->segmentation.update_map = vp89_rac_get(c);
    285    s->segmentation.update_feature_data = vp89_rac_get(c);
    286 
    287    if (s->segmentation.update_feature_data) {
    288        s->segmentation.absolute_vals = vp89_rac_get(c);
    289 
    290        for (i = 0; i < 4; i++)
    291            s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
    292 
    293        for (i = 0; i < 4; i++)
    294            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    295    }
    296    if (s->segmentation.update_map)
    297        for (i = 0; i < 3; i++)
    298            s->prob->segmentid[i] = vp89_rac_get(c) ? vp89_rac_get_uint(c, 8) : 255;
    299 }
    300 
    301 static void update_lf_deltas(VP8Context *s)
    302 {
    303    VPXRangeCoder *c = &s->c;
    304    int i;
    305 
    306    for (i = 0; i < 4; i++) {
    307        if (vp89_rac_get(c)) {
    308            s->lf_delta.ref[i] = vp89_rac_get_uint(c, 6);
    309 
    310            if (vp89_rac_get(c))
    311                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
    312        }
    313    }
    314 
    315    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
    316        if (vp89_rac_get(c)) {
    317            s->lf_delta.mode[i] = vp89_rac_get_uint(c, 6);
    318 
    319            if (vp89_rac_get(c))
    320                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
    321        }
    322    }
    323 }
    324 
/* Parse the coefficient-partition layout and initialize one range decoder
 * per partition. |buf| starts at the 3-byte little-endian size fields that
 * precede the partition payloads; the last partition's size is implicit
 * (whatever remains). Returns 0 on success, negative on malformed sizes. */
static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i;
    int ret;

    s->num_coeff_partitions = 1 << vp89_rac_get_uint(&s->c, 2);

    /* Skip past the size table: one 3-byte entry per partition except the last. */
    buf      += 3 * (s->num_coeff_partitions - 1);
    buf_size -= 3 * (s->num_coeff_partitions - 1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions - 1; i++) {
        int size = AV_RL24(sizes + 3 * i);
        if (buf_size - size < 0)
            return -1;
        s->coeff_partition_size[i] = size;

        ret = ff_vpx_init_range_decoder(&s->coeff_partition[i], buf, size);
        if (ret < 0)
            return ret;
        buf      += size;
        buf_size -= size;
    }

    /* The final partition consumes all remaining input. */
    s->coeff_partition_size[i] = buf_size;

    return ff_vpx_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
}
    355 
    356 static void vp7_get_quants(VP8Context *s)
    357 {
    358    VPXRangeCoder *c = &s->c;
    359 
    360    int yac_qi  = vp89_rac_get_uint(c, 7);
    361    int ydc_qi  = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
    362    int y2dc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
    363    int y2ac_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
    364    int uvdc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
    365    int uvac_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
    366 
    367    s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
    368    s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
    369    s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
    370    s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
    371    s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
    372    s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
    373 }
    374 
/* Read the VP8 base quantizer index and the five delta values, then build
 * the dequantization tables for all four segments. When segmentation is
 * off, all segments share the frame-level yac_qi. */
static void vp8_get_quants(VP8Context *s)
{
    VPXRangeCoder *c = &s->c;
    int i, base_qi;

    s->quant.yac_qi     = vp89_rac_get_uint(c, 7);
    s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
    s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
    s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
    s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
    s->quant.uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            /* Segment values are deltas unless absolute_vals is set. */
            if (!s->segmentation.absolute_vals)
                base_qi += s->quant.yac_qi;
        } else
            base_qi = s->quant.yac_qi;

        /* Indices into the lookup tables are clamped to [0, 127]. */
        s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
        s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
        s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
        /* 101581>>16 is equivalent to 155/100 */
        s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
        s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];

        /* Spec-mandated floors/caps on the second-order DC and chroma DC. */
        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}
    407 
    408 /**
    409 * Determine which buffers golden and altref should be updated with after this frame.
    410 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
    411 *
    412 * Intra frames update all 3 references
    413 * Inter frames update VP8_FRAME_PREVIOUS if the update_last flag is set
    414 * If the update (golden|altref) flag is set, it's updated with the current frame
    415 *      if update_last is set, and VP8_FRAME_PREVIOUS otherwise.
    416 * If the flag is not set, the number read means:
    417 *      0: no update
    418 *      1: VP8_FRAME_PREVIOUS
    419 *      2: update golden with altref, or update altref with golden
    420 */
    421 static VP8FrameType ref_to_update(VP8Context *s, int update, VP8FrameType ref)
    422 {
    423    VPXRangeCoder *c = &s->c;
    424 
    425    if (update)
    426        return VP8_FRAME_CURRENT;
    427 
    428    switch (vp89_rac_get_uint(c, 2)) {
    429    case 1:
    430        return VP8_FRAME_PREVIOUS;
    431    case 2:
    432        return (ref == VP8_FRAME_GOLDEN) ? VP8_FRAME_ALTREF : VP8_FRAME_GOLDEN;
    433    }
    434    return VP8_FRAME_NONE;
    435 }
    436 
    437 static void vp78_reset_probability_tables(VP8Context *s)
    438 {
    439    int i, j;
    440    for (i = 0; i < 4; i++)
    441        for (j = 0; j < 16; j++)
    442            memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
    443                   sizeof(s->prob->token[i][j]));
    444 }
    445 
    446 static void vp78_update_probability_tables(VP8Context *s)
    447 {
    448    VPXRangeCoder *c = &s->c;
    449    int i, j, k, l, m;
    450 
    451    for (i = 0; i < 4; i++)
    452        for (j = 0; j < 8; j++)
    453            for (k = 0; k < 3; k++)
    454                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
    455                    if (vpx_rac_get_prob_branchy(c, ff_vp8_token_update_probs[i][j][k][l])) {
    456                        int prob = vp89_rac_get_uint(c, 8);
    457                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
    458                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
    459                    }
    460 }
    461 
    462 #define VP7_MVC_SIZE 17
    463 #define VP8_MVC_SIZE 19
    464 
    465 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
    466                                                            int mvc_size)
    467 {
    468    VPXRangeCoder *c = &s->c;
    469    int i, j;
    470 
    471    if (vp89_rac_get(c))
    472        for (i = 0; i < 4; i++)
    473            s->prob->pred16x16[i] = vp89_rac_get_uint(c, 8);
    474    if (vp89_rac_get(c))
    475        for (i = 0; i < 3; i++)
    476            s->prob->pred8x8c[i]  = vp89_rac_get_uint(c, 8);
    477 
    478    // 17.2 MV probability update
    479    for (i = 0; i < 2; i++)
    480        for (j = 0; j < mvc_size; j++)
    481            if (vpx_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
    482                s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    483 }
    484 
    485 static void update_refs(VP8Context *s)
    486 {
    487    VPXRangeCoder *c = &s->c;
    488 
    489    int update_golden = vp89_rac_get(c);
    490    int update_altref = vp89_rac_get(c);
    491 
    492    s->update_golden = ref_to_update(s, update_golden, VP8_FRAME_GOLDEN);
    493    s->update_altref = ref_to_update(s, update_altref, VP8_FRAME_ALTREF);
    494 }
    495 
    496 static void copy_chroma(AVFrame *dst, const AVFrame *src, int width, int height)
    497 {
    498    int i, j;
    499 
    500    for (j = 1; j < 3; j++) {
    501        for (i = 0; i < height / 2; i++)
    502            memcpy(dst->data[j] + i * dst->linesize[j],
    503                   src->data[j] + i * src->linesize[j], width / 2);
    504    }
    505 }
    506 
    507 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
    508                 const uint8_t *src, ptrdiff_t src_linesize,
    509                 int width, int height,
    510                 int alpha, int beta)
    511 {
    512    int i, j;
    513    for (j = 0; j < height; j++) {
    514        const uint8_t *src2 = src + j * src_linesize;
    515        uint8_t *dst2 = dst + j * dst_linesize;
    516        for (i = 0; i < width; i++) {
    517            uint8_t y = src2[i];
    518            dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
    519        }
    520    }
    521 }
    522 
/* Apply the VP7 inter-frame fade (alpha/beta) to the previous frame's
 * luma plane, in place when possible. If the golden frame aliases the
 * previous frame, a fresh previous-frame buffer is allocated first so the
 * golden frame is preserved, and chroma is copied over unchanged.
 * Returns 0 on success or a negative AVERROR code. */
static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
{
    int ret;

    /* Keyframes and zero fade parameters need no processing. */
    if (!s->keyframe && (alpha || beta)) {
        int width  = s->mb_width * 16;
        int height = s->mb_height * 16;
        const AVFrame *src;
        AVFrame *dst;

        if (!s->framep[VP8_FRAME_PREVIOUS] ||
            !s->framep[VP8_FRAME_GOLDEN]) {
            av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
            return AVERROR_INVALIDDATA;
        }

        /* By default fade in place: src and dst are the same frame. */
        src =
        dst = s->framep[VP8_FRAME_PREVIOUS]->tf.f;

        /* preserve the golden frame, write a new previous frame */
        if (s->framep[VP8_FRAME_GOLDEN] == s->framep[VP8_FRAME_PREVIOUS]) {
            s->framep[VP8_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
            if ((ret = vp8_alloc_frame(s, s->framep[VP8_FRAME_PREVIOUS], 1)) < 0)
                return ret;

            dst = s->framep[VP8_FRAME_PREVIOUS]->tf.f;

            /* Only luma is faded; chroma is copied verbatim. */
            copy_chroma(dst, src, width, height);
        }

        fade(dst->data[0], dst->linesize[0],
             src->data[0], src->linesize[0],
             width, height, alpha, beta);
    }

    return 0;
}
    560 
/* Parse the VP7 frame header (sections A-J, following the section letters
 * in the comments below), initialize the range decoders, and update
 * dimensions, quantizers and probability tables. All range-coder reads
 * are strictly ordered by the bitstream layout.
 * Returns 0 on success or a negative AVERROR code. */
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VPXRangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;
    int alpha = 0;
    int beta  = 0;
    int fade_present = 1;

    if (buf_size < 4) {
        return AVERROR_INVALIDDATA;
    }

    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->invisible = 0;
    part1_size   = AV_RL24(buf) >> 4;

    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    /* The fixed header is 4 bytes for profile 0, 3 bytes for profile 1. */
    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    /* VP7 always uses the 6-tap edge-extending put functions. */
    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    ret = ff_vpx_init_range_decoder(c, buf, part1_size);
    if (ret < 0)
        return ret;
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp89_rac_get_uint(c, 12);
        height = vp89_rac_get_uint(c, 12);
        hscale = vp89_rac_get_uint(c, 2);
        vscale = vp89_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* Keyframes refresh all references and reset all probabilities. */
        s->update_golden = s->update_altref = VP8_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
    }

    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp89_rac_get(c);
        if (s->feature_enabled[i]) {
             s->feature_present_prob[i] = vp89_rac_get_uint(c, 8);

             for (j = 0; j < 3; j++)
                 s->feature_index_prob[i][j] =
                     vp89_rac_get(c) ? vp89_rac_get_uint(c, 8) : 255;

             if (vp7_feature_value_size[s->profile][i])
                 for (j = 0; j < 4; j++)
                     s->feature_value[i][j] =
                        vp89_rac_get(c) ? vp89_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    /* VP7 has no segmentation or loop-filter deltas in the bitstream. */
    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    /* VP7 always uses a single coefficient partition. */
    s->num_coeff_partitions = 1;
    ret = ff_vpx_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
    if (ret < 0)
        return ret;

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp89_rac_get(c) ? VP8_FRAME_CURRENT : VP8_FRAME_NONE;
        s->sign_bias[VP8_FRAME_GOLDEN] = 0;
    }

    s->update_last          = 1;
    s->update_probabilities = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp89_rac_get(c);
        /* When not persisting, work on a copy so prob[0] stays intact. */
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            fade_present = vp89_rac_get(c);
    }

    if (vpx_rac_is_end(c))
        return AVERROR_INVALIDDATA;
    /* E. Fading information for previous frame */
    if (fade_present && vp89_rac_get(c)) {
        alpha = (int8_t) vp89_rac_get_uint(c, 8);
        beta  = (int8_t) vp89_rac_get_uint(c, 8);
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp89_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp89_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = ff_zigzag_scan[vp89_rac_get_uint(c, 4)];

    /* H. Loop filter levels  */
    if (s->profile > 0)
        s->filter.simple = vp89_rac_get(c);
    s->filter.level     = vp89_rac_get_uint(c, 6);
    s->filter.sharpness = vp89_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra  = vp89_rac_get_uint(c, 8);
        s->prob->last   = vp89_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    if (vpx_rac_is_end(c))
        return AVERROR_INVALIDDATA;

    if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
        return ret;

    return 0;
}
    724 
    725 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
    726 {
    727    VPXRangeCoder *c = &s->c;
    728    int header_size, hscale, vscale, ret;
    729    int width  = s->avctx->width;
    730    int height = s->avctx->height;
    731 
    732    if (buf_size < 3) {
    733        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
    734        return AVERROR_INVALIDDATA;
    735    }
    736 
    737    s->keyframe  = !(buf[0] & 1);
    738    s->profile   =  (buf[0]>>1) & 7;
    739    s->invisible = !(buf[0] & 0x10);
    740    header_size  = AV_RL24(buf) >> 5;
    741    buf      += 3;
    742    buf_size -= 3;
    743 
    744    s->header_partition_size = header_size;
    745 
    746    if (s->profile > 3)
    747        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
    748 
    749    if (!s->profile)
    750        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
    751               sizeof(s->put_pixels_tab));
    752    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
    753        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
    754               sizeof(s->put_pixels_tab));
    755 
    756    if (header_size > buf_size - 7 * s->keyframe) {
    757        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
    758        return AVERROR_INVALIDDATA;
    759    }
    760 
    761    if (s->keyframe) {
    762        if (AV_RL24(buf) != 0x2a019d) {
    763            av_log(s->avctx, AV_LOG_ERROR,
    764                   "Invalid start code 0x%x\n", AV_RL24(buf));
    765            return AVERROR_INVALIDDATA;
    766        }
    767        width     = AV_RL16(buf + 3) & 0x3fff;
    768        height    = AV_RL16(buf + 5) & 0x3fff;
    769        hscale    = buf[4] >> 6;
    770        vscale    = buf[6] >> 6;
    771        buf      += 7;
    772        buf_size -= 7;
    773 
    774        if (hscale || vscale)
    775            avpriv_request_sample(s->avctx, "Upscaling");
    776 
    777        s->update_golden = s->update_altref = VP8_FRAME_CURRENT;
    778        vp78_reset_probability_tables(s);
    779        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
    780               sizeof(s->prob->pred16x16));
    781        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
    782               sizeof(s->prob->pred8x8c));
    783        memcpy(s->prob->mvc, vp8_mv_default_prob,
    784               sizeof(s->prob->mvc));
    785        memset(&s->segmentation, 0, sizeof(s->segmentation));
    786        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    787    }
    788 
    789    ret = ff_vpx_init_range_decoder(c, buf, header_size);
    790    if (ret < 0)
    791        return ret;
    792    buf      += header_size;
    793    buf_size -= header_size;
    794 
    795    if (s->keyframe) {
    796        s->colorspace = vp89_rac_get(c);
    797        if (s->colorspace)
    798            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
    799        s->fullrange = vp89_rac_get(c);
    800    }
    801 
    802    if ((s->segmentation.enabled = vp89_rac_get(c)))
    803        parse_segment_info(s);
    804    else
    805        s->segmentation.update_map = 0; // FIXME: move this to some init function?
    806 
    807    s->filter.simple    = vp89_rac_get(c);
    808    s->filter.level     = vp89_rac_get_uint(c, 6);
    809    s->filter.sharpness = vp89_rac_get_uint(c, 3);
    810 
    811    if ((s->lf_delta.enabled = vp89_rac_get(c))) {
    812        s->lf_delta.update = vp89_rac_get(c);
    813        if (s->lf_delta.update)
    814            update_lf_deltas(s);
    815    }
    816 
    817    if (setup_partitions(s, buf, buf_size)) {
    818        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
    819        return AVERROR_INVALIDDATA;
    820    }
    821 
    822    if (!s->macroblocks_base || /* first frame */
    823        width != s->avctx->width || height != s->avctx->height ||
    824        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
    825        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
    826            return ret;
    827 
    828    vp8_get_quants(s);
    829 
    830    if (!s->keyframe) {
    831        update_refs(s);
    832        s->sign_bias[VP8_FRAME_GOLDEN] = vp89_rac_get(c);
    833        s->sign_bias[VP8_FRAME_ALTREF] = vp89_rac_get(c);
    834    }
    835 
    836    // if we aren't saving this frame's probabilities for future frames,
    837    // make a copy of the current probabilities
    838    if (!(s->update_probabilities = vp89_rac_get(c)))
    839        s->prob[1] = s->prob[0];
    840 
    841    s->update_last = s->keyframe || vp89_rac_get(c);
    842 
    843    vp78_update_probability_tables(s);
    844 
    845    if ((s->mbskip_enabled = vp89_rac_get(c)))
    846        s->prob->mbskip = vp89_rac_get_uint(c, 8);
    847 
    848    if (!s->keyframe) {
    849        s->prob->intra  = vp89_rac_get_uint(c, 8);
    850        s->prob->last   = vp89_rac_get_uint(c, 8);
    851        s->prob->golden = vp89_rac_get_uint(c, 8);
    852        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
    853    }
    854 
    855    // Record the entropy coder state here so that hwaccels can use it.
    856    s->c.code_word = vpx_rac_renorm(&s->c);
    857    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
    858    s->coder_state_at_header_end.range     = s->c.high;
    859    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
    860    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
    861 
    862    return 0;
    863 }
    864 
    865 static av_always_inline
    866 void clamp_mv(const VP8mvbounds *s, VP8mv *dst, const VP8mv *src)
    867 {
    868    dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
    869                             av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
    870    dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
    871                             av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
    872 }
    873 
    874 /**
    875 * Motion vector coding, 17.1.
    876 */
/* Read one motion-vector component (x or y): either a "long" magnitude
 * built bit-by-bit from per-bit probabilities, or a short magnitude (< 8)
 * coded through the small MV tree, followed by an optional sign bit. */
static av_always_inline int read_mv_component(VPXRangeCoder *c, const uint8_t *p, int vp7)
{
    int bit, x = 0;

    if (vpx_rac_get_prob_branchy(c, p[0])) {
        int i;

        /* Long vector: low bits 0-2 first, then the high bits in
         * descending order (7 bits total for VP7, 9 for VP8). */
        for (i = 0; i < 3; i++)
            x += vpx_rac_get_prob(c, p[9 + i]) << i;
        for (i = (vp7 ? 7 : 9); i > 3; i--)
            x += vpx_rac_get_prob(c, p[9 + i]) << i;
        /* Bit 3: forced to 1 when no higher bit is set (magnitudes < 8
         * use the short tree instead), otherwise read explicitly. */
        if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vpx_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p + 2;
        bit = vpx_rac_get_prob(c, *ps);
        ps += 1 + 3 * bit;
        x  += 4 * bit;
        bit = vpx_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2 * bit;
        x  += vpx_rac_get_prob(c, *ps);
    }

    /* Sign bit is only coded for non-zero magnitudes. */
    return (x && vpx_rac_get_prob(c, p[1])) ? -x : x;
}
    904 
/* VP7 entry point: read an MV component with the VP7 bit layout. */
static int vp7_read_mv_component(VPXRangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 1);
}
    909 
/* VP8 entry point: read an MV component with the VP8 bit layout. */
static int vp8_read_mv_component(VPXRangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 0);
}
    914 
    915 static av_always_inline
    916 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
    917 {
    918    if (is_vp7)
    919        return vp7_submv_prob;
    920 
    921    if (left == top)
    922        return vp8_submv_prob[4 - !!left];
    923    if (!top)
    924        return vp8_submv_prob[2];
    925    return vp8_submv_prob[1 - !!left];
    926 }
    927 
    928 /**
    929 * Split motion vector prediction, 16.4.
    930 * @returns the number of motion vectors parsed (2, 4 or 16)
    931 */
    932 static av_always_inline
    933 int decode_splitmvs(const VP8Context *s, VPXRangeCoder *c, VP8Macroblock *mb,
    934                    int layout, int is_vp7)
    935 {
    936    int part_idx;
    937    int n, num;
    938    const VP8Macroblock *top_mb;
    939    const VP8Macroblock *left_mb = &mb[-1];
    940    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
    941    const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
    942    const VP8mv *top_mv;
    943    const VP8mv *left_mv = left_mb->bmv;
    944    const VP8mv *cur_mv  = mb->bmv;
    945 
    946    if (!layout) // layout is inlined, s->mb_layout is not
    947        top_mb = &mb[2];
    948    else
    949        top_mb = &mb[-s->mb_width - 1];
    950    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    951    top_mv       = top_mb->bmv;
    952 
    953    if (vpx_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
    954        if (vpx_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
    955            part_idx = VP8_SPLITMVMODE_16x8 + vpx_rac_get_prob(c, vp8_mbsplit_prob[2]);
    956        else
    957            part_idx = VP8_SPLITMVMODE_8x8;
    958    } else {
    959        part_idx = VP8_SPLITMVMODE_4x4;
    960    }
    961 
    962    num              = vp8_mbsplit_count[part_idx];
    963    mbsplits_cur     = vp8_mbsplits[part_idx],
    964    firstidx         = vp8_mbfirstidx[part_idx];
    965    mb->partitioning = part_idx;
    966 
    967    for (n = 0; n < num; n++) {
    968        int k = firstidx[n];
    969        uint32_t left, above;
    970        const uint8_t *submv_prob;
    971 
    972        if (!(k & 3))
    973            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
    974        else
    975            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
    976        if (k <= 3)
    977            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
    978        else
    979            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
    980 
    981        submv_prob = get_submv_prob(left, above, is_vp7);
    982 
    983        if (vpx_rac_get_prob_branchy(c, submv_prob[0])) {
    984            if (vpx_rac_get_prob_branchy(c, submv_prob[1])) {
    985                if (vpx_rac_get_prob_branchy(c, submv_prob[2])) {
    986                    mb->bmv[n].y = mb->mv.y +
    987                                   read_mv_component(c, s->prob->mvc[0], is_vp7);
    988                    mb->bmv[n].x = mb->mv.x +
    989                                   read_mv_component(c, s->prob->mvc[1], is_vp7);
    990                } else {
    991                    AV_ZERO32(&mb->bmv[n]);
    992                }
    993            } else {
    994                AV_WN32A(&mb->bmv[n], above);
    995            }
    996        } else {
    997            AV_WN32A(&mb->bmv[n], left);
    998        }
    999    }
   1000 
   1001    return num;
   1002 }
   1003 
   1004 /**
   1005 * The vp7 reference decoder uses a padding macroblock column (added to right
   1006 * edge of the frame) to guard against illegal macroblock offsets. The
   1007 * algorithm has bugs that permit offsets to straddle the padding column.
   1008 * This function replicates those bugs.
   1009 *
   1010 * @param[out] edge_x macroblock x address
   1011 * @param[out] edge_y macroblock y address
   1012 *
   1013 * @return macroblock offset legal (boolean)
   1014 */
   1015 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
   1016                                   int xoffset, int yoffset, int boundary,
   1017                                   int *edge_x, int *edge_y)
   1018 {
   1019    int vwidth = mb_width + 1;
   1020    int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
   1021    if (new < boundary || new % vwidth == vwidth - 1)
   1022        return 0;
   1023    *edge_y = new / vwidth;
   1024    *edge_x = new % vwidth;
   1025    return 1;
   1026 }
   1027 
   1028 static const VP8mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
   1029 {
   1030    return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
   1031 }
   1032 
/* VP7 motion vector decoding: scan a fixed list of neighbouring
 * macroblocks, score zero/nearest/near candidates, then decode the MV
 * mode and vector against the resulting probability contexts. */
static av_always_inline
void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    VP8mv near_mv[3];
    uint8_t cnt[3] = { 0 };
    VPXRangeCoder *c = &s->c;
    int i;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Accumulate scores for the zero/nearest/near candidates over the
     * fixed VP7 predictor list (replicating reference-decoder offset
     * handling, bugs included — see vp7_calculate_mb_offset). */
    for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
        const VP7MVPred * pred = &vp7_mv_pred[i];
        int edge_x, edge_y;

        if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
                                    pred->yoffset, !s->profile, &edge_x, &edge_y)) {
            const VP8Macroblock *edge = (s->mb_layout == 1)
                                      ? s->macroblocks_base + 1 + edge_x +
                                        (s->mb_width + 1) * (edge_y + 1)
                                      : s->macroblocks + edge_x +
                                        (s->mb_height - edge_y - 1) * 2;
            uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
            if (mv) {
                /* First distinct non-zero MV becomes NEAREST, the second
                 * becomes NEAR; later duplicates add to their score. */
                if (AV_RN32A(&near_mv[CNT_NEAREST])) {
                    if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
                        idx = CNT_NEAREST;
                    } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
                        if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
                            continue;
                        idx = CNT_NEAR;
                    } else {
                        AV_WN32A(&near_mv[CNT_NEAR], mv);
                        idx = CNT_NEAR;
                    }
                } else {
                    AV_WN32A(&near_mv[CNT_NEAREST], mv);
                    idx = CNT_NEAREST;
                }
            } else {
                idx = CNT_ZERO;
            }
        } else {
            idx = CNT_ZERO;
        }
        cnt[idx] += vp7_mv_pred[i].score;
    }

    mb->partitioning = VP8_SPLITMVMODE_NONE;

    /* MV mode tree: ZERO / NEAREST / NEAR / NEW-or-SPLIT, with each
     * branch probability selected by the matching candidate's score. */
    if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {

            if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Base MV for NEW/SPLIT: the best-scoring of 0,0 and the
                 * stronger of the nearest/near candidates. */
                if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
                else
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));

                if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                } else {
                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                mb->mv = near_mv[CNT_NEAR];
                mb->bmv[0] = mb->mv;
            }
        } else {
            mb->mv = near_mv[CNT_NEAREST];
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
   1122 
/* VP8 motion vector decoding, 16.3: survey the top, left and top-left
 * neighbours to build zero/nearest/near candidates and their counts,
 * then decode this macroblock's MV mode and vector. */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, const VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    const int8_t *sign_bias = s->sign_bias;
    VP8mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VPXRangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        const VP8Macroblock *edge = mb_edge[n];                               \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP8_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP(VP8mv,   near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* Reuse cnt[CNT_SPLITMV] as the split-MV probability
                 * context: weighted count of neighbours that are split. */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
   1222 
/* Decode the 16 per-subblock intra prediction modes of an I4x4
 * macroblock. On keyframes each mode is coded with a left/above
 * context; on inter frames a single fixed probability table is used. */
static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VPXRangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout) {
        /* Top-row context lives in the macroblock one row above. */
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t *top;
        uint8_t *const left = s->intra4x4_pred_mode_left;
        if (layout)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                /* Context = modes of the subblocks above and to the left. */
                ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp89_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y]   = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp89_rac_get_tree(c, vp8_pred4x4_tree,
                                            vp8_pred4x4_prob_inter);
    }
}
   1257 
/* Decode per-macroblock mode information, 16.1-16.3: segment id, skip
 * flag, then either intra prediction modes or the reference frame and
 * motion vectors for inter blocks. */
static av_always_inline
void decode_mb_mode(VP8Context *s, const VP8mvbounds *mv_bounds,
                    VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, const uint8_t *ref, int layout, int is_vp7)
{
    VPXRangeCoder *c = &s->c;
    static const char * const vp7_feature_name[] = { "q-index",
                                                     "lf-delta",
                                                     "partial-golden-update",
                                                     "blit-pitch" };
    if (is_vp7) {
        int i;
        *segment = 0;
        /* VP7 per-MB features are parsed here only to keep the bitstream
         * in sync and to warn; their values are not applied. */
        for (i = 0; i < 4; i++) {
            if (s->feature_enabled[i]) {
                if (vpx_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                      int index = vp89_rac_get_tree(c, vp7_feature_index_tree,
                                                    s->feature_index_prob[i]);
                      av_log(s->avctx, AV_LOG_WARNING,
                             "Feature %s present in macroblock (value 0x%x)\n",
                             vp7_feature_name[i], s->feature_value[i][index]);
                }
           }
        }
    } else if (s->segmentation.update_map) {
        int bit  = vpx_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vpx_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vpx_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp89_rac_get_tree(c, vp8_pred16x16_tree_intra,
                                     vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            /* Whole-MB intra mode: replicate the equivalent 4x4 mode into
             * the top/left contexts used by neighbouring I4x4 blocks. */
            const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                           : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
            if (s->mb_layout)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp89_rac_get_tree(c, vp8_pred8x8c_tree,
                                                 vp8_pred8x8c_prob_intra);
        mb->ref_frame        = VP8_FRAME_CURRENT;
    } else if (vpx_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vpx_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame =
                (!is_vp7 && vpx_rac_get_prob(c, s->prob->golden)) ? VP8_FRAME_ALTREF
                                                                  : VP8_FRAME_GOLDEN;
        else
            mb->ref_frame = VP8_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame - 1]++;

        // motion vectors, 16.3
        if (is_vp7)
            vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
        else
            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp89_rac_get_tree(c, vp8_pred16x16_tree_inter,
                                     s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp89_rac_get_tree(c, vp8_pred8x8c_tree,
                                                 s->prob->pred8x8c);
        mb->ref_frame        = VP8_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}
   1340 
   1341 /**
   1342 * @param r     arithmetic bitstream reader context
   1343 * @param block destination for block coefficients
   1344 * @param probs probabilities to use when reading trees from the bitstream
   1345 * @param i     initial coeff index, 0 unless a separate DC block is coded
   1346 * @param qmul  array holding the dc/ac dequant factor at position 0/1
   1347 *
   1348 * @return 0 if no coeffs were decoded
   1349 *         otherwise, the index of the last coeff decoded plus one
   1350 */
static av_always_inline
int decode_block_coeffs_internal(VPXRangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, const uint8_t *token_prob, const int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    /* Work on a local copy of the range coder; it is written back once
     * at the end. */
    VPXRangeCoder c = *r;
    goto skip_eob; // the caller already consumed the first non-EOB token
    do {
        int coeff;
restart:
        if (!vpx_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vpx_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            if (vp7)
                goto restart; // VP7 re-checks EOB after a zero coefficient
            goto skip_eob;    // VP8 does not code EOB right after a zero
        }

        if (!vpx_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i + 1][1];
        } else {
            if (!vpx_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vpx_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vpx_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vpx_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vpx_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vpx_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vpx_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vpx_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a   = vpx_rac_get_prob(&c, token_prob[8]);
                    int b   = vpx_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i + 1][2];
        }
        /* Read the sign, dequantize (index 0 uses the DC factor, all
         * others the AC factor) and store at the scan-order position. */
        block[scan[i]] = (vp89_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
   1410 
   1411 static av_always_inline
   1412 int inter_predict_dc(int16_t block[16], int16_t pred[2])
   1413 {
   1414    int16_t dc = block[0];
   1415    int ret = 0;
   1416 
   1417    if (pred[1] > 3) {
   1418        dc += pred[0];
   1419        ret = 1;
   1420    }
   1421 
   1422    if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
   1423        block[0] = pred[0] = dc;
   1424        pred[1] = 0;
   1425    } else {
   1426        if (pred[0] == dc)
   1427            pred[1]++;
   1428        block[0] = pred[0] = dc;
   1429    }
   1430 
   1431    return ret;
   1432 }
   1433 
/* VP7 specialization of the shared coefficient decoder (forces the
 * av_always_inline body to be instantiated with vp7 == 1). */
static int vp7_decode_block_coeffs_internal(VPXRangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, const uint8_t *token_prob,
                                            const int16_t qmul[2],
                                            const uint8_t scan[16])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, scan, IS_VP7);
}
   1444 
/* VP8 specialization; may be overridden by an arch-specific version
 * (e.g. via the ARM header included at the top of this file). */
#ifndef vp8_decode_block_coeffs_internal
static int vp8_decode_block_coeffs_internal(VPXRangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, const uint8_t *token_prob,
                                            const int16_t qmul[2])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, ff_zigzag_scan, IS_VP8);
}
#endif
   1456 
   1457 /**
   1458 * @param c          arithmetic bitstream reader context
   1459 * @param block      destination for block coefficients
   1460 * @param probs      probabilities to use when reading trees from the bitstream
   1461 * @param i          initial coeff index, 0 unless a separate DC block is coded
   1462 * @param zero_nhood the initial prediction context for number of surrounding
   1463 *                   all-zero blocks (only left/top, so 0-2)
   1464 * @param qmul       array holding the dc/ac dequant factor at position 0/1
   1465 * @param scan       scan pattern (VP7 only)
   1466 *
   1467 * @return 0 if no coeffs were decoded
   1468 *         otherwise, the index of the last coeff decoded plus one
   1469 */
   1470 static av_always_inline
   1471 int decode_block_coeffs(VPXRangeCoder *c, int16_t block[16],
   1472                        uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
   1473                        int i, int zero_nhood, const int16_t qmul[2],
   1474                        const uint8_t scan[16], int vp7)
   1475 {
   1476    const uint8_t *token_prob = probs[i][zero_nhood];
   1477    if (!vpx_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
   1478        return 0;
   1479    return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
   1480                                                  token_prob, qmul, scan)
   1481               : vp8_decode_block_coeffs_internal(c, block, probs, i,
   1482                                                  token_prob, qmul);
   1483 }
   1484 
/* Decode all residual coefficients of one macroblock: the optional
 * separate luma DC (WHT) block, 16 luma 4x4 blocks and 8 chroma 4x4
 * blocks, updating the top (t_nnz) and left (l_nnz) non-zero contexts. */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VPXRangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    /* A separate luma DC block is coded for every mode except I4x4 and,
     * in VP8 only, split MV. */
    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  ff_zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |=  inter_predict_dc(td->block_dc,
                                     s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            /* Inverse Walsh-Hadamard transform scatters the DC values
             * into the 16 luma blocks (fast path when only one coeff). */
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        /* Luma blocks then start at coefficient 1 and use the "no DC"
         * token context. */
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
   1559 
   1560 static av_always_inline
   1561 void backup_mb_border(uint8_t *top_border, const uint8_t *src_y,
   1562                      const uint8_t *src_cb, const uint8_t *src_cr,
   1563                      ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
   1564 {
   1565    AV_COPY128(top_border, src_y + 15 * linesize);
   1566    if (!simple) {
   1567        AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
   1568        AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
   1569    }
   1570 }
   1571 
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
                    uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
                    int mb_y, int mb_width, int simple, int xchg)
{
    /* Exchange (xchg=1) or copy back (xchg=0) 8-byte groups between the
     * saved top-border buffer and the pixel row directly above the current
     * macroblock. Called from intra_predict() with xchg=1 before prediction
     * and xchg=0 afterwards. */
    uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
    src_y  -= linesize;                           // point at the row above the MB
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a, b, xchg)                                                      \
    do {                                                                      \
        if (xchg)                                                             \
            AV_SWAP64(b, a);                                                  \
        else                                                                  \
            AV_COPY64(b, a);                                                  \
    } while (0)

    XCHG(top_border_m1 + 8, src_y - 8, xchg);
    XCHG(top_border, src_y, xchg);
    XCHG(top_border + 8, src_y + 8, 1);           // unconditional swap
    if (mb_x < mb_width - 1)
        XCHG(top_border + 32, src_y + 16, 1);     // top-right, if it exists

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1 + 16, src_cb - 8, xchg);
        XCHG(top_border_m1 + 24, src_cr - 8, xchg);
        XCHG(top_border + 16, src_cb, 1);
        XCHG(top_border + 24, src_cr, 1);
    }
}
   1605 
   1606 static av_always_inline
   1607 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
   1608 {
   1609    if (!mb_x)
   1610        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
   1611    else
   1612        return mb_y ? mode : LEFT_DC_PRED8x8;
   1613 }
   1614 
   1615 static av_always_inline
   1616 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
   1617 {
   1618    if (!mb_x)
   1619        return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
   1620    else
   1621        return mb_y ? mode : HOR_PRED8x8;
   1622 }
   1623 
   1624 static av_always_inline
   1625 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
   1626 {
   1627    switch (mode) {
   1628    case DC_PRED8x8:
   1629        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
   1630    case VERT_PRED8x8:
   1631        return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
   1632    case HOR_PRED8x8:
   1633        return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
   1634    case PLANE_PRED8x8: /* TM */
   1635        return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
   1636    }
   1637    return mode;
   1638 }
   1639 
   1640 static av_always_inline
   1641 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
   1642 {
   1643    if (!mb_x) {
   1644        return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
   1645    } else {
   1646        return mb_y ? mode : HOR_VP8_PRED;
   1647    }
   1648 }
   1649 
   1650 static av_always_inline
   1651 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
   1652                                     int *copy_buf, int vp7)
   1653 {
   1654    switch (mode) {
   1655    case VERT_PRED:
   1656        if (!mb_x && mb_y) {
   1657            *copy_buf = 1;
   1658            return mode;
   1659        }
   1660        /* fall-through */
   1661    case DIAG_DOWN_LEFT_PRED:
   1662    case VERT_LEFT_PRED:
   1663        return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
   1664    case HOR_PRED:
   1665        if (!mb_y) {
   1666            *copy_buf = 1;
   1667            return mode;
   1668        }
   1669        /* fall-through */
   1670    case HOR_UP_PRED:
   1671        return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
   1672    case TM_VP8_PRED:
   1673        return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
   1674    case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
   1675                   * as 16x16/8x8 DC */
   1676    case DIAG_DOWN_RIGHT_PRED:
   1677    case VERT_RIGHT_PRED:
   1678    case HOR_DOWN_PRED:
   1679        if (!mb_y || !mb_x)
   1680            *copy_buf = 1;
   1681        return mode;
   1682    }
   1683    return mode;
   1684 }
   1685 
static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
{
    int x, y, mode, nnz;
    uint32_t tr;

    /* for the first row, we need to run xchg_mb_border to init the top edge
     * to 127 otherwise, skip it if we aren't going to deblock */
    /* NOTE(review): the "|| !mb_y" term is dead — it is only evaluated when
     * mb_y is nonzero, so the condition reduces to
     * mb_y && s->deblock_filter && td->thread_nr == 0. */
    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        /* whole-macroblock 16x16 luma prediction */
        mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        const uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        /* synthetic edge fill values: lo is used for missing top pixels,
         * hi for missing left pixels (both 128 for VP7) */
        const uint8_t lo = is_vp7 ? 128 : 127;
        const uint8_t hi = is_vp7 ? 128 : 129;
        const uint8_t tr_top[4] = { lo, lo, lo, lo };

        // all blocks on the right edge of the macroblock use bottom edge
        // the top macroblock for their topright edge
        const uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (mb_y && mb_x == s->mb_width - 1) {
            tr       = tr_right[-1] * 0x01010101u;
            tr_right = (uint8_t *) &tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            const uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0;
                ptrdiff_t linesize = s->linesize;
                uint8_t *dst = ptr + 4 * x;
                /* 8-byte-wide scratch: byte 3 = top-left, bytes 4..7 = top
                 * row, bytes 11/19/27/35 = left column, block at offset 12 */
                LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);

                if ((y == 0 || x == 3) && mb_y == 0) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
                                                        mb_y + y, &copy, is_vp7);
                if (copy) {
                    /* predict into the scratch buffer with synthesised
                     * edges, then copy the 4x4 result back below */
                    dst      = copy_dst + 12;
                    linesize = 8;
                    if (!(mb_y + y)) {
                        copy_dst[3] = lo;
                        AV_WN32A(copy_dst + 4, lo * 0x01010101U);
                    } else {
                        AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
                        if (!(mb_x + x)) {
                            copy_dst[3] = hi;
                        } else {
                            copy_dst[3] = ptr[4 * x - s->linesize - 1];
                        }
                    }
                    if (!(mb_x + x)) {
                        copy_dst[11] =
                        copy_dst[19] =
                        copy_dst[27] =
                        copy_dst[35] = hi;
                    } else {
                        copy_dst[11] = ptr[4 * x                   - 1];
                        copy_dst[19] = ptr[4 * x + s->linesize     - 1];
                        copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
                        copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
                    }
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
                    AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
                    AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
                    AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
                }

                /* add the residual immediately: subsequent blocks predict
                 * from these reconstructed pixels */
                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
                                                  td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr + 4 * x,
                                               td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr      += 4 * s->linesize;
            intra4x4 += 4;
        }
    }

    /* chroma: one 8x8 prediction per plane, shared mode */
    mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
                                            mb_x, mb_y, is_vp7);
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    /* restore the (filtered) top border swapped out above */
    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}
   1800 
/* Sub-pixel MC parameters indexed by the 3-bit subpel position (0..7). */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
   1807 
   1808 /**
   1809 * luma MC function
   1810 *
   1811 * @param s        VP8 decoding context
   1812 * @param dst      target buffer for block data at block position
   1813 * @param ref      reference picture buffer at origin (0, 0)
   1814 * @param mv       motion vector (relative to block position) to get pixel data from
   1815 * @param x_off    horizontal position of block from origin (0, 0)
   1816 * @param y_off    vertical position of block from origin (0, 0)
   1817 * @param block_w  width of block (16, 8 or 4)
   1818 * @param block_h  height of block (always same as block_w)
   1819 * @param width    width of src/dst plane data
   1820 * @param height   height of src/dst plane data
   1821 * @param linesize size of a single line of plane data, including padding
   1822 * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
   1823 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
                 const ProgressFrame *ref, const VP8mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, ptrdiff_t linesize,
                 vp8_mc_func mc_func[3][3])
{
    const uint8_t *src = ref->f->data[0];

    if (AV_RN32A(mv)) { /* nonzero MV: may need sub-pixel filtering */
        ptrdiff_t src_linesize = linesize;

        /* luma MVs are in 1/4-pel units (see >> 2 below); *2 converts the
         * fractional part to the 1/8-pel index used by subpel_idx */
        int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

        // edge emulation
        /* wait until the reference rows needed (block + filter margin)
         * have been decoded by the frame thread owning `ref` */
        ff_progress_frame_await(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4);
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            /* source area (incl. filter margin) crosses the frame edge:
             * build a padded copy in the per-thread edge buffer */
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src - my_idx * linesize - mx_idx,
                                     EDGE_EMU_LINESIZE, linesize,
                                     block_w + subpel_idx[1][mx],
                                     block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx,
                                     width, height);
            src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
            src_linesize = EDGE_EMU_LINESIZE;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
    } else {
        /* zero MV: plain copy, no filter margin needed */
        ff_progress_frame_await(ref, (3 + y_off + block_h) >> 4);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
                      linesize, block_h, 0, 0);
    }
}
   1864 
   1865 /**
   1866 * chroma MC function
   1867 *
   1868 * @param s        VP8 decoding context
   1869 * @param dst1     target buffer for block data at block position (U plane)
   1870 * @param dst2     target buffer for block data at block position (V plane)
   1871 * @param ref      reference picture buffer at origin (0, 0)
   1872 * @param mv       motion vector (relative to block position) to get pixel data from
   1873 * @param x_off    horizontal position of block from origin (0, 0)
   1874 * @param y_off    vertical position of block from origin (0, 0)
   1875 * @param block_w  width of block (16, 8 or 4)
   1876 * @param block_h  height of block (always same as block_w)
   1877 * @param width    width of src/dst plane data
   1878 * @param height   height of src/dst plane data
   1879 * @param linesize size of a single line of plane data, including padding
   1880 * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
   1881 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
                   uint8_t *dst2, const ProgressFrame *ref, const VP8mv *mv,
                   int x_off, int y_off, int block_w, int block_h,
                   int width, int height, ptrdiff_t linesize,
                   vp8_mc_func mc_func[3][3])
{
    const uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];

    if (AV_RN32A(mv)) { /* nonzero MV: may need sub-pixel filtering */
        /* chroma MVs are in 1/8-pel units (see >> 3 below) */
        int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
        int my = mv->y & 7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        /* wait until the reference rows needed (block + filter margin)
         * have been decoded */
        ff_progress_frame_await(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            /* source area crosses the frame edge: pad each chroma plane
             * into the edge buffer before filtering (the buffer is reused
             * for the second plane, hence the interleaved MC calls) */
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src1 - my_idx * linesize - mx_idx,
                                     EDGE_EMU_LINESIZE, linesize,
                                     block_w + subpel_idx[1][mx],
                                     block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);

            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src2 - my_idx * linesize - mx_idx,
                                     EDGE_EMU_LINESIZE, linesize,
                                     block_w + subpel_idx[1][mx],
                                     block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        /* zero MV: plain copy of both planes */
        ff_progress_frame_await(ref, (3 + y_off + block_h) >> 3);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}
   1931 
static av_always_inline
void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
                 const ProgressFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off, int block_w, int block_h,
                 int width, int height, const VP8mv *mv)
{
    /* Motion-compensate one rectangular partition: luma at full
     * resolution, then both chroma planes at half resolution with the
     * same MV. */
    VP8mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        /* this block only applies VP8; it is safe to check
         * only the profile, as VP7 profile <= 1 */
        uvmv.x &= ~7;   // profile 3: chroma MVs rounded to full pel
        uvmv.y &= ~7;
    }
    /* halve all offsets and dimensions for the subsampled chroma planes */
    x_off   >>= 1;
    y_off   >>= 1;
    bx_off  >>= 1;
    by_off  >>= 1;
    width   >>= 1;
    height  >>= 1;
    block_w >>= 1;
    block_h >>= 1;
    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}
   1967 
   1968 /* Fetch pixels for estimated mv 4 macroblocks ahead.
   1969 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
static av_always_inline
void prefetch_motion(const VP8Context *s, const VP8Macroblock *mb,
                     int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        /* estimate the source position using the current MB's MV,
         * offset ahead in the row (see the comment above this function) */
        int mx = (mb->mv.x >> 2) + x_off + 8;
        int my = (mb->mv.y >> 2) + y_off;
        uint8_t **src = s->framep[ref]->tf.f->data;
        int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder. Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->vdsp.prefetch(src[0] + off, s->linesize, 4);
        /* chroma: prefetch both planes via the U/V plane distance */
        off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
        s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
    }
}
   1989 
   1990 /**
   1991 * Apply motion vectors to prediction buffer, chapter 18.
   1992 */
static av_always_inline
void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16 * s->mb_width, height = 16 * s->mb_height;
    const ProgressFrame *ref = &s->framep[mb->ref_frame]->tf;
    const VP8mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        /* one MV for the whole 16x16 macroblock */
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP8mv uvmv;

        /* Y: sixteen independent 4x4 blocks */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
                            ref, &bmv[4 * y + x],
                            4 * x + x_off, 4 * y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V: each 4x4 chroma block uses the rounded average of the
         * four co-located luma MVs */
        x_off  >>= 1;
        y_off  >>= 1;
        width  >>= 1;
        height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
                         mb->bmv[2 * y       * 4 + 2 * x + 1].x +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
                uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
                         mb->bmv[2 * y       * 4 + 2 * x + 1].y +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
                         mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
                /* sum/4 with rounding; the FF_SIGNBIT term adjusts the
                 * rounding for negative sums */
                uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
                uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
                if (s->profile == 3) {
                    /* profile 3: chroma MVs rounded to full pel */
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
                              dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
                              &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        /* two horizontal halves, one MV each */
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        /* two vertical halves, one MV each */
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        /* four quadrants, one MV each */
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}
   2076 
static av_always_inline
void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
             const VP8Macroblock *mb)
{
    int x, y, ch;

    /* luma; MODE_I4x4 already added its residual during intra_predict() */
    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            /* the 4 per-block coefficient counts of this row, packed
             * one byte each (little-endian: low byte = leftmost block) */
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4 & ~0x01010101) {
                    /* at least one block has AC coefficients */
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t) nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
                                                      td->block[y][x],
                                                      s->linesize);
                        else if ((uint8_t) nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
                                                   td->block[y][x],
                                                   s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;  // remaining blocks in this row are empty
                    }
                } else {
                    /* every count is 0 or 1: DC-only fast path for the row */
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4 * s->linesize;
        }
    }

    /* chroma: same scheme per plane, 2x2 blocks */
    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1 + ch];
            if (nnz4 & ~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t) nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
                                                      td->block[4 + ch][(y << 1) + x],
                                                      s->uvlinesize);
                        else if ((uint8_t) nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
                                                   td->block[4 + ch][(y << 1) + x],
                                                   s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            goto chroma_idct_end;  // rest of plane is empty
                    }
                    ch_dst += 4 * s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
            }
        }
chroma_idct_end:
        ;
    }
}
   2139 
   2140 static av_always_inline
   2141 void filter_level_for_mb(const VP8Context *s, const VP8Macroblock *mb,
   2142                         VP8FilterStrength *f, int is_vp7)
   2143 {
   2144    int interior_limit, filter_level;
   2145 
   2146    if (s->segmentation.enabled) {
   2147        filter_level = s->segmentation.filter_level[mb->segment];
   2148        if (!s->segmentation.absolute_vals)
   2149            filter_level += s->filter.level;
   2150    } else
   2151        filter_level = s->filter.level;
   2152 
   2153    if (s->lf_delta.enabled) {
   2154        filter_level += s->lf_delta.ref[mb->ref_frame];
   2155        filter_level += s->lf_delta.mode[mb->mode];
   2156    }
   2157 
   2158    filter_level = av_clip_uintp2(filter_level, 6);
   2159 
   2160    interior_limit = filter_level;
   2161    if (s->filter.sharpness) {
   2162        interior_limit >>= (s->filter.sharpness + 3) >> 2;
   2163        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
   2164    }
   2165    interior_limit = FFMAX(interior_limit, 1);
   2166 
   2167    f->filter_level = filter_level;
   2168    f->inner_limit = interior_limit;
   2169    f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
   2170                      mb->mode == VP8_MVMODE_SPLIT;
   2171 }
   2172 
/**
 * Apply the normal (non-simple) in-loop deblocking filter to one macroblock.
 *
 * @param s      decoder context (DSP function table, line sizes)
 * @param dst    top-left pixel of this MB in the Y, U and V planes
 * @param f      per-MB filter strength (see filter_level_for_mb())
 * @param mb_x   MB column; 0 suppresses filtering of the left MB edge
 * @param mb_y   MB row;    0 suppresses filtering of the top MB edge
 * @param is_vp7 selects VP7 edge-limit derivation and inner-edge ordering
 */
static av_always_inline
void filter_mb(const VP8Context *s, uint8_t *const dst[3], const VP8FilterStrength *f,
               int mb_x, int mb_y, int is_vp7)
{
    int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    ptrdiff_t linesize   = s->linesize;
    ptrdiff_t uvlinesize = s->uvlinesize;
    /* High-edge-variance threshold, indexed by [keyframe][filter_level]. */
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    /* A zero level disables deblocking for this macroblock entirely. */
    if (!filter_level)
        return;

    /* Edge limits: VP7 derives them from the filter level alone, VP8 also
     * folds in the inner limit; MB edges always get the stronger limit. */
    if (is_vp7) {
        bedge_lim_y  = filter_level;
        bedge_lim_uv = filter_level * 2;
        mbedge_lim   = filter_level + 2;
    } else {
        bedge_lim_y  =
        bedge_lim_uv = filter_level * 2 + inner_limit;
        mbedge_lim   = bedge_lim_y + 4;
    }

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    /* Left macroblock edge (skipped for the first MB column). */
    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

/* Interior vertical edges at x = 4/8/12 (luma) and x = 4 (chroma).
 * Expanded twice below; `cond` selects which expansion is active so the
 * same code can run either before or after the vertical-edge filtering. */
#define H_LOOP_FILTER_16Y_INNER(cond)                                         \
    if (cond && inner_filter) {                                               \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
                                             bedge_lim_y, inner_limit,        \
                                             hev_thresh);                     \
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
                                             uvlinesize,  bedge_lim_uv,       \
                                             inner_limit, hev_thresh);        \
    }

    /* VP8 filters the interior vertical edges here (before the horizontal
     * ones); for VP7 this expansion is inactive and runs at the end. */
    H_LOOP_FILTER_16Y_INNER(!is_vp7)

    /* Top macroblock edge (skipped for the first MB row). */
    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    /* Interior horizontal edges at y = 4/8/12 (luma) and y = 4 (chroma). */
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
                                             linesize, bedge_lim_y,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
                                             linesize, bedge_lim_y,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
                                             linesize, bedge_lim_y,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
                                             dst[2] +  4 * uvlinesize,
                                             uvlinesize, bedge_lim_uv,
                                             inner_limit, hev_thresh);
    }

    /* VP7 ordering: interior vertical edges are filtered last. */
    H_LOOP_FILTER_16Y_INNER(is_vp7)
}
   2259 
   2260 static av_always_inline
   2261 void filter_mb_simple(const VP8Context *s, uint8_t *dst, const VP8FilterStrength *f,
   2262                      int mb_x, int mb_y)
   2263 {
   2264    int mbedge_lim, bedge_lim;
   2265    int filter_level = f->filter_level;
   2266    int inner_limit  = f->inner_limit;
   2267    int inner_filter = f->inner_filter;
   2268    ptrdiff_t linesize = s->linesize;
   2269 
   2270    if (!filter_level)
   2271        return;
   2272 
   2273    bedge_lim  = 2 * filter_level + inner_limit;
   2274    mbedge_lim = bedge_lim + 4;
   2275 
   2276    if (mb_x)
   2277        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
   2278    if (inner_filter) {
   2279        s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
   2280        s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
   2281        s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
   2282    }
   2283 
   2284    if (mb_y)
   2285        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
   2286    if (inner_filter) {
   2287        s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
   2288        s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
   2289        s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
   2290    }
   2291 }
   2292 
/* Motion vectors may point up to MARGIN units outside the frame. */
#define MARGIN (16 << 2)
/**
 * Decode mode/MV information for every macroblock of the frame in one pass
 * (used with the two-row macroblock layout; the per-row path decodes modes
 * inline instead).
 *
 * @return 0 on success, AVERROR_INVALIDDATA if the range coder ran dry.
 */
static av_always_inline
int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
                            const VP8Frame *prev_frame, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_bounds.mv_min.y = -MARGIN;
    s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        /* Row start in macroblocks_base; +1 offsets skip the dummy
         * left/top border entries used for prediction context. */
        VP8Macroblock *mb = s->macroblocks_base +
                            ((s->mb_width + 1) * (mb_y + 1) + 1);
        int mb_xy = mb_y * s->mb_width;

        /* Reset left intra prediction context at the start of each row. */
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);

        s->mv_bounds.mv_min.x = -MARGIN;
        s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (vpx_rac_is_end(&s->c)) {
                return AVERROR_INVALIDDATA;
            }
            /* Top row has no above neighbor: seed its top prediction
             * context with DC_PRED. */
            if (mb_y == 0)
                AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
                         DC_PRED * 0x01010101);
            /* prev_frame's segmentation map may be reused when the map is
             * not updated this frame. */
            decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map + mb_xy : NULL, 1, is_vp7);
            /* Shift the clamping window as we advance one MB (64 = 16px
             * in the MV unit used here). */
            s->mv_bounds.mv_min.x -= 64;
            s->mv_bounds.mv_max.x -= 64;
        }
        s->mv_bounds.mv_min.y -= 64;
        s->mv_bounds.mv_max.y -= 64;
    }
    return 0;
}
   2331 
/* VP7 entry point for the shared whole-frame mode/MV decoding pass. */
static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                  const VP8Frame *prev_frame)
{
    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
}
   2337 
/* VP8 entry point for the shared whole-frame mode/MV decoding pass. */
static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                  const VP8Frame *prev_frame)
{
    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
}
   2343 
#if HAVE_THREADS
/* Sliced-threading synchronization. Thread positions are packed into a
 * single int as (mb_y << 16) | mb_x so they can be compared atomically.
 *
 * check_thread_pos: block until the other thread (otd) has advanced to at
 * least macroblock (mb_x_check, mb_y_check). The waited-for position is
 * published in td->wait_mb_pos so the other thread knows when a broadcast
 * is needed; it is reset to INT_MAX once the wait completes. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
        if (atomic_load(&otd->thread_mb_pos) < tmp) {                         \
            pthread_mutex_lock(&otd->lock);                                   \
            atomic_store(&td->wait_mb_pos, tmp);                              \
            do {                                                              \
                if (atomic_load(&otd->thread_mb_pos) >= tmp)                  \
                    break;                                                    \
                pthread_cond_wait(&otd->cond, &otd->lock);                    \
            } while (1);                                                      \
            atomic_store(&td->wait_mb_pos, INT_MAX);                          \
            pthread_mutex_unlock(&otd->lock);                                 \
        }                                                                     \
    } while (0)

/* update_pos: publish this thread's current decode position and, when
 * sliced threading is active and a neighboring thread is waiting at or
 * before the new position, wake it via a condvar broadcast. Relies on
 * `avctx`, `num_jobs`, `prev_td` and `next_td` from the calling scope. */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = (mb_y << 16) | (mb_x & 0xFFFF);                \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) ||   \
            (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos));     \
        atomic_store(&td->thread_mb_pos, pos);                                \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&td->lock);                                    \
            pthread_cond_broadcast(&td->cond);                                \
            pthread_mutex_unlock(&td->lock);                                  \
        }                                                                     \
    } while (0)
#else
/* No-op fallbacks for single-threaded builds. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
#define update_pos(td, mb_y, mb_x) while(0)
#endif
   2381 
/**
 * Decode one macroblock row (modes, coefficients, prediction, IDCT) without
 * applying the loop filter; deblocking is done separately by filter_mb_row().
 * The row to decode is taken from td->thread_mb_pos.
 *
 * @return 0 on success, AVERROR_INVALIDDATA if a range coder ran dry.
 */
static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
    int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
    int mb_x, mb_xy = mb_y * s->mb_width;
    int num_jobs = s->num_jobs;
    const VP8Frame *prev_frame = s->prev_frame;
    VP8Frame *curframe = s->curframe;
    /* Coefficient partitions are assigned to rows round-robin;
     * num_coeff_partitions is a power of two, so & works as modulo. */
    VPXRangeCoder *coeff_c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];

    VP8Macroblock *mb;
    uint8_t *dst[3] = {
        curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
        curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (vpx_rac_is_end(&s->c))
         return AVERROR_INVALIDDATA;

    /* Neighboring rows are decoded by adjacent jobs (rows are distributed
     * round-robin); first/last rows have no neighbor to wait on. */
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];
    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else {
        // Make sure the previous frame has read its segmentation map,
        // if we re-use the same map.
        if (prev_frame && s->segmentation.enabled &&
            !s->segmentation.update_map)
            ff_progress_frame_await(&prev_frame->tf, mb_y);
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
        memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
    }

    /* VP8 resets the left non-zero-coefficient context every row;
     * VP7 only at the top of the frame. */
    if (!is_vp7 || mb_y == 0)
        memset(td->left_nnz, 0, sizeof(td->left_nnz));

    td->mv_bounds.mv_min.x = -MARGIN;
    td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
        if (vpx_rac_is_end(&s->c))
            return AVERROR_INVALIDDATA;
        // Wait for previous thread to read mb_x+2, and reach mb_y-1.
        if (prev_td != td) {
            if (threadnr != 0) {
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1),
                                 mb_y - (is_vp7 ? 2 : 1));
            } else {
                /* Thread 0's neighbor also runs the filter pass, whose
                 * positions are offset by mb_width + 3 (see filter_mb_row). */
                check_thread_pos(td, prev_td,
                                 mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
                                 mb_y - (is_vp7 ? 2 : 1));
            }
        }

        s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
                         s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
                         dst[2] - dst[1], 2);

        /* With the two-row layout, modes were already decoded up front by
         * vp78_decode_mv_mb_modes(). */
        if (!s->mb_layout)
            decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map + mb_xy : NULL, 0, is_vp7);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_PREVIOUS);

        if (!mb->skip) {
            if (vpx_rac_is_end(coeff_c))
                return AVERROR_INVALIDDATA;
            decode_mb_coeffs(s, td, coeff_c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
        }

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

            /* Reset DC block predictors if they would exist
             * if the mb had coefficients */
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);

        /* With multiple jobs, the last job saves the unfiltered border row
         * needed by the next row's intra prediction before filtering runs. */
        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_ALTREF);

        dst[0]      += 16;
        dst[1]      += 8;
        dst[2]      += 8;
        td->mv_bounds.mv_min.x -= 64;
        td->mv_bounds.mv_max.x -= 64;

        /* NOTE(review): mb_x < s->mb_width inside this loop, so this branch
         * looks unreachable — confirm against upstream history. */
        if (mb_x == s->mb_width + 1) {
            update_pos(td, mb_y, s->mb_width + 3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }
    return 0;
}
   2514 
/* VP7 entry point for the shared row decoder (installed in vp78_decode_init). */
static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr)
{
    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
}
   2520 
/* VP8 entry point for the shared row decoder (installed in vp78_decode_init). */
static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
                                        int jobnr, int threadnr)
{
    return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
}
   2526 
/**
 * Apply the in-loop deblocking filter to one macroblock row (the row is
 * taken from td->thread_mb_pos). Runs after decode_mb_row_no_filter() for
 * the same row; per-MB filter strengths were stored in td->filter_strength.
 */
static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[threadnr];
    int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
    AVFrame *curframe = s->curframe->tf.f;
    VP8Macroblock *mb;
    VP8ThreadData *prev_td, *next_td;
    uint8_t *dst[3] = {
        curframe->data[0] + 16 * mb_y * s->linesize,
        curframe->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;

    /* Neighboring rows are processed by adjacent jobs (round-robin). */
    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];

    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
        const VP8FilterStrength *f = &td->filter_strength[mb_x];
        /* Filter positions are published with an mb_width + 3 offset so
         * they are ordered after the decode positions of the same row. */
        if (prev_td != td)
            check_thread_pos(td, prev_td,
                             (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
        if (next_td != td)
            if (next_td != &s->thread_data[0])
                check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);

        /* Single job: save the unfiltered border for the next row's intra
         * prediction before it gets filtered (multi-job case does this in
         * decode_mb_row_no_filter). */
        if (num_jobs == 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        if (s->filter.simple)
            filter_mb_simple(s, dst[0], f, mb_x, mb_y);
        else
            filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;

        update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
    }
}
   2585 
/* VP7 entry point for the shared row filter (installed in vp78_decode_init). */
static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
}
   2591 
/* VP8 entry point for the shared row filter (installed in vp78_decode_init). */
static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
}
   2597 
/**
 * Worker entry for avctx->execute2(): decode (and optionally filter) all
 * macroblock rows assigned to this job, rows being distributed round-robin
 * across num_jobs jobs. is_vp7 is unused here; the per-codec behavior comes
 * from the s->decode_mb_row_no_filter / s->filter_mb_row function pointers.
 *
 * @return 0 on success, a negative AVERROR code on decode failure.
 */
static av_always_inline
int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
                              int threadnr, int is_vp7)
{
    const VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[jobnr];
    VP8ThreadData *next_td = NULL, *prev_td = NULL;
    VP8Frame *curframe = s->curframe;
    int mb_y, num_jobs = s->num_jobs;
    int ret;

    td->thread_nr = threadnr;
    /* Vertical MV clamping window, pre-shifted for this job's first row. */
    td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
    td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
    for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
        atomic_store(&td->thread_mb_pos, mb_y << 16);
        ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
        if (ret < 0) {
            /* Report completion so waiting sibling jobs don't deadlock. */
            update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
            return ret;
        }
        if (s->deblock_filter)
            s->filter_mb_row(avctx, tdata, jobnr, threadnr);
        update_pos(td, mb_y, INT_MAX & 0xFFFF);

        /* Advance the clamping window by the num_jobs rows we skip. */
        td->mv_bounds.mv_min.y -= 64 * num_jobs;
        td->mv_bounds.mv_max.y -= 64 * num_jobs;

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            ff_progress_frame_report(&curframe->tf, mb_y);
    }

    return 0;
}
   2632 
/* VP7 execute2() callback wrapping the shared sliced row decoder. */
static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
}
   2638 
/* VP8 execute2() callback wrapping the shared sliced row decoder. */
static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
}
   2644 
/**
 * Decode one VP7/VP8 frame: parse the frame header, manage the reference
 * frame slots (last/golden/altref), then either hand the bitstream to a
 * hwaccel or run the software sliced/threaded macroblock decode.
 *
 * @param rframe    output frame (a new reference to the decoded picture)
 * @param got_frame set to 1 when a displayable frame was produced
 * @return avpkt->size on success, a negative AVERROR code otherwise.
 */
static av_always_inline
int vp78_decode_frame(AVCodecContext *avctx, AVFrame *rframe, int *got_frame,
                      const AVPacket *avpkt, int is_vp7)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    VP8Frame *av_uninit(curframe), *prev_frame;

    if (is_vp7)
        ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
    else
        ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);

    if (ret < 0)
        goto err;

    if (!is_vp7 && s->actually_webp) {
        // VP8 in WebP is supposed to be intra-only. Enforce this here
        // to ensure that output is reproducible with frame-threading.
        if (!s->keyframe)
            return AVERROR_INVALIDDATA;
        // avctx->pix_fmt already set in caller.
    } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0) {
            ret = AVERROR(EINVAL);
            goto err;
        }
        avctx->pix_fmt = s->pix_fmt;
    }

    prev_frame = s->framep[VP8_FRAME_CURRENT];

    /* The frame is "referenced" if any reference slot will be updated
     * to point at it. */
    referenced = s->update_last || s->update_golden == VP8_FRAME_CURRENT ||
                 s->update_altref == VP8_FRAME_CURRENT;

    /* Discard threshold: unreferenced frames can go at NONREF, inter
     * frames at NONKEY, keyframes only at ALL. */
    skip_thresh = !referenced ? AVDISCARD_NONREF
                              : !s->keyframe ? AVDISCARD_NONKEY
                                             : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP8_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP8_FRAME_GOLDEN]   &&
            &s->frames[i] != s->framep[VP8_FRAME_ALTREF])
            vp8_release_frame(&s->frames[i]);

    curframe = s->framep[VP8_FRAME_CURRENT] = vp8_find_free_buffer(s);

    if (!s->colorspace)
        avctx->colorspace = AVCOL_SPC_BT470BG;
    if (s->fullrange)
        avctx->color_range = AVCOL_RANGE_JPEG;
    else
        avctx->color_range = AVCOL_RANGE_MPEG;

    /* Given that arithmetic probabilities are updated every frame, it's quite
     * likely that the values we have on a random interframe are complete
     * junk if we didn't start decode on a keyframe. So just don't display
     * anything rather than junk. */
    if (!s->keyframe && (!s->framep[VP8_FRAME_PREVIOUS] ||
                         !s->framep[VP8_FRAME_GOLDEN]   ||
                         !s->framep[VP8_FRAME_ALTREF])) {
        av_log(avctx, AV_LOG_WARNING,
               "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
        goto err;
    if (s->keyframe)
        curframe->tf.f->flags |= AV_FRAME_FLAG_KEY;
    else
        curframe->tf.f->flags &= ~AV_FRAME_FLAG_KEY;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                            : AV_PICTURE_TYPE_P;

    // check if golden and altref are swapped
    if (s->update_altref != VP8_FRAME_NONE)
        s->next_framep[VP8_FRAME_ALTREF] = s->framep[s->update_altref];
    else
        s->next_framep[VP8_FRAME_ALTREF] = s->framep[VP8_FRAME_ALTREF];

    if (s->update_golden != VP8_FRAME_NONE)
        s->next_framep[VP8_FRAME_GOLDEN] = s->framep[s->update_golden];
    else
        s->next_framep[VP8_FRAME_GOLDEN] = s->framep[VP8_FRAME_GOLDEN];

    if (s->update_last)
        s->next_framep[VP8_FRAME_PREVIOUS] = curframe;
    else
        s->next_framep[VP8_FRAME_PREVIOUS] = s->framep[VP8_FRAME_PREVIOUS];

    s->next_framep[VP8_FRAME_CURRENT] = curframe;

    if (!is_vp7 && !s->actually_webp)
        ff_thread_finish_setup(avctx);

    if (avctx->hwaccel) {
        /* Hardware path: pass the packet through the hwaccel callbacks. */
        const FFHWAccel *hwaccel = ffhwaccel(avctx->hwaccel);
        ret = hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
        if (ret < 0)
            goto err;

        ret = hwaccel->end_frame(avctx);
        if (ret < 0)
            goto err;

    } else {
        /* Software path: reset prediction contexts, then decode all rows. */
        s->linesize   = curframe->tf.f->linesize[0];
        s->uvlinesize = curframe->tf.f->linesize[1];

        memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
        /* Zero macroblock structures for top/top-left prediction
         * from outside the frame. */
        if (!s->mb_layout)
            memset(s->macroblocks + s->mb_height * 2 - 1, 0,
                   (s->mb_width + 1) * sizeof(*s->macroblocks));
        if (!s->mb_layout && s->keyframe)
            memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);

        memset(s->ref_count, 0, sizeof(s->ref_count));

        if (s->mb_layout == 1) {
            // Make sure the previous frame has read its segmentation map,
            // if we re-use the same map.
            if (prev_frame && s->segmentation.enabled &&
                !s->segmentation.update_map)
                ff_progress_frame_await(&prev_frame->tf, 1);
            if (is_vp7)
                ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
            else
                ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
            if (ret < 0)
                goto err;
        }

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            num_jobs = 1;
        else
            num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
        s->num_jobs   = num_jobs;
        s->curframe   = curframe;
        s->prev_frame = prev_frame;
        s->mv_bounds.mv_min.y   = -MARGIN;
        s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
        for (i = 0; i < MAX_THREADS; i++) {
            VP8ThreadData *td = &s->thread_data[i];
            atomic_init(&td->thread_mb_pos, 0);
            atomic_init(&td->wait_mb_pos, INT_MAX);
        }
        if (is_vp7)
            avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
        else
            avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                            num_jobs);
    }

    ff_progress_frame_report(&curframe->tf, INT_MAX);
    /* Commit the new reference slot assignments. */
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    /* s->invisible is set on the skip path above, so curframe is only
     * read here when it was actually decoded. */
    if (!s->invisible) {
        if ((ret = av_frame_ref(rframe, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}
   2839 
/* Public VP8 decode entry point (also used by the WebP decoder). */
int ff_vp8_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                        int *got_frame, AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP8);
}
   2845 
   2846 #if CONFIG_VP7_DECODER
/* VP7 decode callback registered in ff_vp7_decoder. */
static int vp7_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                            int *got_frame, AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP7);
}
   2852 #endif /* CONFIG_VP7_DECODER */
   2853 
/* Decoder close callback: release all frames and buffers (the `1` requests
 * freeing the persistent allocations too, not just a flush). */
av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 1);

    return 0;
}
   2860 
/**
 * Shared init for the VP7 and VP8 decoders: set up DSP/prediction function
 * tables and install the per-codec row decode/filter callbacks.
 */
static av_always_inline
int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
{
    VP8Context *s = avctx->priv_data;

    s->avctx = avctx;
    /* Real pixel format is chosen on the first frame (get_pixel_format). */
    s->pix_fmt = AV_PIX_FMT_NONE;
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;

    ff_videodsp_init(&s->vdsp, 8);

    ff_vp78dsp_init(&s->vp8dsp);
    /* The CONFIG_* conditions let the compiler drop the unused codec's
     * code path when only one of the two decoders is built. */
    if (CONFIG_VP7_DECODER && is_vp7) {
        ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
        ff_vp7dsp_init(&s->vp8dsp);
        s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
        s->filter_mb_row           = vp7_filter_mb_row;
    } else if (CONFIG_VP8_DECODER && !is_vp7) {
        ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
        ff_vp8dsp_init(&s->vp8dsp);
        s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
        s->filter_mb_row           = vp8_filter_mb_row;
    }

    /* does not change for VP8 */
    memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));

    return 0;
}
   2890 
   2891 #if CONFIG_VP7_DECODER
/* VP7 init callback registered in ff_vp7_decoder. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
   2896 #endif /* CONFIG_VP7_DECODER */
   2897 
/* Public VP8 init entry point (also used by the WebP decoder). */
av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP8);
}
   2902 
   2903 #if CONFIG_VP8_DECODER
   2904 #if HAVE_THREADS
/* Make dst hold references to the same frame data, segmentation map and
 * hwaccel state as src (dropping whatever dst referenced before). */
static void vp8_replace_frame(VP8Frame *dst, const VP8Frame *src)
{
    ff_progress_frame_replace(&dst->tf, &src->tf);
    av_refstruct_replace(&dst->seg_map, src->seg_map);
    av_refstruct_replace(&dst->hwaccel_picture_private,
                          src->hwaccel_picture_private);
}
   2912 
   2913 #define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)
   2914 
   2915 static int vp8_decode_update_thread_context(AVCodecContext *dst,
   2916                                            const AVCodecContext *src)
   2917 {
   2918    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
   2919 
   2920    if (s->macroblocks_base &&
   2921        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
   2922        free_buffers(s);
   2923        s->mb_width  = s_src->mb_width;
   2924        s->mb_height = s_src->mb_height;
   2925    }
   2926 
   2927    s->pix_fmt      = s_src->pix_fmt;
   2928    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
   2929    s->segmentation = s_src->segmentation;
   2930    s->lf_delta     = s_src->lf_delta;
   2931    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
   2932 
   2933    for (int i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++)
   2934        vp8_replace_frame(&s->frames[i], &s_src->frames[i]);
   2935 
   2936    s->framep[0] = REBASE(s_src->next_framep[0]);
   2937    s->framep[1] = REBASE(s_src->next_framep[1]);
   2938    s->framep[2] = REBASE(s_src->next_framep[2]);
   2939    s->framep[3] = REBASE(s_src->next_framep[3]);
   2940 
   2941    return 0;
   2942 }
   2943 #endif /* HAVE_THREADS */
   2944 #endif /* CONFIG_VP8_DECODER */
   2945 
#if CONFIG_VP7_DECODER
const FFCodec ff_vp7_decoder = {
    .p.name                = "vp7",
    CODEC_LONG_NAME("On2 VP7"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,    /* teardown shared with VP8 */
    FF_CODEC_DECODE_CB(vp7_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1,      /* no threading, unlike VP8 */
    .flush                 = vp8_decode_flush,      /* flush shared with VP8 */
    .caps_internal         = FF_CODEC_CAP_USES_PROGRESSFRAMES,
};
#endif /* CONFIG_VP7_DECODER */
   2961 
#if CONFIG_VP8_DECODER
const FFCodec ff_vp8_decoder = {
    .p.name                = "vp8",
    CODEC_LONG_NAME("On2 VP8"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    FF_CODEC_DECODE_CB(ff_vp8_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .caps_internal         = FF_CODEC_CAP_USES_PROGRESSFRAMES,
    .flush                 = vp8_decode_flush,
    UPDATE_THREAD_CONTEXT(vp8_decode_update_thread_context),
    /* NULL-terminated list of supported hardware decode backends */
    .hw_configs            = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */