tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

decode.c (162202B)


      1 /*
      2 * Copyright © 2018-2021, VideoLAN and dav1d authors
      3 * Copyright © 2018, Two Orioles, LLC
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "config.h"
     29 
     30 #include <errno.h>
     31 #include <limits.h>
     32 #include <string.h>
     33 #include <stdio.h>
     34 #include <inttypes.h>
     35 
     36 #include "dav1d/data.h"
     37 
     38 #include "common/frame.h"
     39 #include "common/intops.h"
     40 
     41 #include "src/ctx.h"
     42 #include "src/decode.h"
     43 #include "src/dequant_tables.h"
     44 #include "src/env.h"
     45 #include "src/filmgrain.h"
     46 #include "src/log.h"
     47 #include "src/qm.h"
     48 #include "src/recon.h"
     49 #include "src/ref.h"
     50 #include "src/tables.h"
     51 #include "src/thread_task.h"
     52 #include "src/warpmv.h"
     53 
     54 static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
     55                              const Dav1dFrameHeader *const frame_hdr,
     56                              const int qidx, uint16_t (*dq)[3][2])
     57 {
     58    for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) {
     59        const int yac = frame_hdr->segmentation.enabled ?
     60            iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx;
     61        const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta);
     62        const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta);
     63        const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta);
     64        const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta);
     65        const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta);
     66 
     67        dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0];
     68        dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1];
     69        dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0];
     70        dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1];
     71        dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0];
     72        dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1];
     73    }
     74 }
     75 
/* Decodes one motion-vector component residual (one of x/y).
 *
 * mv_prec encodes the frame's MV precision:
 *   < 0: force_integer_mv  -> neither fractional nor high-precision bits coded
 *   = 0: fractional bits coded, no high-precision bit
 *   > 0: allow_high_precision_mv -> fractional and high-precision bits coded
 *
 * The order of the dav1d_msac_* calls (sign, class, integer offset bits,
 * fractional, high-precision) is fixed by the bitstream and must not change.
 * Returns the signed component difference, assembled as
 * ((up << 3) | (fp << 1) | hp) + 1 with the decoded sign applied. */
static int read_mv_component_diff(MsacContext *const msac,
                                  CdfMvComponent *const mv_comp,
                                  const int mv_prec)
{
    const int sign = dav1d_msac_decode_bool_adapt(msac, mv_comp->sign);
    const int cl = dav1d_msac_decode_symbol_adapt16(msac, mv_comp->classes, 10);
    int up, fp = 3, hp = 1; // defaults when the fp/hp bits are not coded

    if (!cl) {
        // class 0: a single integer bit with dedicated fp/hp contexts
        up = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0);
        if (mv_prec >= 0) {  // !force_integer_mv
            fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->class0_fp[up], 3);
            if (mv_prec > 0) // allow_high_precision_mv
                hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0_hp);
        }
    } else {
        // class cl >= 1: implicit MSB plus cl extra offset bits, LSB first
        up = 1 << cl;
        for (int n = 0; n < cl; n++)
            up |= dav1d_msac_decode_bool_adapt(msac, mv_comp->classN[n]) << n;
        if (mv_prec >= 0) {  // !force_integer_mv
            fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->classN_fp, 3);
            if (mv_prec > 0) // allow_high_precision_mv
                hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->classN_hp);
        }
    }

    // integer part | 2-bit fractional part | high-precision bit, biased by 1
    const int diff = ((up << 3) | (fp << 1) | hp) + 1;

    return sign ? -diff : diff;
}
    106 
/* Decodes an MV residual and applies it to *ref_mv in place.
 * The joint symbol selects which components (vertical and/or horizontal)
 * carry a coded difference; mv_prec is forwarded to
 * read_mv_component_diff() (see its header for the meaning). The y
 * component is always decoded before x, as required by the bitstream. */
static void read_mv_residual(Dav1dTileState *const ts, mv *const ref_mv,
                             const int mv_prec)
{
    MsacContext *const msac = &ts->msac;
    const enum MVJoint mv_joint =
        dav1d_msac_decode_symbol_adapt4(msac, ts->cdf.mv.joint, N_MV_JOINTS - 1);
    if (mv_joint & MV_JOINT_V)
        ref_mv->y += read_mv_component_diff(msac, &ts->cdf.mv.comp[0], mv_prec);
    if (mv_joint & MV_JOINT_H)
        ref_mv->x += read_mv_component_diff(msac, &ts->cdf.mv.comp[1], mv_prec);
}
    118 
/* Recursively decodes the transform-split tree for one max-size transform
 * unit (inter var-tx coding). Each split decision is recorded as a bit in
 * masks[depth] at position (y_off * 4 + x_off); at the leaves, the final
 * transform size is written into the above (t->a->tx) and left (t->l.tx)
 * context arrays. t->bx/t->by are temporarily advanced to address the
 * sub-blocks and restored before returning. */
static void read_tx_tree(Dav1dTaskContext *const t,
                         const enum RectTxfmSize from,
                         const int depth, uint16_t *const masks,
                         const int x_off, const int y_off)
{
    const Dav1dFrameContext *const f = t->f;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[from];
    const int txw = t_dim->lw, txh = t_dim->lh;
    int is_split;

    // at most 2 split levels; 4x4 can't be split further
    if (depth < 2 && from > (int) TX_4X4) {
        const int cat = 2 * (TX_64X64 - t_dim->max) - depth;
        // context: whether the above/left neighbours use smaller transforms
        const int a = t->a->tx[bx4] < txw;
        const int l = t->l.tx[by4] < txh;

        is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac,
                       t->ts->cdf.m.txpart[cat][a + l]);
        if (is_split)
            masks[depth] |= 1 << (y_off * 4 + x_off);
    } else {
        is_split = 0;
    }

    if (is_split && t_dim->max > TX_8X8) {
        // recurse into the (up to four) sub-transforms, skipping those
        // that fall outside the frame
        const enum RectTxfmSize sub = t_dim->sub;
        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;

        read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 0);
        t->bx += txsw;
        if (txw >= txh && t->bx < f->bw)
            read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 1, y_off * 2 + 0);
        t->bx -= txsw;
        t->by += txsh;
        if (txh >= txw && t->by < f->bh) {
            read_tx_tree(t, sub, depth + 1, masks, x_off * 2 + 0, y_off * 2 + 1);
            t->bx += txsw;
            if (txw >= txh && t->bx < f->bw)
                read_tx_tree(t, sub, depth + 1, masks,
                             x_off * 2 + 1, y_off * 2 + 1);
            t->bx -= txsw;
        }
        t->by -= txsh;
    } else {
        // leaf: record the selected size (TX_4X4 if split at max depth 2)
        dav1d_memset_pow2[t_dim->lw](&t->a->tx[bx4], is_split ? TX_4X4 : txw);
        dav1d_memset_pow2[t_dim->lh](&t->l.tx[by4], is_split ? TX_4X4 : txh);
    }
}
    168 
    169 static int neg_deinterleave(int diff, int ref, int max) {
    170    if (!ref) return diff;
    171    if (ref >= (max - 1)) return max - diff - 1;
    172    if (2 * ref < max) {
    173        if (diff <= 2 * ref) {
    174            if (diff & 1)
    175                return ref + ((diff + 1) >> 1);
    176            else
    177                return ref - (diff >> 1);
    178        }
    179        return diff;
    180    } else {
    181        if (diff <= 2 * (max - ref - 1)) {
    182            if (diff & 1)
    183                return ref + ((diff + 1) >> 1);
    184            else
    185                return ref - (diff >> 1);
    186        }
    187        return max - (diff + 1);
    188    }
    189 }
    190 
/* Scans the above/left (and top-left/top-right corner) neighbour blocks for
 * single-reference blocks using reference `ref`, for warp-sample gathering.
 * Results are returned in masks[]: masks[0] holds one bit per matching 4x4
 * position along the top edge (bit 32 = top-right corner), masks[1] one bit
 * per position along the left edge (bit 32 = top-left corner). Scanning
 * stops after 8 matches. */
static void find_matching_ref(const Dav1dTaskContext *const t,
                              const enum EdgeFlags intra_edge_flags,
                              const int bw4, const int bh4,
                              const int w4, const int h4,
                              const int have_left, const int have_top,
                              const int ref, uint64_t masks[2])
{
    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];
    int count = 0;
    int have_topleft = have_top && have_left;
    int have_topright = imax(bw4, bh4) < 32 &&
                        have_top && t->bx + bw4 < t->ts->tiling.col_end &&
                        (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);

#define bs(rp) dav1d_block_dimensions[(rp)->bs]
#define matches(rp) ((rp)->ref.ref[0] == ref + 1 && (rp)->ref.ref[1] == -1)

    if (have_top) {
        // walk the blocks along the top edge, stepping by each block's width
        const refmvs_block *r2 = &r[-1][t->bx];
        if (matches(r2)) {
            masks[0] |= 1;
            count = 1;
        }
        int aw4 = bs(r2)[0];
        if (aw4 >= bw4) {
            // single above block covers us; corner availability depends on
            // how our block aligns inside it
            const int off = t->bx & (aw4 - 1);
            if (off) have_topleft = 0;
            if (aw4 - off > bw4) have_topright = 0;
        } else {
            unsigned mask = 1 << aw4;
            for (int x = aw4; x < w4; x += aw4) {
                r2 += aw4;
                if (matches(r2)) {
                    masks[0] |= mask;
                    if (++count >= 8) return;
                }
                aw4 = bs(r2)[0];
                mask <<= aw4;
            }
        }
    }
    if (have_left) {
        // walk the blocks along the left edge, stepping by each block's height
        /*const*/ refmvs_block *const *r2 = r;
        if (matches(&r2[0][t->bx - 1])) {
            masks[1] |= 1;
            if (++count >= 8) return;
        }
        int lh4 = bs(&r2[0][t->bx - 1])[1];
        if (lh4 >= bh4) {
            if (t->by & (lh4 - 1)) have_topleft = 0;
        } else {
            unsigned mask = 1 << lh4;
            for (int y = lh4; y < h4; y += lh4) {
                r2 += lh4;
                if (matches(&r2[0][t->bx - 1])) {
                    masks[1] |= mask;
                    if (++count >= 8) return;
                }
                lh4 = bs(&r2[0][t->bx - 1])[1];
                mask <<= lh4;
            }
        }
    }
    if (have_topleft && matches(&r[-1][t->bx - 1])) {
        masks[1] |= 1ULL << 32;
        if (++count >= 8) return;
    }
    if (have_topright && matches(&r[-1][t->bx + bw4])) {
        masks[0] |= 1ULL << 32;
    }
#undef matches
}
    263 
/* Derives local warp (affine) motion parameters for a block from the motion
 * vectors of matching neighbours found by find_matching_ref(). Gathers up to
 * 8 (source, projected) sample point pairs from the masked edge positions,
 * discards samples whose MV differs from the block MV by more than a
 * size-dependent threshold, then fits an affine model. On fit or shear
 * failure, wmp->type falls back to DAV1D_WM_TYPE_IDENTITY. */
static void derive_warpmv(const Dav1dTaskContext *const t,
                          const int bw4, const int bh4,
                          const uint64_t masks[2], const union mv mv,
                          Dav1dWarpedMotionParams *const wmp)
{
    int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
    /*const*/ refmvs_block *const *r = &t->rt.r[(t->by & 31) + 5];

/* Appends one sample pair: (dx, dy) is the 4x4 offset of the neighbour
 * relative to this block, (sx, sy) the direction towards its center;
 * positions are converted to 1/16-pel-centered coordinates, and the
 * neighbour's MV gives the projected point. */
#define add_sample(dx, dy, sx, sy, rp) do { \
    pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
    pts[np][0][1] = 16 * (2 * dy + sy * bs(rp)[1]) - 8; \
    pts[np][1][0] = pts[np][0][0] + (rp)->mv.mv[0].x; \
    pts[np][1][1] = pts[np][0][1] + (rp)->mv.mv[0].y; \
    np++; \
} while (0)

    // use masks[] to find the projectable motion vectors in the edges
    if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
        // single above block, no top-left: sample relative to its origin
        const int off = t->bx & (bs(&r[-1][t->bx])[0] - 1);
        add_sample(-off, 0, 1, -1, &r[-1][t->bx]);
    } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
        const int tz = ctz(xmask);
        off += tz;
        xmask >>= tz;
        add_sample(off, 0, 1, -1, &r[-1][t->bx + off]);
        xmask &= ~1;
    }
    if (np < 8 && masks[1] == 1) {
        // single left block, no top-left
        const int off = t->by & (bs(&r[0][t->bx - 1])[1] - 1);
        add_sample(0, -off, -1, 1, &r[-off][t->bx - 1]);
    } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
        const int tz = ctz(ymask);
        off += tz;
        ymask >>= tz;
        add_sample(0, off, -1, 1, &r[off][t->bx - 1]);
        ymask &= ~1;
    }
    if (np < 8 && masks[1] >> 32) // top/left
        add_sample(0, 0, -1, -1, &r[-1][t->bx - 1]);
    if (np < 8 && masks[0] >> 32) // top/right
        add_sample(bw4, 0, 1, -1, &r[-1][t->bx + bw4]);
    assert(np > 0 && np <= 8);
#undef bs

    // select according to motion vector difference against a threshold
    int mvd[8], ret = 0;
    const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
    for (int i = 0; i < np; i++) {
        mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
                 abs(pts[i][1][1] - pts[i][0][1] - mv.y);
        if (mvd[i] > thresh)
            mvd[i] = -1; // mark as discarded
        else
            ret++;
    }
    if (!ret) {
        // no sample survived; fall back to using just the first one
        ret = 1;
    } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
        // compact the surviving samples to the front of pts[]/mvd[]
        while (mvd[i] != -1) i++;
        while (mvd[j] == -1) j--;
        assert(i != j);
        if (i > j) break;
        // replace the discarded samples;
        mvd[i] = mvd[j];
        memcpy(pts[i], pts[j], sizeof(*pts));
    }

    if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
        !dav1d_get_shear_params(wmp))
    {
        wmp->type = DAV1D_WM_TYPE_AFFINE;
    } else
        wmp->type = DAV1D_WM_TYPE_IDENTITY;
}
    338 
    339 static inline int findoddzero(const uint8_t *buf, int len) {
    340    for (int n = 0; n < len; n++)
    341        if (!buf[n * 2]) return 1;
    342    return 0;
    343 }
    344 
// meant to be SIMD'able, so that theoretical complexity of this function
// times block size goes from w4*h4 to w4+h4-1
// Processes the i'th anti-diagonal ("wave-front") of the palette index map:
// for each position n on the diagonal (columns j = first down to last), it
// derives the color-context index ctx[n] from the already-decoded top/left/
// top-left neighbours and a neighbour-priority symbol ordering order[n][0..7]
// (most likely colors first, remaining palette indices appended in
// ascending order). Positions on the block border lack a top or left
// neighbour and always get ctx 0.
static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
                          const int i, const int first, const int last,
                          uint8_t (*const order)[8], uint8_t *const ctx)
{
    int have_top = i > first;

    assert(pal_idx);
    pal_idx += first + (i - first) * stride;
    // walk down-left along the diagonal: each step is one row down (+stride)
    // and one column left (-1)
    for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
        const int have_left = j > 0;

        assert(have_left || have_top);

// appends color v to order[n] and marks it as placed in `mask`
#define add(v_in) do { \
        const int v = v_in; \
        assert((unsigned)v < 8U); \
        order[n][o_idx++] = v; \
        mask |= 1 << v; \
    } while (0)

        unsigned mask = 0;
        int o_idx = 0;
        if (!have_left) {
            ctx[n] = 0;
            add(pal_idx[-stride]); // only the top neighbour exists
        } else if (!have_top) {
            ctx[n] = 0;
            add(pal_idx[-1]); // only the left neighbour exists
        } else {
            const int l = pal_idx[-1], t = pal_idx[-stride], tl = pal_idx[-(stride + 1)];
            const int same_t_l = t == l;
            const int same_t_tl = t == tl;
            const int same_l_tl = l == tl;
            const int same_all = same_t_l & same_t_tl & same_l_tl;

            // context and candidate order depend on which neighbours agree
            if (same_all) {
                ctx[n] = 4;
                add(t);
            } else if (same_t_l) {
                ctx[n] = 3;
                add(t);
                add(tl);
            } else if (same_t_tl | same_l_tl) {
                ctx[n] = 2;
                add(tl);
                add(same_t_tl ? l : t);
            } else {
                ctx[n] = 1;
                add(imin(t, l));
                add(imax(t, l));
                add(tl);
            }
        }
        // append all colors not yet placed, in ascending index order
        for (unsigned m = 1, bit = 0; m < 0x100; m <<= 1, bit++)
            if (!(mask & m))
                order[n][o_idx++] = bit;
        assert(o_idx == 8);
#undef add
    }
}
    413 
/* Decodes the palette index map for plane type pl of a w4*4 x h4*4 block.
 * The first index is coded with a uniform distribution; the remaining
 * indices are decoded along anti-diagonals ("wave-fronts"), each with a
 * neighbour-derived context and symbol reordering from order_palette().
 * The temporary map (stride bw4*4) is then finalized into pal_idx via the
 * pal_idx_finish() DSP function. */
static void read_pal_indices(Dav1dTaskContext *const t,
                             uint8_t *const pal_idx,
                             const int pal_sz, const int pl,
                             const int w4, const int h4,
                             const int bw4, const int bh4)
{
    Dav1dTileState *const ts = t->ts;
    const ptrdiff_t stride = bw4 * 4;
    assert(pal_idx);
    uint8_t *const pal_tmp = t->scratch.pal_idx_uv;
    // top-left sample: uniform over the palette size
    pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, pal_sz);
    uint16_t (*const color_map_cdf)[8] =
        ts->cdf.m.color_map[pl][pal_sz - 2];
    uint8_t (*const order)[8] = t->scratch.pal_order;
    uint8_t *const ctx = t->scratch.pal_ctx;
    for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
        // top/left-to-bottom/right diagonals ("wave-front")
        const int first = imin(i, w4 * 4 - 1);
        const int last = imax(0, i - h4 * 4 + 1);
        order_palette(pal_tmp, stride, i, first, last, order, ctx);
        for (int j = first, m = 0; j >= last; j--, m++) {
            const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                      color_map_cdf[ctx[m]], pal_sz - 1);
            // map the decoded rank back to a palette index via order[]
            pal_tmp[(i - j) * stride + j] = order[m][color_idx];
        }
    }

    t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4,
                                 w4 * 4, h4 * 4);
}
    444 
/* Determines the transform sizes for an inter block: either fixed sizes
 * (lossless / skip / non-switchable txfm mode) or a decoded variable
 * transform-split tree (read_tx_tree()). Fills b->max_ytx, b->uvtx and the
 * split masks b->tx_split0/1, and updates the above/left tx context arrays
 * where read_tx_tree() does not do it already. */
static void read_vartx_tree(Dav1dTaskContext *const t,
                            Av1Block *const b, const enum BlockSize bs,
                            const int bx4, const int by4)
{
    const Dav1dFrameContext *const f = t->f;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];

    // var-tx tree coding
    uint16_t tx_split[2] = { 0 };
    b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
    if (!b->skip && (f->frame_hdr->segmentation.lossless[b->seg_id] ||
                     b->max_ytx == TX_4X4))
    {
        // lossless (or already-minimal) blocks always use 4x4 transforms
        b->max_ytx = b->uvtx = TX_4X4;
        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
            dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], TX_4X4);
            dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], TX_4X4);
        }
    } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
        // no tree coded: use the maximum transform size for this block size
        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
            dav1d_memset_pow2[b_dim[2]](&t->a->tx[bx4], b_dim[2 + 0]);
            dav1d_memset_pow2[b_dim[3]](&t->l.tx[by4], b_dim[2 + 1]);
        }
        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
    } else {
        // decode one split tree per max-size transform unit in the block
        assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
        int y, x, y_off, x_off;
        const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
        for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
            for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
                read_tx_tree(t, b->max_ytx, 0, tx_split, x_off, y_off);
                // contexts are updated inside read_tx_tree()
                t->bx += ytx->w;
            }
            t->bx -= x;
            t->by += ytx->h;
        }
        t->by -= y;
        if (DEBUG_BLOCK_INFO)
            printf("Post-vartxtree[%x/%x]: r=%d\n",
                   tx_split[0], tx_split[1], t->ts->msac.rng);
        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
    }
    // depth-0 splits only use bits 0,1,4,5 (2x2 grid of 32x32 units max)
    assert(!(tx_split[0] & ~0x33));
    b->tx_split0 = (uint8_t)tx_split[0];
    b->tx_split1 = tx_split[1];
}
    493 
    494 static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
    495                                            const int by, const int bx,
    496                                            const int w4, int h4,
    497                                            const uint8_t *ref_seg_map,
    498                                            const ptrdiff_t stride)
    499 {
    500    assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
    501 
    502    unsigned seg_id = 8;
    503    ref_seg_map += by * stride + bx;
    504    do {
    505        for (int x = 0; x < w4; x++)
    506            seg_id = imin(seg_id, ref_seg_map[x]);
    507        ref_seg_map += stride;
    508    } while (--h4 > 0 && seg_id);
    509    assert(seg_id < 8);
    510 
    511    return seg_id;
    512 }
    513 
/* Replicates a single-reference inter block's ref/mv/size info across all
 * 4x4 units it covers in the refmvs grid, for use as spatial MV candidates
 * by later blocks. mf bit 0 is set for GLOBALMV on blocks >= 8x8, bit 1
 * for NEWMV. ref[1] is 0 (intra) for inter-intra blocks, -1 otherwise. */
static inline void splat_oneref_mv(const Dav1dContext *const c,
                                   Dav1dTaskContext *const t,
                                   const enum BlockSize bs,
                                   const Av1Block *const b,
                                   const int bw4, const int bh4)
{
    const enum InterPredMode mode = b->inter_mode;
    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
        .ref.ref = { b->ref[0] + 1, b->interintra_type ? 0 : -1 },
        .mv.mv[0] = b->mv[0],
        .bs = bs,
        .mf = (mode == GLOBALMV && imin(bw4, bh4) >= 2) | ((mode == NEWMV) * 2),
    };
    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
}
    529 
/* Replicates an intra-block-copy block's mv/size info across the covered
 * 4x4 units of the refmvs grid. IntraBC uses reference index 0 (the current
 * frame) and no mode flags. */
static inline void splat_intrabc_mv(const Dav1dContext *const c,
                                    Dav1dTaskContext *const t,
                                    const enum BlockSize bs,
                                    const Av1Block *const b,
                                    const int bw4, const int bh4)
{
    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
        .ref.ref = { 0, -1 },
        .mv.mv[0] = b->mv[0],
        .bs = bs,
        .mf = 0,
    };
    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
}
    544 
/* Replicates a compound (two-reference) inter block's refs/mvs/size across
 * the covered 4x4 units of the refmvs grid. mf bit 0 is set for
 * GLOBALMV_GLOBALMV; bit 1 for the compound modes selected by the 0xbc
 * mode-bit mask (presumably the NEWMV-containing modes — see the
 * CompInterPredMode enum). Compound blocks are always >= 8x8. */
static inline void splat_tworef_mv(const Dav1dContext *const c,
                                   Dav1dTaskContext *const t,
                                   const enum BlockSize bs,
                                   const Av1Block *const b,
                                   const int bw4, const int bh4)
{
    assert(bw4 >= 2 && bh4 >= 2);
    const enum CompInterPredMode mode = b->inter_mode;
    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
        .ref.ref = { b->ref[0] + 1, b->ref[1] + 1 },
        .mv.mv = { b->mv[0], b->mv[1] },
        .bs = bs,
        .mf = (mode == GLOBALMV_GLOBALMV) | !!((1 << mode) & (0xbc)) * 2,
    };
    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
}
    561 
/* Marks the covered 4x4 units of the refmvs grid as intra: reference 0
 * with an invalid MV, so later blocks skip them as MV candidates. */
static inline void splat_intraref(const Dav1dContext *const c,
                                  Dav1dTaskContext *const t,
                                  const enum BlockSize bs,
                                  const int bw4, const int bh4)
{
    const refmvs_block ALIGN(tmpl, 16) = (refmvs_block) {
        .ref.ref = { 0, -1 },
        .mv.mv[0].n = INVALID_MV,
        .bs = bs,
        .mf = 0,
    };
    c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
}
    575 
/* Raises *dst to the lowest reference-frame row that translational MC for
 * rows [by4, by4+bh4) with vertical MV mvy can touch in this plane
 * (ss_ver = vertical chroma subsampling). Used for inter-frame dependency
 * tracking in frame threading. The unscaled path adds 4 extra rows when
 * the MV has a fractional part (subpel filter reach); the scaled path
 * applies the reference-scaling step/offset equations. */
static void mc_lowest_px(int *const dst, const int by4, const int bh4,
                         const int mvy, const int ss_ver,
                         const struct ScalableMotionParams *const smp)
{
    const int v_mul = 4 >> ss_ver;
    if (!smp->scale) {
        const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver);
        *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy);
    } else {
        // scaled reference: map the top row through the scale factor
        // (rounded, sign-preserving), then step to the bottom row
        int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver);
        const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8;
        y = apply_sign64((int)((llabs(tmp) + 128) >> 8), tmp) + 32;
        const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4;
        *dst = imax(*dst, bottom);
    }
}
    592 
/* Like mc_lowest_px(), but for affine/warped motion: evaluates the warp
 * matrix at the centers of the bottom-left and bottom-right 8x8 sub-blocks
 * (warp MVs are constant per 8x8 unit) and raises *dst to the lowest
 * reference row either can touch, including 8 rows of warp filter reach. */
static ALWAYS_INLINE void affine_lowest_px(Dav1dTaskContext *const t, int *const dst,
                                           const uint8_t *const b_dim,
                                           const Dav1dWarpedMotionParams *const wmp,
                                           const int ss_ver, const int ss_hor)
{
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
    const int32_t *const mat = wmp->matrix;
    const int y = b_dim[1] * v_mul - 8; // lowest y

    const int src_y = t->by * 4 + ((y + 4) << ss_ver);
    const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
    // check left- and right-most blocks
    for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) {
        // calculate transformation relative to center of 8x8 block in
        // luma pixel units
        const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
        const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
        const int dy = (int) (mvy >> 16) - 4;
        *dst = imax(*dst, dy + 4 + 8);
    }
}
    615 
/* Luma specialization of affine_lowest_px() (no subsampling). Kept
 * NOINLINE so the ALWAYS_INLINE worker is instantiated once per plane
 * configuration rather than at every call site. */
static NOINLINE void affine_lowest_px_luma(Dav1dTaskContext *const t, int *const dst,
                                           const uint8_t *const b_dim,
                                           const Dav1dWarpedMotionParams *const wmp)
{
    affine_lowest_px(t, dst, b_dim, wmp, 0, 0);
}
    622 
/* Chroma specialization of affine_lowest_px(): picks the subsampling
 * factors from the frame's pixel layout (I444 behaves like luma; I422/I420
 * are horizontally subsampled, I420 also vertically). Not valid for I400. */
static NOINLINE void affine_lowest_px_chroma(Dav1dTaskContext *const t, int *const dst,
                                             const uint8_t *const b_dim,
                                             const Dav1dWarpedMotionParams *const wmp)
{
    const Dav1dFrameContext *const f = t->f;
    assert(f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400);
    if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444)
        affine_lowest_px_luma(t, dst, b_dim, wmp);
    else
        affine_lowest_px(t, dst, b_dim, wmp, f->cur.p.layout & DAV1D_PIXEL_LAYOUT_I420, 1);
}
    634 
/* For OBMC (overlapped block motion compensation), raises
 * dst[ref][is_chroma] for every above/left neighbour inter block whose
 * prediction overlaps this block, mirroring the neighbour-selection rules
 * of the OBMC reconstruction path (up to 4 neighbours per edge; chroma
 * only participates for sufficiently large blocks). */
static void obmc_lowest_px(Dav1dTaskContext *const t,
                           int (*const dst)[2], const int is_chroma,
                           const uint8_t *const b_dim,
                           const int bx4, const int by4, const int w4, const int h4)
{
    assert(!(t->bx & 1) && !(t->by & 1));
    const Dav1dFrameContext *const f = t->f;
    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
    const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;

    // above neighbours (skipped at the tile top edge, and for chroma on
    // small blocks)
    if (t->by > t->ts->tiling.row_start &&
        (!is_chroma || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
    {
        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];

            if (a_r->ref.ref[0] > 0) { // inter neighbour only
                const int oh4 = imin(b_dim[1], 16) >> 1;
                mc_lowest_px(&dst[a_r->ref.ref[0] - 1][is_chroma], t->by,
                             (oh4 * 3 + 3) >> 2, a_r->mv.mv[0].y, ss_ver,
                             &f->svc[a_r->ref.ref[0] - 1][1]);
                i++;
            }
            x += imax(a_b_dim[0], 2);
        }
    }

    // left neighbours (skipped at the tile left edge)
    if (t->bx > t->ts->tiling.col_start)
        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];

            if (l_r->ref.ref[0] > 0) { // inter neighbour only
                const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
                mc_lowest_px(&dst[l_r->ref.ref[0] - 1][is_chroma],
                             t->by + y, oh4, l_r->mv.mv[0].y, ss_ver,
                             &f->svc[l_r->ref.ref[0] - 1][1]);
                i++;
            }
            y += imax(l_b_dim[1], 2);
        }
}
    682 
    683 static int decode_b(Dav1dTaskContext *const t,
    684                    const enum BlockLevel bl,
    685                    const enum BlockSize bs,
    686                    const enum BlockPartition bp,
    687                    const enum EdgeFlags intra_edge_flags) {
    688    Dav1dTileState *const ts = t->ts;
    689    const Dav1dFrameContext *const f = t->f;
    690    Av1Block b_mem, *const b = t->frame_thread.pass ?
    691        &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
    692    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    693    const int bx4 = t->bx & 31, by4 = t->by & 31;
    694    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    695    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    696    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    697    const int bw4 = b_dim[0], bh4 = b_dim[1];
    698    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    699    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
    700    const int have_left = t->bx > ts->tiling.col_start;
    701    const int have_top = t->by > ts->tiling.row_start;
    702    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
    703                           (bw4 > ss_hor || t->bx & 1) &&
    704                           (bh4 > ss_ver || t->by & 1);
    705 
    706    if (t->frame_thread.pass == 2) {
    707        if (b->intra) {
    708            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
    709 
    710            const enum IntraPredMode y_mode_nofilt =
    711                b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
    712 #define set_ctx(rep_macro) \
    713            rep_macro(edge->mode, off, y_mode_nofilt); \
    714            rep_macro(edge->intra, off, 1)
    715            BlockContext *edge = t->a;
    716            for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
    717                case_set(b_dim[2 + i]);
    718            }
    719 #undef set_ctx
    720            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
    721                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
    722                for (int x = 0; x < bw4; x++) {
    723                    r[x].ref.ref[0] = 0;
    724                    r[x].bs = bs;
    725                }
    726                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
    727                for (int y = 0; y < bh4 - 1; y++) {
    728                    rr[y][t->bx + bw4 - 1].ref.ref[0] = 0;
    729                    rr[y][t->bx + bw4 - 1].bs = bs;
    730                }
    731            }
    732 
    733            if (has_chroma) {
    734                uint8_t uv_mode = b->uv_mode;
    735                dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode);
    736                dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode);
    737            }
    738        } else {
    739            if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ &&
    740                b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
    741            {
    742                if (b->matrix[0] == INT16_MIN) {
    743                    t->warpmv.type = DAV1D_WM_TYPE_IDENTITY;
    744                } else {
    745                    t->warpmv.type = DAV1D_WM_TYPE_AFFINE;
    746                    t->warpmv.matrix[2] = b->matrix[0] + 0x10000;
    747                    t->warpmv.matrix[3] = b->matrix[1];
    748                    t->warpmv.matrix[4] = b->matrix[2];
    749                    t->warpmv.matrix[5] = b->matrix[3] + 0x10000;
    750                    dav1d_set_affine_mv2d(bw4, bh4, b->mv2d, &t->warpmv,
    751                                          t->bx, t->by);
    752                    dav1d_get_shear_params(&t->warpmv);
    753 #define signabs(v) v < 0 ? '-' : ' ', abs(v)
    754                    if (DEBUG_BLOCK_INFO)
    755                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
    756                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, mv=y:%d,x:%d\n",
    757                               signabs(t->warpmv.matrix[0]),
    758                               signabs(t->warpmv.matrix[1]),
    759                               signabs(t->warpmv.matrix[2]),
    760                               signabs(t->warpmv.matrix[3]),
    761                               signabs(t->warpmv.matrix[4]),
    762                               signabs(t->warpmv.matrix[5]),
    763                               signabs(t->warpmv.u.p.alpha),
    764                               signabs(t->warpmv.u.p.beta),
    765                               signabs(t->warpmv.u.p.gamma),
    766                               signabs(t->warpmv.u.p.delta),
    767                               b->mv2d.y, b->mv2d.x);
    768 #undef signabs
    769                }
    770            }
    771            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
    772 
    773            const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
    774            BlockContext *edge = t->a;
    775            for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
    776 #define set_ctx(rep_macro) \
    777                rep_macro(edge->filter[0], off, filter[0]); \
    778                rep_macro(edge->filter[1], off, filter[1]); \
    779                rep_macro(edge->intra, off, 0)
    780                case_set(b_dim[2 + i]);
    781 #undef set_ctx
    782            }
    783 
    784            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
    785                refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx];
    786                for (int x = 0; x < bw4; x++) {
    787                    r[x].ref.ref[0] = b->ref[0] + 1;
    788                    r[x].mv.mv[0] = b->mv[0];
    789                    r[x].bs = bs;
    790                }
    791                refmvs_block *const *rr = &t->rt.r[(t->by & 31) + 5];
    792                for (int y = 0; y < bh4 - 1; y++) {
    793                    rr[y][t->bx + bw4 - 1].ref.ref[0] = b->ref[0] + 1;
    794                    rr[y][t->bx + bw4 - 1].mv.mv[0] = b->mv[0];
    795                    rr[y][t->bx + bw4 - 1].bs = bs;
    796                }
    797            }
    798 
    799            if (has_chroma) {
    800                dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
    801                dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
    802            }
    803        }
    804        return 0;
    805    }
    806 
    807    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    808 
    809    b->bl = bl;
    810    b->bp = bp;
    811    b->bs = bs;
    812 
    813    const Dav1dSegmentationData *seg = NULL;
    814 
    815    // segment_id (if seg_feature for skip/ref/gmv is enabled)
    816    int seg_pred = 0;
    817    if (f->frame_hdr->segmentation.enabled) {
    818        if (!f->frame_hdr->segmentation.update_map) {
    819            if (f->prev_segmap) {
    820                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
    821                                                       f->prev_segmap,
    822                                                       f->b4_stride);
    823                if (seg_id >= 8) return -1;
    824                b->seg_id = seg_id;
    825            } else {
    826                b->seg_id = 0;
    827            }
    828            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
    829        } else if (f->frame_hdr->segmentation.seg_data.preskip) {
    830            if (f->frame_hdr->segmentation.temporal &&
    831                (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
    832                                ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
    833                                t->l.seg_pred[by4]])))
    834            {
    835                // temporal predicted seg_id
    836                if (f->prev_segmap) {
    837                    unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
    838                                                           w4, h4,
    839                                                           f->prev_segmap,
    840                                                           f->b4_stride);
    841                    if (seg_id >= 8) return -1;
    842                    b->seg_id = seg_id;
    843                } else {
    844                    b->seg_id = 0;
    845                }
    846            } else {
    847                int seg_ctx;
    848                const unsigned pred_seg_id =
    849                    get_cur_frame_segid(t->by, t->bx, have_top, have_left,
    850                                        &seg_ctx, f->cur_segmap, f->b4_stride);
    851                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
    852                                          ts->cdf.m.seg_id[seg_ctx],
    853                                          DAV1D_MAX_SEGMENTS - 1);
    854                const unsigned last_active_seg_id =
    855                    f->frame_hdr->segmentation.seg_data.last_active_segid;
    856                b->seg_id = neg_deinterleave(diff, pred_seg_id,
    857                                             last_active_seg_id + 1);
    858                if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
    859                if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
    860            }
    861 
    862            if (DEBUG_BLOCK_INFO)
    863                printf("Post-segid[preskip;%d]: r=%d\n",
    864                       b->seg_id, ts->msac.rng);
    865 
    866            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
    867        }
    868    } else {
    869        b->seg_id = 0;
    870    }
    871 
    872    // skip_mode
    873    if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
    874        f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
    875    {
    876        const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
    877        b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac,
    878                           ts->cdf.m.skip_mode[smctx]);
    879        if (DEBUG_BLOCK_INFO)
    880            printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
    881    } else {
    882        b->skip_mode = 0;
    883    }
    884 
    885    // skip
    886    if (b->skip_mode || (seg && seg->skip)) {
    887        b->skip = 1;
    888    } else {
    889        const int sctx = t->a->skip[bx4] + t->l.skip[by4];
    890        b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
    891        if (DEBUG_BLOCK_INFO)
    892            printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
    893    }
    894 
    895    // segment_id
    896    if (f->frame_hdr->segmentation.enabled &&
    897        f->frame_hdr->segmentation.update_map &&
    898        !f->frame_hdr->segmentation.seg_data.preskip)
    899    {
    900        if (!b->skip && f->frame_hdr->segmentation.temporal &&
    901            (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
    902                            ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
    903                            t->l.seg_pred[by4]])))
    904        {
    905            // temporal predicted seg_id
    906            if (f->prev_segmap) {
    907                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
    908                                                       f->prev_segmap,
    909                                                       f->b4_stride);
    910                if (seg_id >= 8) return -1;
    911                b->seg_id = seg_id;
    912            } else {
    913                b->seg_id = 0;
    914            }
    915        } else {
    916            int seg_ctx;
    917            const unsigned pred_seg_id =
    918                get_cur_frame_segid(t->by, t->bx, have_top, have_left,
    919                                    &seg_ctx, f->cur_segmap, f->b4_stride);
    920            if (b->skip) {
    921                b->seg_id = pred_seg_id;
    922            } else {
    923                const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
    924                                          ts->cdf.m.seg_id[seg_ctx],
    925                                          DAV1D_MAX_SEGMENTS - 1);
    926                const unsigned last_active_seg_id =
    927                    f->frame_hdr->segmentation.seg_data.last_active_segid;
    928                b->seg_id = neg_deinterleave(diff, pred_seg_id,
    929                                             last_active_seg_id + 1);
    930                if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
    931            }
    932            if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
    933        }
    934 
    935        seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
    936 
    937        if (DEBUG_BLOCK_INFO)
    938            printf("Post-segid[postskip;%d]: r=%d\n",
    939                   b->seg_id, ts->msac.rng);
    940    }
    941 
    942    // cdef index
    943    if (!b->skip) {
    944        const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
    945                                           ((t->by & 16) >> 3) : 0;
    946        if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
    947            const int v = dav1d_msac_decode_bools(&ts->msac,
    948                              f->frame_hdr->cdef.n_bits);
    949            t->cur_sb_cdef_idx_ptr[idx] = v;
    950            if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
    951            if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
    952            if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
    953 
    954            if (DEBUG_BLOCK_INFO)
    955                printf("Post-cdef_idx[%d]: r=%d\n",
    956                        *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
    957        }
    958    }
    959 
    960    // delta-q/lf
    961    if (!((t->bx | t->by) & (31 >> !f->seq_hdr->sb128))) {
    962        const int prev_qidx = ts->last_qidx;
    963        const int have_delta_q = f->frame_hdr->delta.q.present &&
    964            (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
    965 
    966        uint32_t prev_delta_lf = ts->last_delta_lf.u32;
    967 
    968        if (have_delta_q) {
    969            int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
    970                                                          ts->cdf.m.delta_q, 3);
    971            if (delta_q == 3) {
    972                const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
    973                delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
    974                          1 + (1 << n_bits);
    975            }
    976            if (delta_q) {
    977                if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
    978                delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
    979            }
    980            ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
    981            if (have_delta_q && DEBUG_BLOCK_INFO)
    982                printf("Post-delta_q[%d->%d]: r=%d\n",
    983                       delta_q, ts->last_qidx, ts->msac.rng);
    984 
    985            if (f->frame_hdr->delta.lf.present) {
    986                const int n_lfs = f->frame_hdr->delta.lf.multi ?
    987                    f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
    988 
    989                for (int i = 0; i < n_lfs; i++) {
    990                    int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
    991                        ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
    992                    if (delta_lf == 3) {
    993                        const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
    994                        delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
    995                                   1 + (1 << n_bits);
    996                    }
    997                    if (delta_lf) {
    998                        if (dav1d_msac_decode_bool_equi(&ts->msac))
    999                            delta_lf = -delta_lf;
   1000                        delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
   1001                    }
   1002                    ts->last_delta_lf.i8[i] =
   1003                        iclip(ts->last_delta_lf.i8[i] + delta_lf, -63, 63);
   1004                    if (have_delta_q && DEBUG_BLOCK_INFO)
   1005                        printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
   1006                               ts->msac.rng);
   1007                }
   1008            }
   1009        }
   1010        if (ts->last_qidx == f->frame_hdr->quant.yac) {
   1011            // assign frame-wide q values to this sb
   1012            ts->dq = f->dq;
   1013        } else if (ts->last_qidx != prev_qidx) {
   1014            // find sb-specific quant parameters
   1015            init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
   1016            ts->dq = ts->dqmem;
   1017        }
   1018        if (!ts->last_delta_lf.u32) {
   1019            // assign frame-wide lf values to this sb
   1020            ts->lflvl = f->lf.lvl;
   1021        } else if (ts->last_delta_lf.u32 != prev_delta_lf) {
   1022            // find sb-specific lf lvl parameters
   1023            ts->lflvl = ts->lflvlmem;
   1024            dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf.i8);
   1025        }
   1026    }
   1027 
   1028    if (b->skip_mode) {
   1029        b->intra = 0;
   1030    } else if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
   1031        if (seg && (seg->ref >= 0 || seg->globalmv)) {
   1032            b->intra = !seg->ref;
   1033        } else {
   1034            const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
   1035                                           have_top, have_left);
   1036            b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac,
   1037                            ts->cdf.m.intra[ictx]);
   1038            if (DEBUG_BLOCK_INFO)
   1039                printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
   1040        }
   1041    } else if (f->frame_hdr->allow_intrabc) {
   1042        b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
   1043        if (DEBUG_BLOCK_INFO)
   1044            printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
   1045    } else {
   1046        b->intra = 1;
   1047    }
   1048 
   1049    // intra/inter-specific stuff
   1050    if (b->intra) {
   1051        uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ?
   1052            ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
   1053            ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
   1054                        [dav1d_intra_mode_context[t->l.mode[by4]]];
   1055        b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
   1056                                                     N_INTRA_PRED_MODES - 1);
   1057        if (DEBUG_BLOCK_INFO)
   1058            printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
   1059 
   1060        // angle delta
   1061        if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED &&
   1062            b->y_mode <= VERT_LEFT_PRED)
   1063        {
   1064            uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
   1065            const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
   1066            b->y_angle = angle - 3;
   1067        } else {
   1068            b->y_angle = 0;
   1069        }
   1070 
   1071        if (has_chroma) {
   1072            const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
   1073                cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
   1074            uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
   1075            b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
   1076                             N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
   1077            if (DEBUG_BLOCK_INFO)
   1078                printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
   1079 
   1080            b->uv_angle = 0;
   1081            if (b->uv_mode == CFL_PRED) {
   1082 #define SIGN(a) (!!(a) + ((a) > 0))
   1083                const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
   1084                                     ts->cdf.m.cfl_sign, 7) + 1;
   1085                const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
   1086                assert(sign_u == sign / 3);
   1087                if (sign_u) {
   1088                    const int ctx = (sign_u == 2) * 3 + sign_v;
   1089                    b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
   1090                                          ts->cdf.m.cfl_alpha[ctx], 15) + 1;
   1091                    if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
   1092                } else {
   1093                    b->cfl_alpha[0] = 0;
   1094                }
   1095                if (sign_v) {
   1096                    const int ctx = (sign_v == 2) * 3 + sign_u;
   1097                    b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
   1098                                          ts->cdf.m.cfl_alpha[ctx], 15) + 1;
   1099                    if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
   1100                } else {
   1101                    b->cfl_alpha[1] = 0;
   1102                }
   1103 #undef SIGN
   1104                if (DEBUG_BLOCK_INFO)
   1105                    printf("Post-uvalphas[%d/%d]: r=%d\n",
   1106                           b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng);
   1107            } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED &&
   1108                       b->uv_mode <= VERT_LEFT_PRED)
   1109            {
   1110                uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
   1111                const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
   1112                b->uv_angle = angle - 3;
   1113            }
   1114        }
   1115 
   1116        b->pal_sz[0] = b->pal_sz[1] = 0;
   1117        if (f->frame_hdr->allow_screen_content_tools &&
   1118            imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
   1119        {
   1120            const int sz_ctx = b_dim[2] + b_dim[3] - 2;
   1121            if (b->y_mode == DC_PRED) {
   1122                const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
   1123                const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
   1124                                          ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
   1125                if (DEBUG_BLOCK_INFO)
   1126                    printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
   1127                if (use_y_pal)
   1128                    f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
   1129            }
   1130 
   1131            if (has_chroma && b->uv_mode == DC_PRED) {
   1132                const int pal_ctx = b->pal_sz[0] > 0;
   1133                const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
   1134                                           ts->cdf.m.pal_uv[pal_ctx]);
   1135                if (DEBUG_BLOCK_INFO)
   1136                    printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
   1137                if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
   1138                    f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4);
   1139            }
   1140        }
   1141 
   1142        if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
   1143            imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
   1144        {
   1145            const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac,
   1146                                      ts->cdf.m.use_filter_intra[bs]);
   1147            if (is_filter) {
   1148                b->y_mode = FILTER_PRED;
   1149                b->y_angle = dav1d_msac_decode_symbol_adapt8(&ts->msac,
   1150                                 ts->cdf.m.filter_intra, 4);
   1151            }
   1152            if (DEBUG_BLOCK_INFO)
   1153                printf("Post-filterintramode[%d/%d]: r=%d\n",
   1154                       b->y_mode, b->y_angle, ts->msac.rng);
   1155        }
   1156 
   1157        if (b->pal_sz[0]) {
   1158            uint8_t *pal_idx;
   1159            if (t->frame_thread.pass) {
   1160                const int p = t->frame_thread.pass & 1;
   1161                assert(ts->frame_thread[p].pal_idx);
   1162                pal_idx = ts->frame_thread[p].pal_idx;
   1163                ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
   1164            } else
   1165                pal_idx = t->scratch.pal_idx_y;
   1166            read_pal_indices(t, pal_idx, b->pal_sz[0], 0, w4, h4, bw4, bh4);
   1167            if (DEBUG_BLOCK_INFO)
   1168                printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
   1169        }
   1170 
   1171        if (has_chroma && b->pal_sz[1]) {
   1172            uint8_t *pal_idx;
   1173            if (t->frame_thread.pass) {
   1174                const int p = t->frame_thread.pass & 1;
   1175                assert(ts->frame_thread[p].pal_idx);
   1176                pal_idx = ts->frame_thread[p].pal_idx;
   1177                ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
   1178            } else
   1179                pal_idx = t->scratch.pal_idx_uv;
   1180            read_pal_indices(t, pal_idx, b->pal_sz[1], 1, cw4, ch4, cbw4, cbh4);
   1181            if (DEBUG_BLOCK_INFO)
   1182                printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
   1183        }
   1184 
   1185        const TxfmInfo *t_dim;
   1186        if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
   1187            b->tx = b->uvtx = (int) TX_4X4;
   1188            t_dim = &dav1d_txfm_dimensions[TX_4X4];
   1189        } else {
   1190            b->tx = dav1d_max_txfm_size_for_bs[bs][0];
   1191            b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
   1192            t_dim = &dav1d_txfm_dimensions[b->tx];
   1193            if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
   1194                const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
   1195                uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
   1196                int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
   1197                                imin(t_dim->max, 2));
   1198 
   1199                while (depth--) {
   1200                    b->tx = t_dim->sub;
   1201                    t_dim = &dav1d_txfm_dimensions[b->tx];
   1202                }
   1203            }
   1204            if (DEBUG_BLOCK_INFO)
   1205                printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng);
   1206        }
   1207 
   1208        // reconstruction
   1209        if (t->frame_thread.pass == 1) {
   1210            f->bd_fn.read_coef_blocks(t, bs, b);
   1211        } else {
   1212            f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
   1213        }
   1214 
   1215        if (f->frame_hdr->loopfilter.level_y[0] ||
   1216            f->frame_hdr->loopfilter.level_y[1])
   1217        {
   1218            dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
   1219                                       (const uint8_t (*)[8][2])
   1220                                       &ts->lflvl[b->seg_id][0][0][0],
   1221                                       t->bx, t->by, f->w4, f->h4, bs,
   1222                                       b->tx, b->uvtx, f->cur.p.layout,
   1223                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
   1224                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
   1225                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
   1226        }
   1227        // update contexts
   1228        const enum IntraPredMode y_mode_nofilt =
   1229            b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
   1230        BlockContext *edge = t->a;
   1231        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
   1232            int t_lsz = ((uint8_t *) &t_dim->lw)[i]; // lw then lh
   1233 #define set_ctx(rep_macro) \
   1234            rep_macro(edge->tx_intra, off, t_lsz); \
   1235            rep_macro(edge->tx, off, t_lsz); \
   1236            rep_macro(edge->mode, off, y_mode_nofilt); \
   1237            rep_macro(edge->pal_sz, off, b->pal_sz[0]); \
   1238            rep_macro(edge->seg_pred, off, seg_pred); \
   1239            rep_macro(edge->skip_mode, off, 0); \
   1240            rep_macro(edge->intra, off, 1); \
   1241            rep_macro(edge->skip, off, b->skip); \
   1242            /* see aomedia bug 2183 for why we use luma coordinates here */ \
   1243            rep_macro(t->pal_sz_uv[i], off, (has_chroma ? b->pal_sz[1] : 0)); \
   1244            if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \
   1245                rep_macro(edge->comp_type, off, COMP_INTER_NONE); \
   1246                rep_macro(edge->ref[0], off, ((uint8_t) -1)); \
   1247                rep_macro(edge->ref[1], off, ((uint8_t) -1)); \
   1248                rep_macro(edge->filter[0], off, DAV1D_N_SWITCHABLE_FILTERS); \
   1249                rep_macro(edge->filter[1], off, DAV1D_N_SWITCHABLE_FILTERS); \
   1250            }
   1251            case_set(b_dim[2 + i]);
   1252 #undef set_ctx
   1253        }
   1254        if (b->pal_sz[0])
   1255            f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
   1256        if (has_chroma) {
   1257            uint8_t uv_mode = b->uv_mode;
   1258            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode);
   1259            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode);
   1260            if (b->pal_sz[1])
   1261                f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
   1262        }
   1263        if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
   1264            splat_intraref(f->c, t, bs, bw4, bh4);
   1265    } else if (IS_KEY_OR_INTRA(f->frame_hdr)) {
   1266        // intra block copy
   1267        refmvs_candidate mvstack[8];
   1268        int n_mvs, ctx;
   1269        dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
   1270                          (union refmvs_refpair) { .ref = { 0, -1 }},
   1271                          bs, intra_edge_flags, t->by, t->bx);
   1272 
   1273        if (mvstack[0].mv.mv[0].n)
   1274            b->mv[0] = mvstack[0].mv.mv[0];
   1275        else if (mvstack[1].mv.mv[0].n)
   1276            b->mv[0] = mvstack[1].mv.mv[0];
   1277        else {
   1278            if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
   1279                b->mv[0].y = 0;
   1280                b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048;
   1281            } else {
   1282                b->mv[0].y = -(512 << f->seq_hdr->sb128);
   1283                b->mv[0].x = 0;
   1284            }
   1285        }
   1286 
   1287        const union mv ref = b->mv[0];
   1288        read_mv_residual(ts, &b->mv[0], -1);
   1289 
   1290        // clip intrabc motion vector to decoded parts of current tile
   1291        int border_left = ts->tiling.col_start * 4;
   1292        int border_top  = ts->tiling.row_start * 4;
   1293        if (has_chroma) {
   1294            if (bw4 < 2 &&  ss_hor)
   1295                border_left += 4;
   1296            if (bh4 < 2 &&  ss_ver)
   1297                border_top  += 4;
   1298        }
   1299        int src_left   = t->bx * 4 + (b->mv[0].x >> 3);
   1300        int src_top    = t->by * 4 + (b->mv[0].y >> 3);
   1301        int src_right  = src_left + bw4 * 4;
   1302        int src_bottom = src_top  + bh4 * 4;
   1303        const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
   1304 
   1305        // check against left or right tile boundary and adjust if necessary
   1306        if (src_left < border_left) {
   1307            src_right += border_left - src_left;
   1308            src_left  += border_left - src_left;
   1309        } else if (src_right > border_right) {
   1310            src_left  -= src_right - border_right;
   1311            src_right -= src_right - border_right;
   1312        }
   1313        // check against top tile boundary and adjust if necessary
   1314        if (src_top < border_top) {
   1315            src_bottom += border_top - src_top;
   1316            src_top    += border_top - src_top;
   1317        }
   1318 
   1319        const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
   1320        const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
   1321        const int sb_size = 1 << (6 + f->seq_hdr->sb128);
   1322        // check for overlap with current superblock
   1323        if (src_bottom > sby && src_right > sbx) {
   1324            if (src_top - border_top >= src_bottom - sby) {
   1325                // if possible move src up into the previous superblock row
   1326                src_top    -= src_bottom - sby;
   1327                src_bottom -= src_bottom - sby;
   1328            } else if (src_left - border_left >= src_right - sbx) {
   1329                // if possible move src left into the previous superblock
   1330                src_left  -= src_right - sbx;
   1331                src_right -= src_right - sbx;
   1332            }
   1333        }
   1334        // move src up if it is below current superblock row
   1335        if (src_bottom > sby + sb_size) {
   1336            src_top    -= src_bottom - (sby + sb_size);
   1337            src_bottom -= src_bottom - (sby + sb_size);
   1338        }
   1339        // error out if mv still overlaps with the current superblock
   1340        if (src_bottom > sby && src_right > sbx)
   1341            return -1;
   1342 
   1343        b->mv[0].x = (src_left - t->bx * 4) * 8;
   1344        b->mv[0].y = (src_top  - t->by * 4) * 8;
   1345 
   1346        if (DEBUG_BLOCK_INFO)
   1347            printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
   1348                   b->mv[0].y, b->mv[0].x, ref.y, ref.x,
   1349                   mvstack[0].mv.mv[0].y, mvstack[0].mv.mv[0].x, ts->msac.rng);
   1350        read_vartx_tree(t, b, bs, bx4, by4);
   1351 
   1352        // reconstruction
   1353        if (t->frame_thread.pass == 1) {
   1354            f->bd_fn.read_coef_blocks(t, bs, b);
   1355            b->filter2d = FILTER_2D_BILINEAR;
   1356        } else {
   1357            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
   1358        }
   1359 
   1360        splat_intrabc_mv(f->c, t, bs, b, bw4, bh4);
   1361        BlockContext *edge = t->a;
   1362        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
   1363 #define set_ctx(rep_macro) \
   1364            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
   1365            rep_macro(edge->mode, off, DC_PRED); \
   1366            rep_macro(edge->pal_sz, off, 0); \
   1367            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
   1368            rep_macro(t->pal_sz_uv[i], off, 0); \
   1369            rep_macro(edge->seg_pred, off, seg_pred); \
   1370            rep_macro(edge->skip_mode, off, 0); \
   1371            rep_macro(edge->intra, off, 0); \
   1372            rep_macro(edge->skip, off, b->skip)
   1373            case_set(b_dim[2 + i]);
   1374 #undef set_ctx
   1375        }
   1376        if (has_chroma) {
   1377            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
   1378            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
   1379        }
   1380    } else {
   1381        // inter-specific mode/mv coding
   1382        int is_comp, has_subpel_filter;
   1383 
   1384        if (b->skip_mode) {
   1385            is_comp = 1;
   1386        } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
   1387                   f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
   1388        {
   1389            const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
   1390                                         have_top, have_left);
   1391            is_comp = dav1d_msac_decode_bool_adapt(&ts->msac,
   1392                          ts->cdf.m.comp[ctx]);
   1393            if (DEBUG_BLOCK_INFO)
   1394                printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
   1395        } else {
   1396            is_comp = 0;
   1397        }
   1398 
   1399        if (b->skip_mode) {
   1400            b->ref[0] = f->frame_hdr->skip_mode_refs[0];
   1401            b->ref[1] = f->frame_hdr->skip_mode_refs[1];
   1402            b->comp_type = COMP_INTER_AVG;
   1403            b->inter_mode = NEARESTMV_NEARESTMV;
   1404            b->drl_idx = NEAREST_DRL;
   1405            has_subpel_filter = 0;
   1406 
   1407            refmvs_candidate mvstack[8];
   1408            int n_mvs, ctx;
   1409            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
   1410                              (union refmvs_refpair) { .ref = {
   1411                                    b->ref[0] + 1, b->ref[1] + 1 }},
   1412                              bs, intra_edge_flags, t->by, t->bx);
   1413 
   1414            b->mv[0] = mvstack[0].mv.mv[0];
   1415            b->mv[1] = mvstack[0].mv.mv[1];
   1416            fix_mv_precision(f->frame_hdr, &b->mv[0]);
   1417            fix_mv_precision(f->frame_hdr, &b->mv[1]);
   1418            if (DEBUG_BLOCK_INFO)
   1419                printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
   1420                       b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
   1421                       b->ref[0], b->ref[1]);
   1422        } else if (is_comp) {
   1423            const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
   1424                                                 have_top, have_left);
   1425            if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1426                    ts->cdf.m.comp_dir[dir_ctx]))
   1427            {
   1428                // bidir - first reference (fw)
   1429                const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4,
   1430                                                     have_top, have_left);
   1431                if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1432                        ts->cdf.m.comp_fwd_ref[0][ctx1]))
   1433                {
   1434                    const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4,
   1435                                                           have_top, have_left);
   1436                    b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
   1437                                        ts->cdf.m.comp_fwd_ref[2][ctx2]);
   1438                } else {
   1439                    const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4,
   1440                                                           have_top, have_left);
   1441                    b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
   1442                                    ts->cdf.m.comp_fwd_ref[1][ctx2]);
   1443                }
   1444 
   1445                // second reference (bw)
   1446                const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4,
   1447                                                     have_top, have_left);
   1448                if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1449                        ts->cdf.m.comp_bwd_ref[0][ctx3]))
   1450                {
   1451                    b->ref[1] = 6;
   1452                } else {
   1453                    const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4,
   1454                                                           have_top, have_left);
   1455                    b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
   1456                                        ts->cdf.m.comp_bwd_ref[1][ctx4]);
   1457                }
   1458            } else {
   1459                // unidir
   1460                const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4,
   1461                                                     have_top, have_left);
   1462                if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1463                        ts->cdf.m.comp_uni_ref[0][uctx_p]))
   1464                {
   1465                    b->ref[0] = 4;
   1466                    b->ref[1] = 6;
   1467                } else {
   1468                    const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4,
   1469                                                           have_top, have_left);
   1470                    b->ref[0] = 0;
   1471                    b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac,
   1472                                        ts->cdf.m.comp_uni_ref[1][uctx_p1]);
   1473                    if (b->ref[1] == 2) {
   1474                        const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4,
   1475                                                               have_top, have_left);
   1476                        b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac,
   1477                                         ts->cdf.m.comp_uni_ref[2][uctx_p2]);
   1478                    }
   1479                }
   1480            }
   1481            if (DEBUG_BLOCK_INFO)
   1482                printf("Post-refs[%d/%d]: r=%d\n",
   1483                       b->ref[0], b->ref[1], ts->msac.rng);
   1484 
   1485            refmvs_candidate mvstack[8];
   1486            int n_mvs, ctx;
   1487            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
   1488                              (union refmvs_refpair) { .ref = {
   1489                                    b->ref[0] + 1, b->ref[1] + 1 }},
   1490                              bs, intra_edge_flags, t->by, t->bx);
   1491 
   1492            b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
   1493                                ts->cdf.m.comp_inter_mode[ctx],
   1494                                N_COMP_INTER_PRED_MODES - 1);
   1495            if (DEBUG_BLOCK_INFO)
   1496                printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
   1497                       b->inter_mode, ctx, n_mvs, ts->msac.rng);
   1498 
   1499            const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
   1500            b->drl_idx = NEAREST_DRL;
   1501            if (b->inter_mode == NEWMV_NEWMV) {
   1502                if (n_mvs > 1) { // NEARER, NEAR or NEARISH
   1503                    const int drl_ctx_v1 = get_drl_context(mvstack, 0);
   1504                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1505                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
   1506                    if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
   1507                        const int drl_ctx_v2 = get_drl_context(mvstack, 1);
   1508                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1509                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
   1510                    }
   1511                    if (DEBUG_BLOCK_INFO)
   1512                        printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
   1513                               b->drl_idx, n_mvs, ts->msac.rng);
   1514                }
   1515            } else if (im[0] == NEARMV || im[1] == NEARMV) {
   1516                b->drl_idx = NEARER_DRL;
   1517                if (n_mvs > 2) { // NEAR or NEARISH
   1518                    const int drl_ctx_v2 = get_drl_context(mvstack, 1);
   1519                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1520                                      ts->cdf.m.drl_bit[drl_ctx_v2]);
   1521                    if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
   1522                        const int drl_ctx_v3 = get_drl_context(mvstack, 2);
   1523                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1524                                          ts->cdf.m.drl_bit[drl_ctx_v3]);
   1525                    }
   1526                    if (DEBUG_BLOCK_INFO)
   1527                        printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
   1528                               b->drl_idx, n_mvs, ts->msac.rng);
   1529                }
   1530            }
   1531            assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
   1532 
   1533 #define assign_comp_mv(idx) \
   1534            switch (im[idx]) { \
   1535            case NEARMV: \
   1536            case NEARESTMV: \
   1537                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
   1538                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
   1539                break; \
   1540            case GLOBALMV: \
   1541                has_subpel_filter |= \
   1542                    f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
   1543                b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
   1544                                        t->bx, t->by, bw4, bh4, f->frame_hdr); \
   1545                break; \
   1546            case NEWMV: \
   1547                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
   1548                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \
   1549                read_mv_residual(ts, &b->mv[idx], mv_prec); \
   1550                break; \
   1551            }
   1552            has_subpel_filter = imin(bw4, bh4) == 1 ||
   1553                                b->inter_mode != GLOBALMV_GLOBALMV;
   1554            assign_comp_mv(0);
   1555            assign_comp_mv(1);
   1556 #undef assign_comp_mv
   1557            if (DEBUG_BLOCK_INFO)
   1558                printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
   1559                       b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
   1560                       ts->msac.rng);
   1561 
   1562            // jnt_comp vs. seg vs. wedge
   1563            int is_segwedge = 0;
   1564            if (f->seq_hdr->masked_compound) {
   1565                const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
   1566 
   1567                is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac,
   1568                                  ts->cdf.m.mask_comp[mask_ctx]);
   1569                if (DEBUG_BLOCK_INFO)
   1570                    printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
   1571                           is_segwedge, mask_ctx, ts->msac.rng);
   1572            }
   1573 
   1574            if (!is_segwedge) {
   1575                if (f->seq_hdr->jnt_comp) {
   1576                    const int jnt_ctx =
   1577                        get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
   1578                                         f->cur.frame_hdr->frame_offset,
   1579                                         f->refp[b->ref[0]].p.frame_hdr->frame_offset,
   1580                                         f->refp[b->ref[1]].p.frame_hdr->frame_offset,
   1581                                         t->a, &t->l, by4, bx4);
   1582                    b->comp_type = COMP_INTER_WEIGHTED_AVG +
   1583                                   dav1d_msac_decode_bool_adapt(&ts->msac,
   1584                                       ts->cdf.m.jnt_comp[jnt_ctx]);
   1585                    if (DEBUG_BLOCK_INFO)
   1586                        printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
   1587                               b->comp_type == COMP_INTER_AVG,
   1588                               jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
   1589                               t->l.comp_type[by4], t->l.ref[0][by4],
   1590                               ts->msac.rng);
   1591                } else {
   1592                    b->comp_type = COMP_INTER_AVG;
   1593                }
   1594            } else {
   1595                if (wedge_allowed_mask & (1 << bs)) {
   1596                    const int ctx = dav1d_wedge_ctx_lut[bs];
   1597                    b->comp_type = COMP_INTER_WEDGE -
   1598                                   dav1d_msac_decode_bool_adapt(&ts->msac,
   1599                                       ts->cdf.m.wedge_comp[ctx]);
   1600                    if (b->comp_type == COMP_INTER_WEDGE)
   1601                        b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
   1602                                           ts->cdf.m.wedge_idx[ctx], 15);
   1603                } else {
   1604                    b->comp_type = COMP_INTER_SEG;
   1605                }
   1606                b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac);
   1607                if (DEBUG_BLOCK_INFO)
   1608                    printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
   1609                           b->comp_type == COMP_INTER_WEDGE,
   1610                           b->wedge_idx, b->mask_sign, ts->msac.rng);
   1611            }
   1612        } else {
   1613            b->comp_type = COMP_INTER_NONE;
   1614 
   1615            // ref
   1616            if (seg && seg->ref > 0) {
   1617                b->ref[0] = seg->ref - 1;
   1618            } else if (seg && (seg->globalmv || seg->skip)) {
   1619                b->ref[0] = 0;
   1620            } else {
   1621                const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
   1622                                                 have_top, have_left);
   1623                if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1624                                                 ts->cdf.m.ref[0][ctx1]))
   1625                {
   1626                    const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
   1627                                                       have_top, have_left);
   1628                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1629                                                     ts->cdf.m.ref[1][ctx2]))
   1630                    {
   1631                        b->ref[0] = 6;
   1632                    } else {
   1633                        const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
   1634                                                           have_top, have_left);
   1635                        b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
   1636                                            ts->cdf.m.ref[5][ctx3]);
   1637                    }
   1638                } else {
   1639                    const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
   1640                                                       have_top, have_left);
   1641                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1642                                                     ts->cdf.m.ref[2][ctx2]))
   1643                    {
   1644                        const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
   1645                                                           have_top, have_left);
   1646                        b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
   1647                                            ts->cdf.m.ref[4][ctx3]);
   1648                    } else {
   1649                        const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
   1650                                                           have_top, have_left);
   1651                        b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
   1652                                        ts->cdf.m.ref[3][ctx3]);
   1653                    }
   1654                }
   1655                if (DEBUG_BLOCK_INFO)
   1656                    printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
   1657            }
   1658            b->ref[1] = -1;
   1659 
   1660            refmvs_candidate mvstack[8];
   1661            int n_mvs, ctx;
   1662            dav1d_refmvs_find(&t->rt, mvstack, &n_mvs, &ctx,
   1663                              (union refmvs_refpair) { .ref = { b->ref[0] + 1, -1 }},
   1664                              bs, intra_edge_flags, t->by, t->bx);
   1665 
   1666            // mode parsing and mv derivation from ref_mvs
   1667            if ((seg && (seg->skip || seg->globalmv)) ||
   1668                dav1d_msac_decode_bool_adapt(&ts->msac,
   1669                                             ts->cdf.m.newmv_mode[ctx & 7]))
   1670            {
   1671                if ((seg && (seg->skip || seg->globalmv)) ||
   1672                    !dav1d_msac_decode_bool_adapt(&ts->msac,
   1673                         ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
   1674                {
   1675                    b->inter_mode = GLOBALMV;
   1676                    b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
   1677                                          t->bx, t->by, bw4, bh4, f->frame_hdr);
   1678                    has_subpel_filter = imin(bw4, bh4) == 1 ||
   1679                        f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
   1680                } else {
   1681                    has_subpel_filter = 1;
   1682                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
   1683                            ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
   1684                    { // NEAREST, NEARER, NEAR or NEARISH
   1685                        b->inter_mode = NEARMV;
   1686                        b->drl_idx = NEARER_DRL;
   1687                        if (n_mvs > 2) { // NEARER, NEAR or NEARISH
   1688                            const int drl_ctx_v2 = get_drl_context(mvstack, 1);
   1689                            b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1690                                              ts->cdf.m.drl_bit[drl_ctx_v2]);
   1691                            if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
   1692                                const int drl_ctx_v3 =
   1693                                    get_drl_context(mvstack, 2);
   1694                                b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1695                                                  ts->cdf.m.drl_bit[drl_ctx_v3]);
   1696                            }
   1697                        }
   1698                    } else {
   1699                        b->inter_mode = NEARESTMV;
   1700                        b->drl_idx = NEAREST_DRL;
   1701                    }
   1702                    assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
   1703                    b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
   1704                    if (b->drl_idx < NEAR_DRL)
   1705                        fix_mv_precision(f->frame_hdr, &b->mv[0]);
   1706                }
   1707 
   1708                if (DEBUG_BLOCK_INFO)
   1709                    printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n",
   1710                           b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
   1711                           ts->msac.rng);
   1712            } else {
   1713                has_subpel_filter = 1;
   1714                b->inter_mode = NEWMV;
   1715                b->drl_idx = NEAREST_DRL;
   1716                if (n_mvs > 1) { // NEARER, NEAR or NEARISH
   1717                    const int drl_ctx_v1 = get_drl_context(mvstack, 0);
   1718                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1719                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
   1720                    if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
   1721                        const int drl_ctx_v2 = get_drl_context(mvstack, 1);
   1722                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
   1723                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
   1724                    }
   1725                }
   1726                assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
   1727                if (n_mvs > 1) {
   1728                    b->mv[0] = mvstack[b->drl_idx].mv.mv[0];
   1729                } else {
   1730                    assert(!b->drl_idx);
   1731                    b->mv[0] = mvstack[0].mv.mv[0];
   1732                    fix_mv_precision(f->frame_hdr, &b->mv[0]);
   1733                }
   1734                if (DEBUG_BLOCK_INFO)
   1735                    printf("Post-intermode[%d,drl=%d]: r=%d\n",
   1736                           b->inter_mode, b->drl_idx, ts->msac.rng);
   1737                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv;
   1738                read_mv_residual(ts, &b->mv[0], mv_prec);
   1739                if (DEBUG_BLOCK_INFO)
   1740                    printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
   1741                           b->mv[0].y, b->mv[0].x, ts->msac.rng);
   1742            }
   1743 
   1744            // interintra flags
   1745            const int ii_sz_grp = dav1d_ymode_size_context[bs];
   1746            if (f->seq_hdr->inter_intra &&
   1747                interintra_allowed_mask & (1 << bs) &&
   1748                dav1d_msac_decode_bool_adapt(&ts->msac,
   1749                                             ts->cdf.m.interintra[ii_sz_grp]))
   1750            {
   1751                b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
   1752                                         ts->cdf.m.interintra_mode[ii_sz_grp],
   1753                                         N_INTER_INTRA_PRED_MODES - 1);
   1754                const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
   1755                b->interintra_type = INTER_INTRA_BLEND +
   1756                                     dav1d_msac_decode_bool_adapt(&ts->msac,
   1757                                         ts->cdf.m.interintra_wedge[wedge_ctx]);
   1758                if (b->interintra_type == INTER_INTRA_WEDGE)
   1759                    b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
   1760                                       ts->cdf.m.wedge_idx[wedge_ctx], 15);
   1761            } else {
   1762                b->interintra_type = INTER_INTRA_NONE;
   1763            }
   1764            if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
   1765                interintra_allowed_mask & (1 << bs))
   1766            {
   1767                printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
   1768                       b->interintra_type, b->interintra_mode,
   1769                       b->wedge_idx, ts->msac.rng);
   1770            }
   1771 
   1772            // motion variation
   1773            if (f->frame_hdr->switchable_motion_mode &&
   1774                b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 &&
   1775                // is not warped global motion
   1776                !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV &&
   1777                  f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) &&
   1778                // has overlappable neighbours
   1779                ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
   1780                 (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
   1781            {
   1782                // reaching here means the block allows obmc - check warp by
   1783                // finding matching-ref blocks in top/left edges
   1784                uint64_t mask[2] = { 0, 0 };
   1785                find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
   1786                                  have_left, have_top, b->ref[0], mask);
   1787                const int allow_warp = !f->svc[b->ref[0]][0].scale &&
   1788                    !f->frame_hdr->force_integer_mv &&
   1789                    f->frame_hdr->warp_motion && (mask[0] | mask[1]);
   1790 
   1791                b->motion_mode = allow_warp ?
   1792                    dav1d_msac_decode_symbol_adapt4(&ts->msac,
   1793                        ts->cdf.m.motion_mode[bs], 2) :
   1794                    dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
   1795                if (b->motion_mode == MM_WARP) {
   1796                    has_subpel_filter = 0;
   1797                    derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
   1798 #define signabs(v) v < 0 ? '-' : ' ', abs(v)
   1799                    if (DEBUG_BLOCK_INFO)
   1800                        printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
   1801                               "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x, "
   1802                               "mv=y:%d,x:%d\n",
   1803                               signabs(t->warpmv.matrix[0]),
   1804                               signabs(t->warpmv.matrix[1]),
   1805                               signabs(t->warpmv.matrix[2]),
   1806                               signabs(t->warpmv.matrix[3]),
   1807                               signabs(t->warpmv.matrix[4]),
   1808                               signabs(t->warpmv.matrix[5]),
   1809                               signabs(t->warpmv.u.p.alpha),
   1810                               signabs(t->warpmv.u.p.beta),
   1811                               signabs(t->warpmv.u.p.gamma),
   1812                               signabs(t->warpmv.u.p.delta),
   1813                               b->mv[0].y, b->mv[0].x);
   1814 #undef signabs
   1815                    if (t->frame_thread.pass) {
   1816                        if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
   1817                            b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
   1818                            b->matrix[1] = t->warpmv.matrix[3];
   1819                            b->matrix[2] = t->warpmv.matrix[4];
   1820                            b->matrix[3] = t->warpmv.matrix[5] - 0x10000;
   1821                        } else {
   1822                            b->matrix[0] = INT16_MIN;
   1823                        }
   1824                    }
   1825                }
   1826 
   1827                if (DEBUG_BLOCK_INFO)
   1828                    printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIx64 "/0x%"
   1829                           PRIx64 "]\n", b->motion_mode, ts->msac.rng, mask[0],
   1830                            mask[1]);
   1831            } else {
   1832                b->motion_mode = MM_TRANSLATION;
   1833            }
   1834        }
   1835 
   1836        // subpel filter
   1837        enum Dav1dFilterMode filter[2];
   1838        if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
   1839            if (has_subpel_filter) {
   1840                const int comp = b->comp_type != COMP_INTER_NONE;
   1841                const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
   1842                                                by4, bx4);
   1843                filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
   1844                               ts->cdf.m.filter[0][ctx1],
   1845                               DAV1D_N_SWITCHABLE_FILTERS - 1);
   1846                if (f->seq_hdr->dual_filter) {
   1847                    const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
   1848                                                    b->ref[0], by4, bx4);
   1849                    if (DEBUG_BLOCK_INFO)
   1850                        printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
   1851                               filter[0], ctx1, ts->msac.rng);
   1852                    filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
   1853                                    ts->cdf.m.filter[1][ctx2],
   1854                                    DAV1D_N_SWITCHABLE_FILTERS - 1);
   1855                    if (DEBUG_BLOCK_INFO)
   1856                        printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
   1857                               filter[1], ctx2, ts->msac.rng);
   1858                } else {
   1859                    filter[1] = filter[0];
   1860                    if (DEBUG_BLOCK_INFO)
   1861                        printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
   1862                               filter[0], ctx1, ts->msac.rng);
   1863                }
   1864            } else {
   1865                filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR;
   1866            }
   1867        } else {
   1868            filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode;
   1869        }
   1870        b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
   1871 
   1872        read_vartx_tree(t, b, bs, bx4, by4);
   1873 
   1874        // reconstruction
   1875        if (t->frame_thread.pass == 1) {
   1876            f->bd_fn.read_coef_blocks(t, bs, b);
   1877        } else {
   1878            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
   1879        }
   1880 
   1881        if (f->frame_hdr->loopfilter.level_y[0] ||
   1882            f->frame_hdr->loopfilter.level_y[1])
   1883        {
   1884            const int is_globalmv =
   1885                b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
   1886            const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
   1887                &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
   1888            const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
   1889            enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
   1890            if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
   1891                ytx  = (enum RectTxfmSize) TX_4X4;
   1892                uvtx = (enum RectTxfmSize) TX_4X4;
   1893            }
   1894            dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
   1895                                       t->bx, t->by, f->w4, f->h4, b->skip, bs,
   1896                                       ytx, tx_split, uvtx, f->cur.p.layout,
   1897                                       &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
   1898                                       has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
   1899                                       has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
   1900        }
   1901 
   1902        // context updates
   1903        if (is_comp)
   1904            splat_tworef_mv(f->c, t, bs, b, bw4, bh4);
   1905        else
   1906            splat_oneref_mv(f->c, t, bs, b, bw4, bh4);
   1907        BlockContext *edge = t->a;
   1908        for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
   1909 #define set_ctx(rep_macro) \
   1910            rep_macro(edge->seg_pred, off, seg_pred); \
   1911            rep_macro(edge->skip_mode, off, b->skip_mode); \
   1912            rep_macro(edge->intra, off, 0); \
   1913            rep_macro(edge->skip, off, b->skip); \
   1914            rep_macro(edge->pal_sz, off, 0); \
   1915            /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
   1916            rep_macro(t->pal_sz_uv[i], off, 0); \
   1917            rep_macro(edge->tx_intra, off, b_dim[2 + i]); \
   1918            rep_macro(edge->comp_type, off, b->comp_type); \
   1919            rep_macro(edge->filter[0], off, filter[0]); \
   1920            rep_macro(edge->filter[1], off, filter[1]); \
   1921            rep_macro(edge->mode, off, b->inter_mode); \
   1922            rep_macro(edge->ref[0], off, b->ref[0]); \
   1923            rep_macro(edge->ref[1], off, ((uint8_t) b->ref[1]))
   1924            case_set(b_dim[2 + i]);
   1925 #undef set_ctx
   1926        }
   1927        if (has_chroma) {
   1928            dav1d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED);
   1929            dav1d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED);
   1930        }
   1931    }
   1932 
   1933    // update contexts
   1934    if (f->frame_hdr->segmentation.enabled &&
   1935        f->frame_hdr->segmentation.update_map)
   1936    {
   1937        uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
   1938 #define set_ctx(rep_macro) \
   1939        for (int y = 0; y < bh4; y++) { \
   1940            rep_macro(seg_ptr, 0, b->seg_id); \
   1941            seg_ptr += f->b4_stride; \
   1942        }
   1943        case_set(b_dim[2]);
   1944 #undef set_ctx
   1945    }
   1946    if (!b->skip) {
   1947        uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1];
   1948        const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
   1949        const int bx_idx = (bx4 & 16) >> 4;
   1950        for (int y = 0; y < bh4; y += 2, noskip_mask++) {
   1951            (*noskip_mask)[bx_idx] |= mask;
   1952            if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
   1953                (*noskip_mask)[1] |= mask;
   1954        }
   1955    }
   1956 
   1957    if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) {
   1958        const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
   1959        int (*const lowest_px)[2] = ts->lowest_pixel[sby];
   1960 
   1961        // keep track of motion vectors for each reference
   1962        if (b->comp_type == COMP_INTER_NONE) {
   1963            // y
   1964            if (imin(bw4, bh4) > 1 &&
   1965                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
   1966                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
   1967            {
   1968                affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim,
   1969                                      b->motion_mode == MM_WARP ? &t->warpmv :
   1970                                      &f->frame_hdr->gmv[b->ref[0]]);
   1971            } else {
   1972                mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
   1973                             0, &f->svc[b->ref[0]][1]);
   1974                if (b->motion_mode == MM_OBMC) {
   1975                    obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4);
   1976                }
   1977            }
   1978 
   1979            // uv
   1980            if (has_chroma) {
   1981                // sub8x8 derivation
   1982                int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
   1983                refmvs_block *const *r;
   1984                if (is_sub8x8) {
   1985                    assert(ss_hor == 1);
   1986                    r = &t->rt.r[(t->by & 31) + 5];
   1987                    if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
   1988                    if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
   1989                    if (bw4 == 1 && bh4 == ss_ver)
   1990                        is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
   1991                }
   1992 
   1993                // chroma prediction
   1994                if (is_sub8x8) {
   1995                    assert(ss_hor == 1);
   1996                    if (bw4 == 1 && bh4 == ss_ver) {
   1997                        const refmvs_block *const rr = &r[-1][t->bx - 1];
   1998                        mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
   1999                                     t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
   2000                                     &f->svc[rr->ref.ref[0] - 1][1]);
   2001                    }
   2002                    if (bw4 == 1) {
   2003                        const refmvs_block *const rr = &r[0][t->bx - 1];
   2004                        mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
   2005                                     t->by, bh4, rr->mv.mv[0].y, ss_ver,
   2006                                     &f->svc[rr->ref.ref[0] - 1][1]);
   2007                    }
   2008                    if (bh4 == ss_ver) {
   2009                        const refmvs_block *const rr = &r[-1][t->bx];
   2010                        mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
   2011                                     t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
   2012                                     &f->svc[rr->ref.ref[0] - 1][1]);
   2013                    }
   2014                    mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4,
   2015                                 b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
   2016                } else {
   2017                    if (imin(cbw4, cbh4) > 1 &&
   2018                        ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
   2019                         (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
   2020                    {
   2021                        affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim,
   2022                                                b->motion_mode == MM_WARP ? &t->warpmv :
   2023                                                &f->frame_hdr->gmv[b->ref[0]]);
   2024                    } else {
   2025                        mc_lowest_px(&lowest_px[b->ref[0]][1],
   2026                                     t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
   2027                                     b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
   2028                        if (b->motion_mode == MM_OBMC) {
   2029                            obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4);
   2030                        }
   2031                    }
   2032                }
   2033            }
   2034        } else {
   2035            // y
   2036            for (int i = 0; i < 2; i++) {
   2037                if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
   2038                    affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim,
   2039                                          &f->frame_hdr->gmv[b->ref[i]]);
   2040                } else {
   2041                    mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
   2042                                 b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
   2043                }
   2044            }
   2045 
   2046            // uv
   2047            if (has_chroma) for (int i = 0; i < 2; i++) {
   2048                if (b->inter_mode == GLOBALMV_GLOBALMV &&
   2049                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
   2050                {
   2051                    affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim,
   2052                                            &f->frame_hdr->gmv[b->ref[i]]);
   2053                } else {
   2054                    mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
   2055                                 b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
   2056                }
   2057            }
   2058        }
   2059    }
   2060 
   2061    return 0;
   2062 }
   2063 
   2064 #if __has_feature(memory_sanitizer)
   2065 
   2066 #include <sanitizer/msan_interface.h>
   2067 
/* MemorySanitizer-only wrapper around decode_b(): after a successful decode
 * on a reconstruction pass, scan every pixel row the block covers in each
 * plane and report any bytes that were left uninitialized. The #define
 * below this function aliases decode_b to this wrapper under MSan. */
static int checked_decode_b(Dav1dTaskContext *const t,
                           const enum BlockLevel bl,
                           const enum BlockSize bs,
                           const enum BlockPartition bp,
                           const enum EdgeFlags intra_edge_flags)
{
    const Dav1dFrameContext *const f = t->f;
    const int err = decode_b(t, bl, bs, bp, intra_edge_flags);

    /* Odd passes only parse coefficients; pixels exist only on passes
     * 0 (single-pass decoding) and 2 (reconstruction). */
    if (err == 0 && !(t->frame_thread.pass & 1)) {
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
        const uint8_t *const b_dim = dav1d_block_dimensions[bs];
        const int bw4 = b_dim[0], bh4 = b_dim[1];
        const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
        /* Sub-8x8 blocks carry chroma only at the bottom/right position of
         * their 8x8 unit (odd bx/by), matching decode_b()'s derivation. */
        const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
                               (bw4 > ss_hor || t->bx & 1) &&
                               (bh4 > ss_ver || t->by & 1);

        for (int p = 0; p < 1 + 2 * has_chroma; p++) {
            /* Per-plane subsampling: luma (p == 0) is never subsampled. */
            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
            const ptrdiff_t stride = f->cur.stride[!!p];
            const int bx = t->bx & ~ss_hor;
            const int by = t->by & ~ss_ver;
            /* The (bw4 == ss_hor)/(bh4 == ss_ver) terms widen the checked
             * area for sub-8x8 blocks whose chroma covers the full unit. */
            const int width  = w4 << (2 - ss_hor + (bw4 == ss_hor));
            const int height = h4 << (2 - ss_ver + (bh4 == ss_ver));

            /* Byte address of the block's top-left pixel in this plane;
             * the hbd shift accounts for 2-byte high-bit-depth samples. */
            const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride +
                                  (bx << (2 - ss_hor + !!f->seq_hdr->hbd));

            for (int y = 0; y < height; data += stride, y++) {
                const size_t line_sz = width << !!f->seq_hdr->hbd;
                /* __msan_test_shadow() returns -1 iff the whole range is
                 * initialized; otherwise print block context and have MSan
                 * pinpoint the offending byte. */
                if (__msan_test_shadow(data, line_sz) != -1) {
                    fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n",
                            p, bx, by, w4, h4, y);
                    __msan_check_mem_is_initialized(data, line_sz);
                }
            }
        }
    }

    return err;
}
   2112 
   2113 #define decode_b checked_decode_b
   2114 
   2115 #endif /* defined(__has_feature) */
   2116 
/* Recursively decode the partition tree for one block at level bl.
 * Depending on the frame-threading pass, the partition type is either read
 * from the bitstream (passes 0/1) or replayed from the Av1Block metadata
 * stored by pass 1 (pass 2). t->bx/t->by are temporarily advanced for each
 * sub-block and restored before returning.
 * Returns 0 on success, 1 on a bitstream conformance violation (e.g. a
 * partition type forbidden in 4:2:2), -1 on a block decoding error. */
static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl,
                    const EdgeNode *const node)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const int hsz = 16 >> bl; /* half this level's block size, in 4px units */
    const int have_h_split = f->bw > t->bx + hsz;
    const int have_v_split = f->bh > t->by + hsz;

    /* Block extends past both frame edges: an implicit split, no partition
     * symbol is coded. */
    if (!have_h_split && !have_v_split) {
        assert(bl < BL_8X8);
        return decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0));
    }

    uint16_t *pc;
    enum BlockPartition bp;
    int ctx, bx8, by8;
    if (t->frame_thread.pass != 2) {
        if (0 && bl == BL_64X64) /* disabled debug output */
            printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
                   f->frame_hdr->frame_offset, t->by, t->bx, bl, ts->msac.rng);
        /* 8x8-unit coordinates within the 128x128 superblock, used to index
         * the above/left partition context. */
        bx8 = (t->bx & 31) >> 1;
        by8 = (t->by & 31) >> 1;
        ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
        pc = ts->cdf.m.partition[bl][ctx];
    }

    if (have_h_split && have_v_split) {
        /* Block fully inside the frame: all partition types are possible. */
        if (t->frame_thread.pass == 2) {
            /* Replay: if pass 1 stored a block at this level it was not
             * split further; otherwise descend. */
            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
            bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
        } else {
            bp = dav1d_msac_decode_symbol_adapt16(&ts->msac, pc,
                                                  dav1d_partition_type_count[bl]);
            /* 4:2:2 forbids vertical partitions that would produce
             * illegally narrow chroma blocks. */
            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
                (bp == PARTITION_V || bp == PARTITION_V4 ||
                 bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
            {
                return 1;
            }
            if (DEBUG_BLOCK_INFO)
                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
                       ts->msac.rng);
        }
        /* b[0]/b[1]: block sizes of the partition's main/secondary pieces. */
        const uint8_t *const b = dav1d_block_sizes[bl][bp];

        switch (bp) {
        case PARTITION_NONE:
            if (decode_b(t, bl, b[0], PARTITION_NONE, node->o))
                return -1;
            break;
        case PARTITION_H: /* two blocks stacked vertically */
            if (decode_b(t, bl, b[0], PARTITION_H, node->h[0]))
                return -1;
            t->by += hsz;
            if (decode_b(t, bl, b[0], PARTITION_H, node->h[1]))
                return -1;
            t->by -= hsz;
            break;
        case PARTITION_V: /* two blocks side by side */
            if (decode_b(t, bl, b[0], PARTITION_V, node->v[0]))
                return -1;
            t->bx += hsz;
            if (decode_b(t, bl, b[0], PARTITION_V, node->v[1]))
                return -1;
            t->bx -= hsz;
            break;
        case PARTITION_SPLIT:
            if (bl == BL_8X8) {
                /* Leaf level: a split means four 4x4 blocks, decoded inline
                 * rather than via recursion. */
                const EdgeTip *const tip = (const EdgeTip *) node;
                assert(hsz == 1);
                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, EDGE_ALL_TR_AND_BL))
                    return -1;
                const enum Filter2d tl_filter = t->tl_4x4_filter;
                t->bx++;
                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0]))
                    return -1;
                t->bx--;
                t->by++;
                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1]))
                    return -1;
                t->bx++;
                /* Restore the top-left filter saved before the 2nd block so
                 * the 4th block sees the correct diagonal neighbor. */
                t->tl_4x4_filter = tl_filter;
                if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2]))
                    return -1;
                t->bx--;
                t->by--;
#if ARCH_X86_64
                if (t->frame_thread.pass) {
                    /* In 8-bit mode with 2-pass decoding the coefficient buffer
                     * can end up misaligned due to skips here. Work around
                     * the issue by explicitly realigning the buffer. */
                    const int p = t->frame_thread.pass & 1;
                    ts->frame_thread[p].cf =
                        (void*)(((uintptr_t)ts->frame_thread[p].cf + 63) & ~63);
                }
#endif
            } else {
                /* Recurse into the four quadrants. */
                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0)))
                    return 1;
                t->bx += hsz;
                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1)))
                    return 1;
                t->bx -= hsz;
                t->by += hsz;
                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2)))
                    return 1;
                t->bx += hsz;
                if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 3)))
                    return 1;
                t->bx -= hsz;
                t->by -= hsz;
            }
            break;
        /* T-shaped partitions: two small blocks plus one half-size block. */
        case PARTITION_T_TOP_SPLIT: {
            if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, EDGE_ALL_TR_AND_BL))
                return -1;
            t->bx += hsz;
            if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, node->v[1]))
                return -1;
            t->bx -= hsz;
            t->by += hsz;
            if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, node->h[1]))
                return -1;
            t->by -= hsz;
            break;
        }
        case PARTITION_T_BOTTOM_SPLIT: {
            if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, node->h[0]))
                return -1;
            t->by += hsz;
            if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, node->v[0]))
                return -1;
            t->bx += hsz;
            if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, 0))
                return -1;
            t->bx -= hsz;
            t->by -= hsz;
            break;
        }
        case PARTITION_T_LEFT_SPLIT: {
            if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, EDGE_ALL_TR_AND_BL))
                return -1;
            t->by += hsz;
            if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, node->h[1]))
                return -1;
            t->by -= hsz;
            t->bx += hsz;
            if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, node->v[1]))
                return -1;
            t->bx -= hsz;
            break;
        }
        case PARTITION_T_RIGHT_SPLIT: {
            if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, node->v[0]))
                return -1;
            t->bx += hsz;
            if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, node->h[0]))
                return -1;
            t->by += hsz;
            if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, 0))
                return -1;
            t->by -= hsz;
            t->bx -= hsz;
            break;
        }
        case PARTITION_H4: { /* four quarter-height horizontal strips */
            const EdgeBranch *const branch = (const EdgeBranch *) node;
            if (decode_b(t, bl, b[0], PARTITION_H4, node->h[0]))
                return -1;
            t->by += hsz >> 1;
            if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4))
                return -1;
            t->by += hsz >> 1;
            if (decode_b(t, bl, b[0], PARTITION_H4, EDGE_ALL_LEFT_HAS_BOTTOM))
                return -1;
            t->by += hsz >> 1;
            /* The last strip may lie entirely below the frame edge. */
            if (t->by < f->bh)
                if (decode_b(t, bl, b[0], PARTITION_H4, node->h[1]))
                    return -1;
            t->by -= hsz * 3 >> 1;
            break;
        }
        case PARTITION_V4: { /* four quarter-width vertical strips */
            const EdgeBranch *const branch = (const EdgeBranch *) node;
            if (decode_b(t, bl, b[0], PARTITION_V4, node->v[0]))
                return -1;
            t->bx += hsz >> 1;
            if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4))
                return -1;
            t->bx += hsz >> 1;
            if (decode_b(t, bl, b[0], PARTITION_V4, EDGE_ALL_TOP_HAS_RIGHT))
                return -1;
            t->bx += hsz >> 1;
            /* The last strip may lie entirely right of the frame edge. */
            if (t->bx < f->bw)
                if (decode_b(t, bl, b[0], PARTITION_V4, node->v[1]))
                    return -1;
            t->bx -= hsz * 3 >> 1;
            break;
        }
        default: assert(0);
        }
    } else if (have_h_split) {
        /* Block hangs off the bottom of the frame: only split-vs-horizontal
         * is coded, as a single boolean. */
        unsigned is_split;
        if (t->frame_thread.pass == 2) {
            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
            is_split = b->bl != bl;
        } else {
            is_split = dav1d_msac_decode_bool(&ts->msac,
                           gather_top_partition_prob(pc, bl));
            if (DEBUG_BLOCK_INFO)
                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
                       is_split ? PARTITION_SPLIT : PARTITION_H, ts->msac.rng);
        }

        assert(bl < BL_8X8);
        if (is_split) {
            bp = PARTITION_SPLIT;
            /* Only the top two quadrants exist inside the frame. */
            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1;
            t->bx += hsz;
            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1))) return 1;
            t->bx -= hsz;
        } else {
            bp = PARTITION_H;
            if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_H][0],
                         PARTITION_H, node->h[0]))
                return -1;
        }
    } else {
        /* Block hangs off the right of the frame: split-vs-vertical. */
        assert(have_v_split);
        unsigned is_split;
        if (t->frame_thread.pass == 2) {
            const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
            is_split = b->bl != bl;
        } else {
            is_split = dav1d_msac_decode_bool(&ts->msac,
                           gather_left_partition_prob(pc, bl));
            /* PARTITION_V is illegal in 4:2:2 (narrow chroma). */
            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
                return 1;
            if (DEBUG_BLOCK_INFO)
                printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
                       is_split ? PARTITION_SPLIT : PARTITION_V, ts->msac.rng);
        }

        assert(bl < BL_8X8);
        if (is_split) {
            bp = PARTITION_SPLIT;
            /* Only the left two quadrants exist inside the frame. */
            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1;
            t->by += hsz;
            if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2))) return 1;
            t->by -= hsz;
        } else {
            bp = PARTITION_V;
            if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0],
                         PARTITION_V, node->v[0]))
                return -1;
        }
    }

    /* Record this partition in the above/left context for neighboring
     * blocks; non-leaf splits were already recorded by deeper levels. */
    if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
#define set_ctx(rep_macro) \
        rep_macro(t->a->partition, bx8, dav1d_al_part_ctx[0][bl][bp]); \
        rep_macro(t->l.partition, by8, dav1d_al_part_ctx[1][bl][bp])
        case_set_upto16(ulog2(hsz));
#undef set_ctx
    }

    return 0;
}
   2389 
   2390 static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) {
   2391    memset(ctx->intra, keyframe, sizeof(ctx->intra));
   2392    memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
   2393    if (keyframe)
   2394        memset(ctx->mode, DC_PRED, sizeof(ctx->mode));
   2395 
   2396    if (pass == 2) return;
   2397 
   2398    memset(ctx->partition, 0, sizeof(ctx->partition));
   2399    memset(ctx->skip, 0, sizeof(ctx->skip));
   2400    memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode));
   2401    memset(ctx->tx_lpf_y, 2, sizeof(ctx->tx_lpf_y));
   2402    memset(ctx->tx_lpf_uv, 1, sizeof(ctx->tx_lpf_uv));
   2403    memset(ctx->tx_intra, -1, sizeof(ctx->tx_intra));
   2404    memset(ctx->tx, TX_64X64, sizeof(ctx->tx));
   2405    if (!keyframe) {
   2406        memset(ctx->ref, -1, sizeof(ctx->ref));
   2407        memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
   2408        memset(ctx->mode, NEARESTMV, sizeof(ctx->mode));
   2409    }
   2410    memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef));
   2411    memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
   2412    memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
   2413    memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
   2414    memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
   2415 }
   2416 
// Per-pixel-layout buffer size multipliers: { Y+U+V, Y+U } * 4.
// Used below in setup_tile() to scale tile_start_off into offsets for the
// per-tile coefficient ([0]) and palette-index ([1]) scratch buffers,
// accounting for chroma subsampling (e.g. 4:2:0 adds 2/4 = half a luma
// plane per chroma plane: 4 + 1 + 1 = 6).
static const uint8_t ss_size_mul[4][2] = {
    [DAV1D_PIXEL_LAYOUT_I400] = {  4, 4 },
    [DAV1D_PIXEL_LAYOUT_I420] = {  6, 5 },
    [DAV1D_PIXEL_LAYOUT_I422] = {  8, 6 },
    [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 },
};
   2424 
/* Initialize a Dav1dTileState for decoding one tile: per-pass frame-thread
 * scratch pointers, CDF copy, MSAC entropy decoder, tile boundaries in
 * 4px units, and the default loop-restoration reference coefficients. */
static void setup_tile(Dav1dTileState *const ts,
                      const Dav1dFrameContext *const f,
                      const uint8_t *const data, const size_t sz,
                      const int tile_row, const int tile_col,
                      const unsigned tile_start_off)
{
    /* Tile bounds in superblock units, from the frame header. */
    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
    const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1];
    const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row];
    const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
    const int sb_shift = f->sb_shift;

    /* Slice the frame-wide scratch buffers at this tile's offset; the
     * layout-dependent multipliers are documented at ss_size_mul above.
     * Buffers may be NULL when frame-threading is not in use. */
    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
    for (int p = 0; p < 2; p++) {
        ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
            &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
            NULL;
        ts->frame_thread[p].cbi = f->frame_thread.cbi ?
            &f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
            NULL;
        ts->frame_thread[p].cf = f->frame_thread.cf ?
            (uint8_t*)f->frame_thread.cf +
                (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
            NULL;
    }

    /* Each tile starts from the frame's input CDFs. */
    dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
    ts->last_qidx = f->frame_hdr->quant.yac;
    ts->last_delta_lf.u32 = 0;

    dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);

    /* Tile boundaries in 4px block units, clipped to the frame. */
    ts->tiling.row = tile_row;
    ts->tiling.col = tile_col;
    ts->tiling.col_start = col_sb_start << sb_shift;
    ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
    ts->tiling.row_start = row_sb_start << sb_shift;
    ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);

    // Reference Restoration Unit (used for exp coding)
    int sb_idx, unit_idx;
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
        /* Super-resolution active: restoration units are laid out in
         * post-upscale coordinates, so only the vertical position is
         * derived here; the horizontal part is computed per-plane below. */
        // vertical components only
        sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
        unit_idx = (ts->tiling.row_start & 16) >> 3;
    } else {
        sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
        unit_idx = ((ts->tiling.row_start & 16) >> 3) +
                   ((ts->tiling.col_start & 16) >> 4);
    }
    for (int p = 0; p < 3; p++) {
        if (!((f->lf.restore_planes >> p) & 1U))
            continue;

        if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
            /* Map the tile's start column through the super-res scale to
             * find the first restoration unit it owns in this plane. */
            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
            const int d = f->frame_hdr->super_res.width_scale_denominator;
            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
            const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3;
            const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift;
            const int px_x = x << (unit_size_log2 + ss_hor);
            const int u_idx = unit_idx + ((px_x & 64) >> 6);
            const int sb128x = px_x >> 7;
            if (sb128x >= f->sr_sb128w) continue;
            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx];
        } else {
            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
        }

        /* Default Wiener filter taps and SGR weights used as the
         * prediction reference for the tile's first coded unit. */
        ts->lr_ref[p]->filter_v[0] = 3;
        ts->lr_ref[p]->filter_v[1] = -7;
        ts->lr_ref[p]->filter_v[2] = 15;
        ts->lr_ref[p]->filter_h[0] = 3;
        ts->lr_ref[p]->filter_h[1] = -7;
        ts->lr_ref[p]->filter_h[2] = 15;
        ts->lr_ref[p]->sgr_weights[0] = -32;
        ts->lr_ref[p]->sgr_weights[1] = 31;
    }

    /* With multiple worker threads, reset per-pass row progress so other
     * threads can track how far this tile has been decoded. */
    if (f->c->n_tc > 1) {
        for (int p = 0; p < 2; p++)
            atomic_init(&ts->progress[p], row_sb_start);
    }
}
   2510 
/* Decode the loop-restoration parameters for one restoration unit of
 * plane p. Subexp-coded values are predicted from the previously decoded
 * unit referenced by ts->lr_ref[p]; whenever a WIENER or SGRPROJ unit is
 * decoded, lr becomes the new prediction reference for subsequent units. */
static void read_restoration_info(Dav1dTaskContext *const t,
                                  Av1RestorationUnit *const lr, const int p,
                                  const enum Dav1dRestorationType frame_type)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;

    if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
        // per-unit choice between none/wiener/sgrproj; the +!!filter skips
        // over the SWITCHABLE enum value itself
        const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
                               ts->cdf.m.restore_switchable, 2);
        lr->type = filter + !!filter; /* NONE/WIENER/SGRPROJ */
    } else {
        // frame-wide type is fixed; a single flag selects it vs. NONE
        const unsigned type =
            dav1d_msac_decode_bool_adapt(&ts->msac,
                frame_type == DAV1D_RESTORATION_WIENER ?
                ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj);
        lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
    }

    if (lr->type == DAV1D_RESTORATION_WIENER) {
        // Wiener filter taps, subexp-coded as deltas against the reference
        // unit's taps; the first vertical/horizontal tap is forced to 0 for
        // chroma planes (p != 0) and is not present in the bitstream there.
        lr->filter_v[0] = p ? 0 :
            dav1d_msac_decode_subexp(&ts->msac,
                ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5;
        lr->filter_v[1] =
            dav1d_msac_decode_subexp(&ts->msac,
                ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23;
        lr->filter_v[2] =
            dav1d_msac_decode_subexp(&ts->msac,
                ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17;

        lr->filter_h[0] = p ? 0 :
            dav1d_msac_decode_subexp(&ts->msac,
                ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5;
        lr->filter_h[1] =
            dav1d_msac_decode_subexp(&ts->msac,
                ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23;
        lr->filter_h[2] =
            dav1d_msac_decode_subexp(&ts->msac,
                ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17;
        // carry the sgr weights over unchanged so a later SGRPROJ unit can
        // still predict from this one
        memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
        ts->lr_ref[p] = lr;
        if (DEBUG_BLOCK_INFO)
            printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
                   p, lr->filter_v[0], lr->filter_v[1],
                   lr->filter_v[2], lr->filter_h[0],
                   lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
    } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
        // 4-bit self-guided parameter-set index, folded into lr->type
        const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
        const uint16_t *const sgr_params = dav1d_sgr_params[idx];
        lr->type += idx;
        // each weight is only coded when the parameter set enables that
        // pass; otherwise it takes a fixed default (0 resp. 95)
        lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac,
            ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0;
        lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac,
            ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95;
        // carry the wiener taps over unchanged for future prediction
        memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
        memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
        ts->lr_ref[p] = lr;
        if (DEBUG_BLOCK_INFO)
            printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
                   p, idx, lr->sgr_weights[0],
                   lr->sgr_weights[1], ts->msac.rng);
    }
}
   2574 
   2575 // modeled after the equivalent function in aomdec:decodeframe.c
   2576 static int check_trailing_bits_after_symbol_coder(const MsacContext *const msac) {
   2577    // check marker bit (single 1), followed by zeroes
   2578    const int n_bits = -(msac->cnt + 14);
   2579    assert(n_bits <= 0); // this assumes we errored out when cnt <= -15 in caller
   2580    const int n_bytes = (n_bits + 7) >> 3;
   2581    const uint8_t *p = &msac->buf_pos[n_bytes];
   2582    const int pattern = 128 >> ((n_bits - 1) & 7);
   2583    if ((p[-1] & (2 * pattern - 1)) != pattern)
   2584        return 1;
   2585 
   2586    // check remainder zero bytes
   2587    for (; p < msac->buf_end; p++)
   2588        if (*p)
   2589            return 1;
   2590 
   2591    return 0;
   2592 }
   2593 
/* Decode one superblock row of a single tile (parse and/or reconstruct,
 * depending on the frame-threading pass). Returns 0 on success, nonzero
 * on flush request, superblock decoding failure, symbol-decoder overread,
 * or (in strict compliance mode) malformed trailing bits at the end of
 * the tile. */
int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
    const Dav1dFrameContext *const f = t->f;
    const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
    Dav1dTileState *const ts = t->ts;
    const Dav1dContext *const c = f->c;
    const int sb_step = f->sb_step;
    const int tile_row = ts->tiling.row, tile_col = ts->tiling.col;
    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
    // above-context is stored per 128-pixel column; halve when sb size is 64
    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;

    if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
        dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
                                     ts->tiling.col_end, ts->tiling.row_start,
                                     ts->tiling.row_end, t->by >> f->sb_shift,
                                     ts->tiling.row, t->frame_thread.pass);
    }

    if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) {
        // reset per-sbrow lowest-referenced-pixel tracking (used to signal
        // inter-frame dependencies when frame threading is enabled)
        const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
        int (*const lowest_px)[2] = ts->lowest_pixel[sby];
        for (int n = 0; n < 7; n++)
            for (int m = 0; m < 2; m++)
                lowest_px[n][m] = INT_MIN;
    }

    reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass);
    if (t->frame_thread.pass == 2) {
        // second (reconstruction) pass of 2-pass frame threading: walk the
        // sbrow again without any bitstream parsing state
        const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0;
        for (t->bx = ts->tiling.col_start,
             t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w;
             t->bx < ts->tiling.col_end; t->bx += sb_step)
        {
            if (atomic_load_explicit(c->flush, memory_order_acquire))
                return 1;
            if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl]))
                return 1;
            // advance above-context once per 128 horizontal pixels
            if (t->bx & 16 || f->seq_hdr->sb128)
                t->a++;
        }
        f->bd_fn.backup_ipred_edge(t);
        return 0;
    }

    if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
        f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row,
                                   ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
                                   t->by >> 1, (t->by + sb_step) >> 1);
    }
    memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
    const int sb128y = t->by >> 5;
    for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
         t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
         t->bx < ts->tiling.col_end; t->bx += sb_step)
    {
        if (atomic_load_explicit(c->flush, memory_order_acquire))
            return 1;
        // reset the cdef indices for this superblock (-1 = not yet coded);
        // a 128x128 sb covers all four 64x64 quadrants at once
        if (root_bl == BL_128X128) {
            t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
            t->cur_sb_cdef_idx_ptr[0] = -1;
            t->cur_sb_cdef_idx_ptr[1] = -1;
            t->cur_sb_cdef_idx_ptr[2] = -1;
            t->cur_sb_cdef_idx_ptr[3] = -1;
        } else {
            t->cur_sb_cdef_idx_ptr =
                &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) +
                                      ((t->by & 16) >> 3)];
            t->cur_sb_cdef_idx_ptr[0] = -1;
        }
        // Restoration filter
        for (int p = 0; p < 3; p++) {
            if (!((f->lf.restore_planes >> p) & 1U))
                continue;

            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
            const int y = t->by * 4 >> ss_ver;
            const int h = (f->cur.p.h + ss_ver) >> ss_ver;

            const int unit_size = 1 << unit_size_log2;
            const unsigned mask = unit_size - 1;
            // only the sbrow containing the top of a restoration unit
            // carries its info in the bitstream
            if (y & mask) continue;
            const int half_unit = unit_size >> 1;
            // Round half up at frame boundaries, if there's more than one
            // restoration unit
            if (y && y + half_unit > h) continue;

            const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];

            if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
                // super-res is active: restoration units are laid out in
                // the (wider) upscaled frame, so map the coded-frame column
                // range of this superblock to upscaled unit indices
                const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
                const int n_units = imax(1, (w + half_unit) >> unit_size_log2);

                const int d = f->frame_hdr->super_res.width_scale_denominator;
                const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3;
                const int x0 = ((4 *  t->bx            * d >> ss_hor) + rnd) >> shift;
                const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift;

                for (int x = x0; x < imin(x1, n_units); x++) {
                    const int px_x = x << (unit_size_log2 + ss_hor);
                    const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7);
                    const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6);
                    Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];

                    read_restoration_info(t, lr, p, frame_type);
                }
            } else {
                const int x = 4 * t->bx >> ss_hor;
                if (x & mask) continue;
                const int w = (f->cur.p.w + ss_hor) >> ss_hor;
                // Round half up at frame boundaries, if there's more than one
                // restoration unit
                if (x && x + half_unit > w) continue;
                const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5);
                const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
                Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];

                read_restoration_info(t, lr, p, frame_type);
            }
        }
        if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl]))
            return 1;
        // advance above-context / lf-mask pointers once per 128 px
        if (t->bx & 16 || f->seq_hdr->sb128) {
            t->a++;
            t->lf_mask++;
        }
    }

    if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
        dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
                               ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
                               t->by >> 1, (t->by + sb_step) >> 1);
    }

    // backup pre-loopfilter pixels for intra prediction of the next sbrow
    if (t->frame_thread.pass != 1)
        f->bd_fn.backup_ipred_edge(t);

    // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
    // up the initial value in neighbour tiles when running the loopfilter
    int align_h = (f->bh + 31) & ~31;
    memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
           &t->l.tx_lpf_y[t->by & 16], sb_step);
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    align_h >>= ss_ver;
    memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
           &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);

    // error out on symbol decoder overread
    if (ts->msac.cnt <= -15) return 1;

    // in strict mode, validate the trailing-bits padding, but only after
    // the last sbrow of this tile has been decoded
    return c->strict_std_compliance &&
           (t->by >> f->sb_shift) + 1 >= f->frame_hdr->tiling.row_start_sb[tile_row + 1] &&
           check_trailing_bits_after_symbol_coder(&ts->msac);
}
   2749 
/* Per-frame decoder setup: (re)allocate all frame-sized scratch buffers
 * (tile state, above-contexts, frame-threading coef/palette/block storage,
 * cdef/loop-restoration line buffers, loopfilter masks, intra-pred edge
 * backup), initialize refmvs, dequant/qm tables, jnt_comp weights and the
 * loopfilter plane pointers. Buffers are cached across frames and only
 * reallocated when the required size changes. Returns 0 on success or
 * DAV1D_ERR(ENOMEM) on allocation failure. */
int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
    const Dav1dContext *const c = f->c;
    int retval = DAV1D_ERR(ENOMEM);

    if (f->sbh > f->lf.start_of_tile_row_sz) {
        dav1d_free(f->lf.start_of_tile_row);
        f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t));
        if (!f->lf.start_of_tile_row) {
            f->lf.start_of_tile_row_sz = 0;
            goto error;
        }
        f->lf.start_of_tile_row_sz = f->sbh;
    }
    // mark, for each sbrow, the tile row starting there (0 for non-start rows)
    int sby = 0;
    for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
        f->lf.start_of_tile_row[sby++] = tile_row;
        while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1])
            f->lf.start_of_tile_row[sby++] = 0;
    }

    // (re)allocate per-tile state
    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
    if (n_ts != f->n_ts) {
        if (c->n_fc > 1) {
            dav1d_free(f->frame_thread.tile_start_off);
            f->frame_thread.tile_start_off =
                dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts);
            if (!f->frame_thread.tile_start_off) {
                f->n_ts = 0;
                goto error;
            }
        }
        dav1d_free_aligned(f->ts);
        f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32);
        if (!f->ts) goto error;
        f->n_ts = n_ts;
    }

    // above block contexts; doubled when both frame and tile threading are on
    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
    if (a_sz != f->a_sz) {
        dav1d_free(f->a);
        f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz);
        if (!f->a) {
            f->a_sz = 0;
            goto error;
        }
        f->a_sz = a_sz;
    }

    const int num_sb128 = f->sb128w * f->sb128h;
    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
    const int hbd = !!f->seq_hdr->hbd;
    if (c->n_fc > 1) {
        // frame threading: precompute each tile's offset into the
        // frame-wide per-block storage
        const unsigned sb_step4 = f->sb_step * 4;
        int tile_idx = 0;
        for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
            const unsigned row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
                                     sb_step4 * f->sb128w * 128;
            const unsigned b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
                                     f->frame_hdr->tiling.row_start_sb[tile_row]) * sb_step4;
            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
                f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
                    f->frame_hdr->tiling.col_start_sb[tile_col] * sb_step4;
            }
        }

        // per-tile, per-sbrow lowest-pixel tracking memory
        const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
        if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
            dav1d_free(f->tile_thread.lowest_pixel_mem);
            f->tile_thread.lowest_pixel_mem =
                dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz *
                             sizeof(*f->tile_thread.lowest_pixel_mem));
            if (!f->tile_thread.lowest_pixel_mem) {
                f->tile_thread.lowest_pixel_mem_sz = 0;
                goto error;
            }
            f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz;
        }
        // hand each tile its slice of the lowest-pixel memory
        int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem;
        for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows;
             tile_row++, tile_row_base += f->frame_hdr->tiling.cols)
        {
            const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
                                      f->frame_hdr->tiling.row_start_sb[tile_row];
            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
                f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr;
                lowest_pixel_ptr += tile_row_sb_h;
            }
        }

        // coded-block-info storage for the reconstruction pass
        const int cbi_sz = num_sb128 * size_mul[0];
        if (cbi_sz != f->frame_thread.cbi_sz) {
            dav1d_free_aligned(f->frame_thread.cbi);
            f->frame_thread.cbi =
                dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
                                    cbi_sz * 32 * 32 / 4, 64);
            if (!f->frame_thread.cbi) {
                f->frame_thread.cbi_sz = 0;
                goto error;
            }
            f->frame_thread.cbi_sz = cbi_sz;
        }

        // coefficient buffer; doubled for high bitdepth
        const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
        if (cf_sz != f->frame_thread.cf_sz) {
            dav1d_free_aligned(f->frame_thread.cf);
            f->frame_thread.cf =
                dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64);
            if (!f->frame_thread.cf) {
                f->frame_thread.cf_sz = 0;
                goto error;
            }
            memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);
            f->frame_thread.cf_sz = cf_sz;
        }

        // palette data is only needed when screen content tools are allowed
        if (f->frame_hdr->allow_screen_content_tools) {
            const int pal_sz = num_sb128 << hbd;
            if (pal_sz != f->frame_thread.pal_sz) {
                dav1d_free_aligned(f->frame_thread.pal);
                f->frame_thread.pal =
                    dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
                                        pal_sz * 16 * 16, 64);
                if (!f->frame_thread.pal) {
                    f->frame_thread.pal_sz = 0;
                    goto error;
                }
                f->frame_thread.pal_sz = pal_sz;
            }

            const int pal_idx_sz = num_sb128 * size_mul[1];
            if (pal_idx_sz != f->frame_thread.pal_idx_sz) {
                dav1d_free_aligned(f->frame_thread.pal_idx);
                f->frame_thread.pal_idx =
                    dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
                                        pal_idx_sz * 128 * 128 / 8, 64);
                if (!f->frame_thread.pal_idx) {
                    f->frame_thread.pal_idx_sz = 0;
                    goto error;
                }
                f->frame_thread.pal_idx_sz = pal_idx_sz;
            }
        } else if (f->frame_thread.pal) {
            // drop stale palette buffers when the tools are disabled
            dav1d_freep_aligned(&f->frame_thread.pal);
            dav1d_freep_aligned(&f->frame_thread.pal_idx);
            f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;
        }
    }

    // update allocation of block contexts for above
    ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
    const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
    const int need_cdef_lpf_copy = c->n_tc > 1 && has_resize;
    if (y_stride * f->sbh * 4 != f->lf.cdef_buf_plane_sz[0] ||
        uv_stride * f->sbh * 8 != f->lf.cdef_buf_plane_sz[1] ||
        need_cdef_lpf_copy != f->lf.need_cdef_lpf_copy ||
        f->sbh != f->lf.cdef_buf_sbh)
    {
        dav1d_free_aligned(f->lf.cdef_line_buf);
        size_t alloc_sz = 64;
        alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy;
        alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy;
        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32);
        if (!ptr) {
            f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0;
            goto error;
        }

        // for negative strides, point at the last row so that adding a
        // (negative) stride walks the buffer forwards in memory
        ptr += 32;
        if (y_stride < 0) {
            f->lf.cdef_line[0][0] = ptr - y_stride * (f->sbh * 4 - 1);
            f->lf.cdef_line[1][0] = ptr - y_stride * (f->sbh * 4 - 3);
        } else {
            f->lf.cdef_line[0][0] = ptr + y_stride * 0;
            f->lf.cdef_line[1][0] = ptr + y_stride * 2;
        }
        ptr += llabs(y_stride) * f->sbh * 4;
        if (uv_stride < 0) {
            f->lf.cdef_line[0][1] = ptr - uv_stride * (f->sbh * 8 - 1);
            f->lf.cdef_line[0][2] = ptr - uv_stride * (f->sbh * 8 - 3);
            f->lf.cdef_line[1][1] = ptr - uv_stride * (f->sbh * 8 - 5);
            f->lf.cdef_line[1][2] = ptr - uv_stride * (f->sbh * 8 - 7);
        } else {
            f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
            f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
            f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
            f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
        }

        if (need_cdef_lpf_copy) {
            ptr += llabs(uv_stride) * f->sbh * 8;
            if (y_stride < 0)
                f->lf.cdef_lpf_line[0] = ptr - y_stride * (f->sbh * 4 - 1);
            else
                f->lf.cdef_lpf_line[0] = ptr;
            ptr += llabs(y_stride) * f->sbh * 4;
            if (uv_stride < 0) {
                f->lf.cdef_lpf_line[1] = ptr - uv_stride * (f->sbh * 4 - 1);
                f->lf.cdef_lpf_line[2] = ptr - uv_stride * (f->sbh * 8 - 1);
            } else {
                f->lf.cdef_lpf_line[1] = ptr;
                f->lf.cdef_lpf_line[2] = ptr + uv_stride * f->sbh * 4;
            }
        }

        f->lf.cdef_buf_plane_sz[0] = (int) y_stride * f->sbh * 4;
        f->lf.cdef_buf_plane_sz[1] = (int) uv_stride * f->sbh * 8;
        f->lf.need_cdef_lpf_copy = need_cdef_lpf_copy;
        f->lf.cdef_buf_sbh = f->sbh;
    }

    // loop-restoration line buffers (sized against the super-res output)
    const int sb128 = f->seq_hdr->sb128;
    const int num_lines = c->n_tc > 1 ? f->sbh * 4 << sb128 : 12;
    y_stride = f->sr_cur.p.stride[0], uv_stride = f->sr_cur.p.stride[1];
    if (y_stride * num_lines != f->lf.lr_buf_plane_sz[0] ||
        uv_stride * num_lines * 2 != f->lf.lr_buf_plane_sz[1])
    {
        dav1d_free_aligned(f->lf.lr_line_buf);
        // lr simd may overread the input, so slightly over-allocate the lpf buffer
        size_t alloc_sz = 128;
        alloc_sz += (size_t)llabs(y_stride) * num_lines;
        alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2;
        uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64);
        if (!ptr) {
            f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0;
            goto error;
        }

        ptr += 64;
        if (y_stride < 0)
            f->lf.lr_lpf_line[0] = ptr - y_stride * (num_lines - 1);
        else
            f->lf.lr_lpf_line[0] = ptr;
        ptr += llabs(y_stride) * num_lines;
        if (uv_stride < 0) {
            f->lf.lr_lpf_line[1] = ptr - uv_stride * (num_lines * 1 - 1);
            f->lf.lr_lpf_line[2] = ptr - uv_stride * (num_lines * 2 - 1);
        } else {
            f->lf.lr_lpf_line[1] = ptr;
            f->lf.lr_lpf_line[2] = ptr + uv_stride * num_lines;
        }

        f->lf.lr_buf_plane_sz[0] = (int) y_stride * num_lines;
        f->lf.lr_buf_plane_sz[1] = (int) uv_stride * num_lines * 2;
    }

    // update allocation for loopfilter masks
    if (num_sb128 != f->lf.mask_sz) {
        dav1d_free(f->lf.mask);
        dav1d_free(f->lf.level);
        f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128);
        // over-allocate by 3 bytes since some of the SIMD implementations
        // index this from the level type and can thus over-read by up to 3
        f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);
        if (!f->lf.mask || !f->lf.level) {
            f->lf.mask_sz = 0;
            goto error;
        }
        if (c->n_fc > 1) {
            dav1d_free(f->frame_thread.b);
            f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
                                             num_sb128 * 32 * 32);
            if (!f->frame_thread.b) {
                f->lf.mask_sz = 0;
                goto error;
            }
        }
        f->lf.mask_sz = num_sb128;
    }

    // loop-restoration masks, sized against the super-res (output) width
    f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
    const int lr_mask_sz = f->sr_sb128w * f->sb128h;
    if (lr_mask_sz != f->lf.lr_mask_sz) {
        dav1d_free(f->lf.lr_mask);
        f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz);
        if (!f->lf.lr_mask) {
            f->lf.lr_mask_sz = 0;
            goto error;
        }
        f->lf.lr_mask_sz = lr_mask_sz;
    }
    // bitmask of planes with loop restoration enabled
    f->lf.restore_planes =
        ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
        ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
        ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
    // the loopfilter limit LUT only depends on sharpness; cache it
    if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
        dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
        f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
    }
    dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);

    // pre-loopfilter edge pixels backed up for intra prediction (3 planes)
    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;
    if (ipred_edge_sz != f->ipred_edge_sz) {
        dav1d_free_aligned(f->ipred_edge[0]);
        uint8_t *ptr = f->ipred_edge[0] =
            dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64);
        if (!ptr) {
            f->ipred_edge_sz = 0;
            goto error;
        }
        f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;
        f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;
        f->ipred_edge_sz = ipred_edge_sz;
    }

    // tx_lpf backup at tile right edges (one luma + one chroma plane array)
    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;
    if (re_sz != f->lf.re_sz) {
        dav1d_free(f->lf.tx_lpf_right_edge[0]);
        f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2);
        if (!f->lf.tx_lpf_right_edge[0]) {
            f->lf.re_sz = 0;
            goto error;
        }
        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;
        f->lf.re_sz = re_sz;
    }

    // init ref mvs
    if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
        const int ret =
            dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
                                    f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs,
                                    f->c->n_tc, f->c->n_fc);
        if (ret < 0) goto error;
    }

    // setup dequant tables
    init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
    if (f->frame_hdr->quant.qm)
        for (int i = 0; i < N_RECT_TX_SIZES; i++) {
            f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i];
            f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i];
            f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i];
        }
    else
        memset(f->qm, 0, sizeof(f->qm));

    // setup jnt_comp weights
    if (f->frame_hdr->switchable_comp_refs) {
        for (int i = 0; i < 7; i++) {
            const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset;

            for (int j = i + 1; j < 7; j++) {
                const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset;

                // temporal distances of the two references to the current
                // frame, clamped to 31
                const unsigned d1 =
                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc,
                                          f->cur.frame_hdr->frame_offset)), 31);
                const unsigned d0 =
                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc,
                                          f->cur.frame_hdr->frame_offset)), 31);
                const int order = d0 <= d1;

                static const uint8_t quant_dist_weight[3][2] = {
                    { 2, 3 }, { 2, 5 }, { 2, 7 }
                };
                static const uint8_t quant_dist_lookup_table[4][2] = {
                    { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }
                };

                // quantize the distance ratio into one of 4 weight pairs
                int k;
                for (k = 0; k < 3; k++) {
                    const int c0 = quant_dist_weight[k][order];
                    const int c1 = quant_dist_weight[k][!order];
                    const int d0_c0 = d0 * c0;
                    const int d1_c1 = d1 * c1;
                    if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
                }

                f->jnt_weights[i][j] = quant_dist_lookup_table[k][order];
            }
        }
    }

    /* Init loopfilter pointers. Increasing NULL pointers is technically UB,
     * so just point the chroma pointers in 4:0:0 to the luma plane here to
     * avoid having additional in-loop branches in various places. We never
     * dereference those pointers so it doesn't really matter what they
     * point at, as long as the pointers are valid. */
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
    f->lf.p[0] = f->cur.data[0];
    f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0];
    f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0];
    f->lf.sr_p[0] = f->sr_cur.p.data[0];
    f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
    f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];

    retval = 0;
error:
    return retval;
}
   3141 
   3142 int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) {
   3143    const Dav1dContext *const c = f->c;
   3144    int retval = DAV1D_ERR(EINVAL);
   3145 
   3146    if (f->frame_hdr->refresh_context)
   3147        dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
   3148 
   3149    // parse individual tiles per tile group
   3150    int tile_row = 0, tile_col = 0;
   3151    f->task_thread.update_set = 0;
   3152    for (int i = 0; i < f->n_tile_data; i++) {
   3153        const uint8_t *data = f->tile[i].data.data;
   3154        size_t size = f->tile[i].data.sz;
   3155 
   3156        for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
   3157            size_t tile_sz;
   3158            if (j == f->tile[i].end) {
   3159                tile_sz = size;
   3160            } else {
   3161                if (f->frame_hdr->tiling.n_bytes > size) goto error;
   3162                tile_sz = 0;
   3163                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++)
   3164                    tile_sz |= (unsigned)*data++ << (k * 8);
   3165                tile_sz++;
   3166                size -= f->frame_hdr->tiling.n_bytes;
   3167                if (tile_sz > size) goto error;
   3168            }
   3169 
   3170            setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++,
   3171                       c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0);
   3172 
   3173            if (tile_col == f->frame_hdr->tiling.cols) {
   3174                tile_col = 0;
   3175                tile_row++;
   3176            }
   3177            if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
   3178                f->task_thread.update_set = 1;
   3179            data += tile_sz;
   3180            size -= tile_sz;
   3181        }
   3182    }
   3183 
   3184    if (c->n_tc > 1) {
   3185        const int uses_2pass = c->n_fc > 1;
   3186        for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++)
   3187            reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr),
   3188                          uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0);
   3189    }
   3190 
   3191    retval = 0;
   3192 error:
   3193    return retval;
   3194 }
   3195 
   3196 int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
   3197    const Dav1dContext *const c = f->c;
   3198    int retval = DAV1D_ERR(EINVAL);
   3199 
   3200    assert(f->c->n_tc == 1);
   3201 
   3202    Dav1dTaskContext *const t = &c->tc[f - c->fc];
   3203    t->f = f;
   3204    t->frame_thread.pass = 0;
   3205 
   3206    for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
   3207        reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0);
   3208 
   3209    // no threading - we explicitly interleave tile/sbrow decoding
   3210    // and post-filtering, so that the full process runs in-line
   3211    for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
   3212        const int sbh_end =
   3213            imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
   3214        for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
   3215             sby < sbh_end; sby++)
   3216        {
   3217            t->by = sby << (4 + f->seq_hdr->sb128);
   3218            const int by_end = (t->by + f->sb_step) >> 1;
   3219            if (f->frame_hdr->use_ref_frame_mvs) {
   3220                f->c->refmvs_dsp.load_tmvs(&f->rf, tile_row,
   3221                                           0, f->bw >> 1, t->by >> 1, by_end);
   3222            }
   3223            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
   3224                t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
   3225                if (dav1d_decode_tile_sbrow(t)) goto error;
   3226            }
   3227            if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
   3228                dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt,
   3229                                       0, f->bw >> 1, t->by >> 1, by_end);
   3230            }
   3231 
   3232            // loopfilter + cdef + restoration
   3233            f->bd_fn.filter_sbrow(f, sby);
   3234        }
   3235    }
   3236 
   3237    retval = 0;
   3238 error:
   3239    return retval;
   3240 }
   3241 
/* Releases all per-frame state after decoding ends (with or without error)
 * and publishes the final decode status in f->task_thread.retval.
 *
 * retval is the decode result so far; in strict-std-compliance mode with
 * frame threading it may be upgraded to EINVAL here if a reference frame
 * finished in the error state. */
void dav1d_decode_frame_exit(Dav1dFrameContext *const f, int retval) {
    const Dav1dContext *const c = f->c;

    /* Clear the sticky error flag only if a picture was actually allocated. */
    if (f->sr_cur.p.data[0])
        atomic_init(&f->task_thread.error, 0);

    /* On error with frame threading, wipe the coefficient buffer so a later
     * pass cannot read stale coefficients from the failed decode. */
    if (c->n_fc > 1 && retval && f->frame_thread.cf) {
        memset(f->frame_thread.cf, 0,
               (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
    }
    /* Drop the 7 reference-picture and ref-MV holds taken at submit time. */
    for (int i = 0; i < 7; i++) {
        if (f->refp[i].p.frame_hdr) {
            /* Strict compliance: propagate a reference frame's error state
             * into this frame's result and progress marker. */
            if (!retval && c->n_fc > 1 && c->strict_std_compliance &&
                atomic_load(&f->refp[i].progress[1]) == FRAME_ERROR)
            {
                retval = DAV1D_ERR(EINVAL);
                atomic_store(&f->task_thread.error, 1);
                atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
            }
            dav1d_thread_picture_unref(&f->refp[i]);
        }
        dav1d_ref_dec(&f->ref_mvs_ref[i]);
    }

    dav1d_picture_unref_internal(&f->cur);
    dav1d_thread_picture_unref(&f->sr_cur);
    dav1d_cdf_thread_unref(&f->in_cdf);
    /* Signal waiters on the output CDF before releasing it: 1 = done,
     * TILE_ERROR = failed. */
    if (f->frame_hdr && f->frame_hdr->refresh_context) {
        if (f->out_cdf.progress)
            atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR);
        dav1d_cdf_thread_unref(&f->out_cdf);
    }
    dav1d_ref_dec(&f->cur_segmap_ref);
    dav1d_ref_dec(&f->prev_segmap_ref);
    dav1d_ref_dec(&f->mvs_ref);
    dav1d_ref_dec(&f->seq_hdr_ref);
    dav1d_ref_dec(&f->frame_hdr_ref);

    /* Release any tile data that was never consumed (error paths). */
    for (int i = 0; i < f->n_tile_data; i++)
        dav1d_data_unref_internal(&f->tile[i].data);
    f->task_thread.retval = retval;
}
   3284 
/* Decodes one frame synchronously (single frame-thread mode only).
 * Runs init + CDF init, then either dispatches tile sbrow tasks to the
 * task-thread pool and waits for completion, or decodes in-line; always
 * calls dav1d_decode_frame_exit() and returns the final status. */
int dav1d_decode_frame(Dav1dFrameContext *const f) {
    assert(f->c->n_fc == 1);
    // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
    // threads also. Not sure it makes a measurable difference.
    int res = dav1d_decode_frame_init(f);
    if (!res) res = dav1d_decode_frame_init_cdf(f);
    // wait until all threads have completed
    if (!res) {
        if (f->c->n_tc > 1) {
            /* Queue the sbrow tasks, wake a worker, then block on the
             * frame's condvar until pass 0 is done and no tasks remain. */
            res = dav1d_task_create_tile_sbrow(f, 0, 1);
            pthread_mutex_lock(&f->task_thread.ttd->lock);
            pthread_cond_signal(&f->task_thread.ttd->cond);
            if (!res) {
                while (!f->task_thread.done[0] ||
                       atomic_load(&f->task_thread.task_counter) > 0)
                {
                    pthread_cond_wait(&f->task_thread.cond,
                                      &f->task_thread.ttd->lock);
                }
            }
            pthread_mutex_unlock(&f->task_thread.ttd->lock);
            /* Pick up the result the workers recorded. */
            res = f->task_thread.retval;
        } else {
            res = dav1d_decode_frame_main(f);
            /* In-line CDF adaptation from the designated update tile. */
            if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) {
                dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
                                        &f->ts[f->frame_hdr->tiling.update].cdf);
            }
        }
    }
    /* Teardown runs on every path; it records the final status, which may
     * differ from res (e.g. strict-compliance upgrades). */
    dav1d_decode_frame_exit(f, res);
    res = f->task_thread.retval;
    f->n_tile_data = 0;
    return res;
}
   3320 
   3321 static int get_upscale_x0(const int in_w, const int out_w, const int step) {
   3322    const int err = out_w * step - (in_w << 14);
   3323    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
   3324    return x0 & 0x3fff;
   3325 }
   3326 
/* Submits the frame currently parsed into the context (c->frame_hdr /
 * c->tile) for decoding: picks a frame context, transfers headers and tile
 * data into it, allocates the output picture, wires up references, CDFs,
 * MV and segmentation buffers, updates the 8 reference slots, and either
 * decodes synchronously (n_fc == 1) or hands the frame to the task system.
 *
 * Returns 0 on success or a negative DAV1D_ERR; on error all state taken
 * by this function is released again. */
int dav1d_submit_frame(Dav1dContext *const c) {
    Dav1dFrameContext *f;
    int res = -1;

    // wait for c->out_delayed[next] and move into c->out if visible
    Dav1dThreadPicture *out_delayed;
    if (c->n_fc > 1) {
        /* Frame threading: claim the next frame context (round-robin) and
         * wait until it has finished consuming its previous tile data.
         * NOTE: the task lock is held from here until the frame is handed
         * off (or until the error path unlocks it). */
        pthread_mutex_lock(&c->task_thread.lock);
        const unsigned next = c->frame_thread.next++;
        if (c->frame_thread.next == c->n_fc)
            c->frame_thread.next = 0;

        f = &c->fc[next];
        while (f->n_tile_data > 0)
            pthread_cond_wait(&f->task_thread.cond,
                              &c->task_thread.lock);
        out_delayed = &c->frame_thread.out_delayed[next];
        if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
            /* Advance the scheduler's "first" frame pointer past the slot
             * we are about to reuse, and nudge task scheduling state. */
            unsigned first = atomic_load(&c->task_thread.first);
            if (first + 1U < c->n_fc)
                atomic_fetch_add(&c->task_thread.first, 1U);
            else
                atomic_store(&c->task_thread.first, 0);
            atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
                                           &first, UINT_MAX);
            if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
                c->task_thread.cur--;
        }
        /* Surface an error from the frame previously decoded in this slot;
         * otherwise emit its delayed output picture if it is showable. */
        const int error = f->task_thread.retval;
        if (error) {
            f->task_thread.retval = 0;
            c->cached_error = error;
            dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m);
            dav1d_thread_picture_unref(out_delayed);
        } else if (out_delayed->p.data[0]) {
            const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
                                                           memory_order_relaxed);
            if ((out_delayed->visible || c->output_invisible_frames) &&
                progress != FRAME_ERROR)
            {
                dav1d_thread_picture_ref(&c->out, out_delayed);
                c->event_flags |= dav1d_picture_get_event_flags(out_delayed);
            }
            dav1d_thread_picture_unref(out_delayed);
        }
    } else {
        f = c->fc;
    }

    /* Move the parsed headers into the frame context. The frame header
     * reference is transferred (context pointers are nulled); the sequence
     * header reference is shared (extra ref taken). */
    f->seq_hdr = c->seq_hdr;
    f->seq_hdr_ref = c->seq_hdr_ref;
    dav1d_ref_inc(f->seq_hdr_ref);
    f->frame_hdr = c->frame_hdr;
    f->frame_hdr_ref = c->frame_hdr_ref;
    c->frame_hdr = NULL;
    c->frame_hdr_ref = NULL;
    f->dsp = &c->dsp[f->seq_hdr->hbd];

    const int bpc = 8 + 2 * f->seq_hdr->hbd;

    /* Lazily initialize the DSP function tables for this bitdepth the
     * first time it is seen (intra_pred[DC_PRED] doubles as the "already
     * initialized" marker). */
    if (!f->dsp->ipred.intra_pred[DC_PRED]) {
        Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd];

        switch (bpc) {
#define assign_bitdepth_case(bd) \
            dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
            dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
            dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
            dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
            dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
            dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
            break
#if CONFIG_8BPC
        case 8:
            assign_bitdepth_case(8);
#endif
#if CONFIG_16BPC
        case 10:
        case 12:
            assign_bitdepth_case(16);
#endif
#undef assign_bitdepth_case
        default:
            dav1d_log(c, "Compiled without support for %d-bit decoding\n",
                    8 + 2 * f->seq_hdr->hbd);
            res = DAV1D_ERR(ENOPROTOOPT);
            goto error;
        }
    }

    /* Bind the bitdepth-templated reconstruction entry points. */
#define assign_bitdepth_case(bd) \
        f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
        f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
        f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
        f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
        f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
        f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
        f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
        f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
        f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
        f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
        f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
        f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
        f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
    if (!f->seq_hdr->hbd) {
#if CONFIG_8BPC
        assign_bitdepth_case(8);
#endif
    } else {
#if CONFIG_16BPC
        assign_bitdepth_case(16);
#endif
    }
#undef assign_bitdepth_case

    /* Validate and take references on the 7 reference frames; remember
     * each one's coded width for ref-MV / segmap compatibility checks. */
    int ref_coded_width[7];
    if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
        if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
            const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
            if (!c->refs[pri_ref].p.p.data[0]) {
                res = DAV1D_ERR(EINVAL);
                goto error;
            }
        }
        for (int i = 0; i < 7; i++) {
            const int refidx = f->frame_hdr->refidx[i];
            /* Reject refs that are missing, outside the spec's allowed
             * scaling range (2x down / 16x up), or of a different
             * layout/bitdepth than the current frame. */
            if (!c->refs[refidx].p.p.data[0] ||
                f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
                f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
                f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
                f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
                f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
                bpc != c->refs[refidx].p.p.p.bpc)
            {
                for (int j = 0; j < i; j++)
                    dav1d_thread_picture_unref(&f->refp[j]);
                res = DAV1D_ERR(EINVAL);
                goto error;
            }
            dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
            ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
            /* Precompute 14-bit scaling factors/steps for scaled refs;
             * scale == 0 marks an unscaled reference. */
            if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
                f->frame_hdr->height != c->refs[refidx].p.p.p.h)
            {
#define scale_fac(ref_sz, this_sz) \
    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
                f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
                                               f->frame_hdr->width[0]);
                f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
                                               f->frame_hdr->height);
                f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
                f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
            } else {
                f->svc[i][0].scale = f->svc[i][1].scale = 0;
            }
            /* Global-motion warp is only usable for unscaled refs with
             * non-translational, shearable params and fractional MVs. */
            f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
                                     !f->frame_hdr->force_integer_mv &&
                                     !dav1d_get_shear_params(&f->frame_hdr->gmv[i]) &&
                                     !f->svc[i][0].scale;
        }
    }

    // setup entropy
    if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
        /* No primary ref: start from the default CDFs for this qindex. */
        dav1d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac);
    } else {
        /* Inherit the CDF state saved by the primary reference frame. */
        const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
        dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
    }
    if (f->frame_hdr->refresh_context) {
        res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1);
        if (res < 0) goto error;
    }

    // FIXME qsort so tiles are in order (for frame threading)
    /* Move ownership of the pending tile data into the frame context,
     * growing the frame's tile array if needed. */
    if (f->n_tile_data_alloc < c->n_tile_data) {
        dav1d_free(f->tile);
        assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
        f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile));
        if (!f->tile) {
            f->n_tile_data_alloc = f->n_tile_data = 0;
            res = DAV1D_ERR(ENOMEM);
            goto error;
        }
        f->n_tile_data_alloc = c->n_tile_data;
    }
    memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
    memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
    f->n_tile_data = c->n_tile_data;
    c->n_tile_data = 0;

    // allocate frame
    res = dav1d_thread_picture_alloc(c, f, bpc);
    if (res < 0) goto error;

    /* Super-resolution: width[0] (coded) != width[1] (upscaled) means we
     * decode into a separate coded-width picture (f->cur) and upscale into
     * the output (f->sr_cur); otherwise they are the same picture. */
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
        res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
        if (res < 0) goto error;
    } else {
        dav1d_picture_ref(&f->cur, &f->sr_cur.p);
    }

    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
        /* Precompute the upscaler's per-plane step and start positions
         * (index 0 = luma, 1 = chroma, accounting for subsampling). */
        f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
        const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
        const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
        f->resize_step[1] = scale_fac(in_cw, out_cw);
#undef scale_fac
        f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]);
        f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
    }

    // move f->cur into output queue
    if (c->n_fc == 1) {
        if (f->frame_hdr->show_frame || c->output_invisible_frames) {
            dav1d_thread_picture_ref(&c->out, &f->sr_cur);
            c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur);
        }
    } else {
        dav1d_thread_picture_ref(out_delayed, &f->sr_cur);
    }

    /* Derive frame geometry in 4x4 ("4"), 8x8-pair ("b") and 128x128
     * superblock units, plus superblock stepping parameters. */
    f->w4 = (f->frame_hdr->width[0] + 3) >> 2;
    f->h4 = (f->frame_hdr->height + 3) >> 2;
    f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1;
    f->bh = ((f->frame_hdr->height + 7) >> 3) << 1;
    f->sb128w = (f->bw + 31) >> 5;
    f->sb128h = (f->bh + 31) >> 5;
    f->sb_shift = 4 + f->seq_hdr->sb128;
    f->sb_step = 16 << f->seq_hdr->sb128;
    f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
    f->b4_stride = (f->bw + 31) & ~31;
    f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
    atomic_init(&f->task_thread.error, 0);
    /* One task per tile sbrow plus one per frame sbrow, doubled when frame
     * threading adds a second (reconstruction) pass. */
    const int uses_2pass = c->n_fc > 1;
    const int cols = f->frame_hdr->tiling.cols;
    const int rows = f->frame_hdr->tiling.rows;
    atomic_store(&f->task_thread.task_counter,
                 (cols * rows + f->sbh) << uses_2pass);

    // ref_mvs
    if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
        f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
            sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
        if (!f->mvs_ref) {
            res = DAV1D_ERR(ENOMEM);
            goto error;
        }
        f->mvs = f->mvs_ref->data;
        /* Record each reference's frame_offset (POC) for MV projection. */
        if (!f->frame_hdr->allow_intrabc) {
            for (int i = 0; i < 7; i++)
                f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
        } else {
            memset(f->refpoc, 0, sizeof(f->refpoc));
        }
        if (f->frame_hdr->use_ref_frame_mvs) {
            for (int i = 0; i < 7; i++) {
                const int refidx = f->frame_hdr->refidx[i];
                const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
                const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
                /* A reference's saved MVs are only usable if its coded
                 * dimensions match the current frame's. */
                if (c->refs[refidx].refmvs != NULL &&
                    ref_w == f->bw && ref_h == f->bh)
                {
                    f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
                    dav1d_ref_inc(f->ref_mvs_ref[i]);
                    f->ref_mvs[i] = c->refs[refidx].refmvs->data;
                } else {
                    f->ref_mvs[i] = NULL;
                    f->ref_mvs_ref[i] = NULL;
                }
                memcpy(f->refrefpoc[i], c->refs[refidx].refpoc,
                       sizeof(*f->refrefpoc));
            }
        } else {
            memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
        }
    } else {
        f->mvs_ref = NULL;
        memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
    }

    // segmap
    if (f->frame_hdr->segmentation.enabled) {
        // By default, the previous segmentation map is not initialised.
        f->prev_segmap_ref = NULL;
        f->prev_segmap = NULL;

        // We might need a previous frame's segmentation map. This
        // happens if there is either no update or a temporal update.
        if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) {
            const int pri_ref = f->frame_hdr->primary_ref_frame;
            assert(pri_ref != DAV1D_PRIMARY_REF_NONE);
            const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1;
            const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
            if (ref_w == f->bw && ref_h == f->bh) {
                f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap;
                if (f->prev_segmap_ref) {
                    dav1d_ref_inc(f->prev_segmap_ref);
                    f->prev_segmap = f->prev_segmap_ref->data;
                }
            }
        }

        if (f->frame_hdr->segmentation.update_map) {
            // We're updating an existing map, but need somewhere to
            // put the new values. Allocate them here (the data
            // actually gets set elsewhere)
            f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool,
                sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h);
            if (!f->cur_segmap_ref) {
                dav1d_ref_dec(&f->prev_segmap_ref);
                res = DAV1D_ERR(ENOMEM);
                goto error;
            }
            f->cur_segmap = f->cur_segmap_ref->data;
        } else if (f->prev_segmap_ref) {
            // We're not updating an existing map, and we have a valid
            // reference. Use that.
            f->cur_segmap_ref = f->prev_segmap_ref;
            dav1d_ref_inc(f->cur_segmap_ref);
            f->cur_segmap = f->prev_segmap_ref->data;
        } else {
            // We need to make a new map. Allocate one here and zero it out.
            const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h;
            f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size);
            if (!f->cur_segmap_ref) {
                res = DAV1D_ERR(ENOMEM);
                goto error;
            }
            f->cur_segmap = f->cur_segmap_ref->data;
            memset(f->cur_segmap, 0, segmap_size);
        }
    } else {
        f->cur_segmap = NULL;
        f->cur_segmap_ref = NULL;
        f->prev_segmap_ref = NULL;
    }

    // update references etc.
    /* Install this frame's picture, CDFs, segmap and MVs into every
     * reference slot it refreshes. */
    const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
    for (int i = 0; i < 8; i++) {
        if (refresh_frame_flags & (1 << i)) {
            if (c->refs[i].p.p.frame_hdr)
                dav1d_thread_picture_unref(&c->refs[i].p);
            dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);

            dav1d_cdf_thread_unref(&c->cdf[i]);
            if (f->frame_hdr->refresh_context) {
                dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf);
            } else {
                dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
            }

            dav1d_ref_dec(&c->refs[i].segmap);
            c->refs[i].segmap = f->cur_segmap_ref;
            if (f->cur_segmap_ref)
                dav1d_ref_inc(f->cur_segmap_ref);
            dav1d_ref_dec(&c->refs[i].refmvs);
            if (!f->frame_hdr->allow_intrabc) {
                c->refs[i].refmvs = f->mvs_ref;
                if (f->mvs_ref)
                    dav1d_ref_inc(f->mvs_ref);
            }
            memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
        }
    }

    if (c->n_fc == 1) {
        /* Synchronous decode; on failure, roll back the reference-slot
         * updates made above. */
        if ((res = dav1d_decode_frame(f)) < 0) {
            dav1d_thread_picture_unref(&c->out);
            for (int i = 0; i < 8; i++) {
                if (refresh_frame_flags & (1 << i)) {
                    if (c->refs[i].p.p.frame_hdr)
                        dav1d_thread_picture_unref(&c->refs[i].p);
                    dav1d_cdf_thread_unref(&c->cdf[i]);
                    dav1d_ref_dec(&c->refs[i].segmap);
                    dav1d_ref_dec(&c->refs[i].refmvs);
                }
            }
            goto error;
        }
    } else {
        /* Hand the frame to the task system and release the lock taken at
         * the top of this function. */
        dav1d_task_frame_init(f);
        pthread_mutex_unlock(&c->task_thread.lock);
    }

    return 0;
error:
    /* Unwind everything acquired above; f->frame_hdr is always valid here
     * since the headers are transferred before any failing operation. */
    atomic_init(&f->task_thread.error, 1);
    dav1d_cdf_thread_unref(&f->in_cdf);
    if (f->frame_hdr->refresh_context)
        dav1d_cdf_thread_unref(&f->out_cdf);
    for (int i = 0; i < 7; i++) {
        if (f->refp[i].p.frame_hdr)
            dav1d_thread_picture_unref(&f->refp[i]);
        dav1d_ref_dec(&f->ref_mvs_ref[i]);
    }
    if (c->n_fc == 1)
        dav1d_thread_picture_unref(&c->out);
    else
        dav1d_thread_picture_unref(out_delayed);
    dav1d_picture_unref_internal(&f->cur);
    dav1d_thread_picture_unref(&f->sr_cur);
    dav1d_ref_dec(&f->mvs_ref);
    dav1d_ref_dec(&f->seq_hdr_ref);
    dav1d_ref_dec(&f->frame_hdr_ref);
    dav1d_data_props_copy(&c->cached_error_props, &c->in.m);

    for (int i = 0; i < f->n_tile_data; i++)
        dav1d_data_unref_internal(&f->tile[i].data);
    f->n_tile_data = 0;

    if (c->n_fc > 1)
        pthread_mutex_unlock(&c->task_thread.lock);

    return res;
}