tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

recon_tmpl.c (107321B)


      1 /*
      2 * Copyright © 2018-2021, VideoLAN and dav1d authors
      3 * Copyright © 2018, Two Orioles, LLC
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "config.h"
     29 
     30 #include <string.h>
     31 #include <stdio.h>
     32 
     33 #include "common/attributes.h"
     34 #include "common/bitdepth.h"
     35 #include "common/dump.h"
     36 #include "common/frame.h"
     37 #include "common/intops.h"
     38 
     39 #include "src/cdef_apply.h"
     40 #include "src/ctx.h"
     41 #include "src/ipred_prepare.h"
     42 #include "src/lf_apply.h"
     43 #include "src/lr_apply.h"
     44 #include "src/recon.h"
     45 #include "src/scan.h"
     46 #include "src/tables.h"
     47 #include "src/wedge.h"
     48 
     49 static inline unsigned read_golomb(MsacContext *const msac) {
     50    int len = 0;
     51    unsigned val = 1;
     52 
     53    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
     54    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
     55 
     56    return val - 1;
     57 }
     58 
     59 static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
     60                                    const enum BlockSize bs,
     61                                    const uint8_t *const a,
     62                                    const uint8_t *const l,
     63                                    const int chroma,
     64                                    const enum Dav1dPixelLayout layout)
     65 {
     66    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
     67 
     68    if (chroma) {
     69        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     70        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
     71        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
     72                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
     73        unsigned ca, cl;
     74 
     75 #define MERGE_CTX(dir, type, no_val) \
     76        c##dir = *(const type *) dir != no_val; \
     77        break
     78 
     79        switch (t_dim->lw) {
     80        /* For some reason the MSVC CRT _wassert() function is not flagged as
     81         * __declspec(noreturn), so when using those headers the compiler will
     82         * expect execution to continue after an assertion has been triggered
     83         * and will therefore complain about the use of uninitialized variables
     84         * when compiled in debug mode if we put the default case at the end. */
     85        default: assert(0); /* fall-through */
     86        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
     87        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
     88        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
     89        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
     90        }
     91        switch (t_dim->lh) {
     92        default: assert(0); /* fall-through */
     93        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
     94        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
     95        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
     96        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
     97        }
     98 #undef MERGE_CTX
     99 
    100        return 7 + not_one_blk * 3 + ca + cl;
    101    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
    102        return 0;
    103    } else {
    104        unsigned la, ll;
    105 
    106 #define MERGE_CTX(dir, type, tx) \
    107        if (tx == TX_64X64) { \
    108            uint64_t tmp = *(const uint64_t *) dir; \
    109            tmp |= *(const uint64_t *) &dir[8]; \
    110            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
    111        } else \
    112            l##dir = *(const type *) dir; \
    113        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
    114        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
    115        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
    116        break
    117 
    118        switch (t_dim->lw) {
    119        default: assert(0); /* fall-through */
    120        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
    121        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
    122        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
    123        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
    124        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
    125        }
    126        switch (t_dim->lh) {
    127        default: assert(0); /* fall-through */
    128        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
    129        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
    130        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
    131        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
    132        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
    133        }
    134 #undef MERGE_CTX
    135 
    136        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    137    }
    138 }
    139 
    140 static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
    141                                       const uint8_t *const a,
    142                                       const uint8_t *const l)
    143 {
    144    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
    145    int s;
    146 
    147 #if ARCH_X86_64 && defined(__GNUC__)
    148    /* Coerce compilers into producing better code. For some reason
    149     * every x86-64 compiler is awful at handling 64-bit constants. */
    150    __asm__("" : "+r"(mask), "+r"(mul));
    151 #endif
    152 
    153    switch(tx) {
    154    default: assert(0); /* fall-through */
    155    case TX_4X4: {
    156        int t = *(const uint8_t *) a >> 6;
    157        t    += *(const uint8_t *) l >> 6;
    158        s = t - 1 - 1;
    159        break;
    160    }
    161    case TX_8X8: {
    162        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
    163        t         += *(const uint16_t *) l & (uint32_t) mask;
    164        t *= 0x04040404U;
    165        s = (int) (t >> 24) - 2 - 2;
    166        break;
    167    }
    168    case TX_16X16: {
    169        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
    170        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
    171        t *= (uint32_t) mul;
    172        s = (int) (t >> 24) - 4 - 4;
    173        break;
    174    }
    175    case TX_32X32: {
    176        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
    177        t         += (*(const uint64_t *) l & mask) >> 6;
    178        t *= mul;
    179        s = (int) (t >> 56) - 8 - 8;
    180        break;
    181    }
    182    case TX_64X64: {
    183        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
    184        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
    185        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
    186        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
    187        t *= mul;
    188        s = (int) (t >> 56) - 16 - 16;
    189        break;
    190    }
    191    case RTX_4X8: {
    192        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
    193        t         += *(const uint16_t *) l & (uint32_t) mask;
    194        t *= 0x04040404U;
    195        s = (int) (t >> 24) - 1 - 2;
    196        break;
    197    }
    198    case RTX_8X4: {
    199        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
    200        t         += *(const uint8_t  *) l & (uint32_t) mask;
    201        t *= 0x04040404U;
    202        s = (int) (t >> 24) - 2 - 1;
    203        break;
    204    }
    205    case RTX_8X16: {
    206        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
    207        t         += *(const uint32_t *) l & (uint32_t) mask;
    208        t = (t >> 6) * (uint32_t) mul;
    209        s = (int) (t >> 24) - 2 - 4;
    210        break;
    211    }
    212    case RTX_16X8: {
    213        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
    214        t         += *(const uint16_t *) l & (uint32_t) mask;
    215        t = (t >> 6) * (uint32_t) mul;
    216        s = (int) (t >> 24) - 4 - 2;
    217        break;
    218    }
    219    case RTX_16X32: {
    220        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
    221        t         += *(const uint64_t *) l & mask;
    222        t = (t >> 6) * mul;
    223        s = (int) (t >> 56) - 4 - 8;
    224        break;
    225    }
    226    case RTX_32X16: {
    227        uint64_t t = *(const uint64_t *) a & mask;
    228        t         += *(const uint32_t *) l & (uint32_t) mask;
    229        t = (t >> 6) * mul;
    230        s = (int) (t >> 56) - 8 - 4;
    231        break;
    232    }
    233    case RTX_32X64: {
    234        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
    235        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
    236        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
    237        t *= mul;
    238        s = (int) (t >> 56) - 8 - 16;
    239        break;
    240    }
    241    case RTX_64X32: {
    242        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
    243        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
    244        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
    245        t *= mul;
    246        s = (int) (t >> 56) - 16 - 8;
    247        break;
    248    }
    249    case RTX_4X16: {
    250        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
    251        t         += *(const uint32_t *) l & (uint32_t) mask;
    252        t = (t >> 6) * (uint32_t) mul;
    253        s = (int) (t >> 24) - 1 - 4;
    254        break;
    255    }
    256    case RTX_16X4: {
    257        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
    258        t         += *(const uint8_t  *) l & (uint32_t) mask;
    259        t = (t >> 6) * (uint32_t) mul;
    260        s = (int) (t >> 24) - 4 - 1;
    261        break;
    262    }
    263    case RTX_8X32: {
    264        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
    265        t         += *(const uint64_t *) l & mask;
    266        t = (t >> 6) * mul;
    267        s = (int) (t >> 56) - 2 - 8;
    268        break;
    269    }
    270    case RTX_32X8: {
    271        uint64_t t = *(const uint64_t *) a & mask;
    272        t         += *(const uint16_t *) l & (uint32_t) mask;
    273        t = (t >> 6) * mul;
    274        s = (int) (t >> 56) - 8 - 2;
    275        break;
    276    }
    277    case RTX_16X64: {
    278        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
    279        t         += *(const uint64_t *) &l[0] & mask;
    280        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
    281        t *= mul;
    282        s = (int) (t >> 56) - 4 - 16;
    283        break;
    284    }
    285    case RTX_64X16: {
    286        uint64_t t = *(const uint64_t *) &a[0] & mask;
    287        t         += *(const uint32_t *) l & (uint32_t) mask;
    288        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
    289        t *= mul;
    290        s = (int) (t >> 56) - 16 - 4;
    291        break;
    292    }
    293    }
    294 
    295    return (s != 0) + (s > 0);
    296 }
    297 
    298 static inline unsigned get_lo_ctx(const uint8_t *const levels,
    299                                  const enum TxClass tx_class,
    300                                  unsigned *const hi_mag,
    301                                  const uint8_t (*const ctx_offsets)[5],
    302                                  const unsigned x, const unsigned y,
    303                                  const ptrdiff_t stride)
    304 {
    305    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
    306    unsigned offset;
    307    if (tx_class == TX_CLASS_2D) {
    308        mag += levels[1 * stride + 1];
    309        *hi_mag = mag;
    310        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
    311        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
    312    } else {
    313        mag += levels[0 * stride + 2];
    314        *hi_mag = mag;
    315        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
    316        offset = 26 + (y > 1 ? 10 : y * 5);
    317    }
    318    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
    319 }
    320 
    321 static int decode_coefs(Dav1dTaskContext *const t,
    322                        uint8_t *const a, uint8_t *const l,
    323                        const enum RectTxfmSize tx, const enum BlockSize bs,
    324                        const Av1Block *const b, const int intra,
    325                        const int plane, coef *cf,
    326                        enum TxfmType *const txtp, uint8_t *res_ctx)
    327 {
           /*
            * Entropy-decode and dequantize the coefficients of one transform
            * block into cf[].
            *
            * Stages, in order:
            *   1. all-zero ("skip") flag, context from get_skip_ctx()
            *   2. transform type, written to *txtp
            *   3. end-of-block position (eob)
            *   4. per-coefficient tokens, from position eob down to DC
            *   5. signs, Golomb-coded remainders and dequantization
            *   6. neighbour context byte, written to *res_ctx
            *
            * a, l    - neighbour context bytes, read by get_skip_ctx() and
            *           get_dc_sign_ctx()
            * txtp    - in/out: for inter chroma blocks the incoming value is
            *           passed to get_uv_inter_txtp(), so it must already
            *           hold the luma transform type; always written on return
            * res_ctx - out: 0x40 for an all-zero block, otherwise
            *           min(sum of tokens, 63) | dc_sign_level
            *
            * Returns -1 if the block is all-zero, otherwise eob (>= 0).
            *
            * During stage 4 each non-zero cf[] entry temporarily holds
            * (tok << 11) | next, where next is the index of the previously
            * decoded non-zero coefficient (0 terminates the chain). Stage 5
            * follows that chain and overwrites the entries with the final
            * signed, dequantized values.
            */
    328    Dav1dTileState *const ts = t->ts;
    329    const int chroma = !!plane;
    330    const Dav1dFrameContext *const f = t->f;
    331    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
    332    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
           /* The trailing "&& 0" keeps the debug printf()s disabled. */
    333    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
    334 
    335    if (dbg)
    336        printf("Start: r=%d\n", ts->msac.rng);
    337 
    338    // does this block have any non-zero coefficients
    339    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
    340    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
    341                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
    342    if (dbg)
    343        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
    344               t_dim->ctx, sctx, all_skip, ts->msac.rng);
    345    if (all_skip) {
    346        *res_ctx = 0x40;
    347        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
    348        return -1;
    349    }
    350 
    351    // transform type (chroma: derived, luma: explicitly coded)
    352    if (lossless) {
    353        assert(t_dim->max == TX_4X4);
    354        *txtp = WHT_WHT;
    355    } else if (t_dim->max + intra >= TX_64X64) {
    356        *txtp = DCT_DCT;
    357    } else if (chroma) {
    358        // inferred from either the luma txtp (inter) or a LUT (intra)
    359        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
    360                        get_uv_inter_txtp(t_dim, *txtp);
    361    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
    362        // In libaom, lossless is checked by a literal qidx == 0, but not all
    363        // such blocks are actually lossless. The remainder gets an implicit
    364        // transform type (for luma)
    365        *txtp = DCT_DCT;
    366    } else {
               /* Explicitly coded luma transform type: the symbol index is
                * mapped through dav1d_tx_types_per_set[] at a base offset
                * chosen per set (0, 5, 12 or 24). */
    367        unsigned idx;
    368        if (intra) {
    369            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
    370                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
    371            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
    372                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
    373                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
    374                *txtp = dav1d_tx_types_per_set[idx + 0];
    375            } else {
    376                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
    377                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
    378                *txtp = dav1d_tx_types_per_set[idx + 5];
    379            }
    380            if (dbg)
    381                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
    382                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
    383        } else {
    384            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
    385                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
    386                          ts->cdf.m.txtp_inter3[t_dim->min]);
    387                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
    388            } else if (t_dim->min == TX_16X16) {
    389                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
    390                          ts->cdf.m.txtp_inter2, 11);
    391                *txtp = dav1d_tx_types_per_set[idx + 12];
    392            } else {
    393                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
    394                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
    395                *txtp = dav1d_tx_types_per_set[idx + 24];
    396            }
    397            if (dbg)
    398                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
    399                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
    400        }
    401    }
    402 
    403    // find end-of-block (eob)
    404    int eob;
           /* lw/lh are clamped to TX_32X32 for everything that follows. */
    405    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
    406    const int tx2dszctx = slw + slh;
    407    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
    408    const int is_1d = tx_class != TX_CLASS_2D;
           /* One case per tx2dszctx value (0..6). Each picks the matching
            * eob_bin_<16 << sz> CDF and decodes a symbol with 4 + sz as the
            * last argument. The two largest sizes have no [is_1d] dimension. */
    409    switch (tx2dszctx) {
    410 #define case_sz(sz, bin, ns, is_1d) \
    411    case sz: { \
    412        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
    413        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
    414        break; \
    415    }
    416    case_sz(0,   16,  8, [is_1d]);
    417    case_sz(1,   32,  8, [is_1d]);
    418    case_sz(2,   64,  8, [is_1d]);
    419    case_sz(3,  128,  8, [is_1d]);
    420    case_sz(4,  256, 16, [is_1d]);
    421    case_sz(5,  512, 16,        );
    422    case_sz(6, 1024, 16,        );
    423 #undef case_sz
    424    }
    425    if (dbg)
    426        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
    427               16 << tx2dszctx, chroma, is_1d, eob, ts->msac.rng);
           /* For bins above 1 the symbol only gives the magnitude class; one
            * adaptive bit plus eob_bin raw bits complete the position. */
    428    if (eob > 1) {
    429        const int eob_bin = eob - 2;
    430        uint16_t *const eob_hi_bit_cdf =
    431            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
    432        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
    433        if (dbg)
    434            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
    435                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
    436        eob = ((eob_hi_bit | 2) << eob_bin) | dav1d_msac_decode_bools(&ts->msac, eob_bin);
    437        if (dbg)
    438            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
    439    }
    440    assert(eob >= 0);
    441 
    442    // base tokens
    443    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
    444    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
           /* rc: index into cf[] of the most recently decoded non-zero AC
            * coefficient (head of the chain walked during dequantization).
            * dc_tok: token of the DC coefficient. */
    445    unsigned rc, dc_tok;
    446 
    447    if (eob) {
    448        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
    449        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
    450 
    451        /* eob */
    452        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
    453        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
    454        int tok = eob_tok + 1;
    455        int level_tok = tok * 0x41;
    456        unsigned mag;
    457 
               /*
                * Token pass, instantiated once per transform class with a
                * literal tx_class argument. It relies on scan, shift, shift2,
                * mask, stride and lo_ctx_offsets being declared by the
                * enclosing case block, and ends with `break`, leaving the
                * switch below.
                *
                * Order: the coefficient at position eob first, then
                * positions eob-1 .. 1, then DC. Every non-zero token is
                * stored in cf[] as (tok << 11) | rc and rc is then updated
                * to that coefficient's index; zero tokens store 0. The level
                * map entry for each position is written as it is decoded so
                * that get_lo_ctx() can read it for later positions.
                */
    458 #define DECODE_COEFS_CLASS(tx_class) \
    459        unsigned x, y; \
    460        uint8_t *level; \
    461        if (tx_class == TX_CLASS_2D) \
    462            rc = scan[eob], x = rc >> shift, y = rc & mask; \
    463        else if (tx_class == TX_CLASS_H) \
    464            /* Transposing reduces the stride and padding requirements */ \
    465            x = eob & mask, y = eob >> shift, rc = eob; \
    466        else /* tx_class == TX_CLASS_V */ \
    467            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
    468        if (dbg) \
    469            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
    470                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
    471        if (eob_tok == 2) { \
    472            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
    473            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
    474            level_tok = tok + (3 << 6); \
    475            if (dbg) \
    476                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
    477                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
    478                       ts->msac.rng); \
    479        } \
    480        cf[rc] = tok << 11; \
    481        if (tx_class == TX_CLASS_2D) \
    482            level = levels + rc; \
    483        else \
    484            level = levels + x * stride + y; \
    485        *level = (uint8_t) level_tok; \
    486        for (int i = eob - 1; i > 0; i--) { /* ac */ \
    487            unsigned rc_i; \
    488            if (tx_class == TX_CLASS_2D) \
    489                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
    490            else if (tx_class == TX_CLASS_H) \
    491                x = i & mask, y = i >> shift, rc_i = i; \
    492            else /* tx_class == TX_CLASS_V */ \
    493                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
    494            assert(x < 32 && y < 32); \
    495            if (tx_class == TX_CLASS_2D) \
    496                level = levels + rc_i; \
    497            else \
    498                level = levels + x * stride + y; \
    499            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
    500            if (tx_class == TX_CLASS_2D) \
    501                y |= x; \
    502            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
    503            if (dbg) \
    504                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
    505                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
    506            if (tok == 3) { \
    507                mag &= 63; \
    508                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
    509                      (mag > 12 ? 6 : (mag + 1) >> 1); \
    510                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
    511                if (dbg) \
    512                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
    513                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
    514                           ts->msac.rng); \
    515                *level = (uint8_t) (tok + (3 << 6)); \
    516                cf[rc_i] = (tok << 11) | rc; \
    517                rc = rc_i; \
    518            } else { \
    519                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
    520                tok *= 0x17ff41; \
    521                *level = (uint8_t) tok; \
    522                /* tok ? (tok << 11) | rc : 0 */ \
    523                tok = (tok >> 9) & (rc + ~0x7ffu); \
    524                if (tok) rc = rc_i; \
    525                cf[rc_i] = tok; \
    526            } \
    527        } \
    528        /* dc */ \
    529        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
    530            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
    531        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
    532        if (dbg) \
    533            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
    534                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
    535        if (dc_tok == 3) { \
    536            if (tx_class == TX_CLASS_2D) \
    537                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
    538                      levels[1 * stride + 1]; \
    539            mag &= 63; \
    540            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
    541            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
    542            if (dbg) \
    543                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
    544                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
    545        } \
    546        break
    547 
               /* Each case sets up the level-map geometry for its class,
                * zeroes the part of the level map it will use (two extra rows
                * of padding included) and expands the token pass. The `break`
                * at the end of the macro prevents fall-through between
                * cases. */
    548        const uint16_t *scan;
    549        switch (tx_class) {
    550        case TX_CLASS_2D: {
    551            const unsigned nonsquare_tx = tx >= RTX_4X8;
    552            const uint8_t (*const lo_ctx_offsets)[5] =
    553                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
    554            scan = dav1d_scans[tx];
    555            const ptrdiff_t stride = 4 << slh;
    556            const unsigned shift = slh + 2, shift2 = 0;
    557            const unsigned mask = (4 << slh) - 1;
    558            memset(levels, 0, stride * ((4 << slw) + 2));
    559            DECODE_COEFS_CLASS(TX_CLASS_2D);
    560        }
    561        case TX_CLASS_H: {
    562            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
    563            const ptrdiff_t stride = 16;
    564            const unsigned shift = slh + 2, shift2 = 0;
    565            const unsigned mask = (4 << slh) - 1;
    566            memset(levels, 0, stride * ((4 << slh) + 2));
    567            DECODE_COEFS_CLASS(TX_CLASS_H);
    568        }
    569        case TX_CLASS_V: {
    570            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
    571            const ptrdiff_t stride = 16;
    572            const unsigned shift = slw + 2, shift2 = slh + 2;
    573            const unsigned mask = (4 << slw) - 1;
    574            memset(levels, 0, stride * ((4 << slw) + 2));
    575            DECODE_COEFS_CLASS(TX_CLASS_V);
    576        }
    577 #undef DECODE_COEFS_CLASS
    578        default: assert(0);
    579        }
    580    } else { // dc-only
    581        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
    582        dc_tok = 1 + tok_br;
    583        if (dbg)
    584            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
    585                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
    586        if (tok_br == 2) {
    587            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
    588            if (dbg)
    589                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
    590                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
    591        }
               /* No AC coefficients: the chain is empty. */
    592        rc = 0;
    593    }
    594 
    595    // residual and sign
    596    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
           /* The quantizer matrix is only looked up for transform types
            * below IDTX; NULL selects the non-qmatrix path. */
    597    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
    598    const int dq_shift = imax(0, t_dim->ctx - 2);
    599    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
    600    unsigned cul_level, dc_sign_level;
    601 
           /* DC is zero: no DC sign or dequantization is needed. Jump
            * straight into the matching AC loop via the labels placed on
            * the "if (rc)" bodies below. */
    602    if (!dc_tok) {
    603        cul_level = 0;
    604        dc_sign_level = 1 << 6;
    605        if (qm_tbl) goto ac_qm;
    606        goto ac_noqm;
    607    }
    608 
    609    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
    610    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
    611    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
    612    if (dbg)
    613        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
    614               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
    615 
    616    int dc_dq = dq_tbl[0];
           /* 0x80 when dc_sign == 0, 0x00 when dc_sign == 1. */
    617    dc_sign_level = (dc_sign - 1) & (2 << 6);
    618 
    619    if (qm_tbl) {
    620        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
    621 
               /* A token of 15 is an escape: the remainder is Golomb-coded
                * and added on top. */
    622        if (dc_tok == 15) {
    623            dc_tok = read_golomb(&ts->msac) + 15;
    624            if (dbg)
    625                printf("Post-dc_residual[%d->%d]: r=%d\n",
    626                       dc_tok - 15, dc_tok, ts->msac.rng);
    627 
    628            dc_tok &= 0xfffff;
    629            dc_dq = (dc_dq * dc_tok) & 0xffffff;
    630        } else {
    631            dc_dq *= dc_tok;
    632            assert(dc_dq <= 0xffffff);
    633        }
    634        cul_level = dc_tok;
    635        dc_dq >>= dq_shift;
               /* Negative values may reach cf_max + 1 in magnitude. */
    636        dc_dq = umin(dc_dq, cf_max + dc_sign);
    637        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
    638 
               /* Entered either by falling through with rc != 0, or through
                * the goto above when the DC token is zero. */
    639        if (rc) ac_qm: {
    640            const unsigned ac_dq = dq_tbl[1];
    641            do {
    642                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
    643                if (dbg)
    644                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
    645                const unsigned rc_tok = cf[rc];
    646                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
    647                int dq_sat;
    648 
    649                if (rc_tok >= (15 << 11)) {
    650                    tok = read_golomb(&ts->msac) + 15;
    651                    if (dbg)
    652                        printf("Post-residual[%d=%d->%d]: r=%d\n",
    653                               rc, tok - 15, tok, ts->msac.rng);
    654 
    655                    tok &= 0xfffff;
    656                    dq = (dq * tok) & 0xffffff;
    657                } else {
    658                    tok = rc_tok >> 11;
    659                    dq *= tok;
    660                    assert(dq <= 0xffffff);
    661                }
    662                cul_level += tok;
    663                dq >>= dq_shift;
    664                dq_sat = umin(dq, cf_max + sign);
    665                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
    666 
    667                rc = rc_tok & 0x3ff;
    668            } while (rc);
    669        }
    670    } else {
    671        // non-qmatrix is the common case and allows for additional optimizations
    672        if (dc_tok == 15) {
    673            dc_tok = read_golomb(&ts->msac) + 15;
    674            if (dbg)
    675                printf("Post-dc_residual[%d->%d]: r=%d\n",
    676                       dc_tok - 15, dc_tok, ts->msac.rng);
    677 
    678            dc_tok &= 0xfffff;
    679            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
    680            dc_dq = umin(dc_dq, cf_max + dc_sign);
    681        } else {
    682            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
    683            assert(dc_dq <= cf_max);
    684        }
    685        cul_level = dc_tok;
    686        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
    687 
               /* Same entry convention as the ac_qm label in the other
                * branch. */
    688        if (rc) ac_noqm: {
    689            const unsigned ac_dq = dq_tbl[1];
    690            do {
    691                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
    692                if (dbg)
    693                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
    694                const unsigned rc_tok = cf[rc];
    695                unsigned tok;
    696                int dq;
    697 
    698                // residual
    699                if (rc_tok >= (15 << 11)) {
    700                    tok = read_golomb(&ts->msac) + 15;
    701                    if (dbg)
    702                        printf("Post-residual[%d=%d->%d]: r=%d\n",
    703                               rc, tok - 15, tok, ts->msac.rng);
    704 
    705                    // coefficient parsing, see 5.11.39
    706                    tok &= 0xfffff;
    707 
    708                    // dequant, see 7.12.3
    709                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
    710                    dq = umin(dq, cf_max + sign);
    711                } else {
    712                    // cannot exceed cf_max, so we can avoid the clipping
    713                    tok = rc_tok >> 11;
    714                    dq = ((ac_dq * tok) >> dq_shift);
    715                    assert(dq <= cf_max);
    716                }
    717                cul_level += tok;
    718                cf[rc] = (coef) (sign ? -dq : dq);
    719 
    720                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
    721            } while (rc);
    722        }
    723    }
    724 
    725    // context
           /* cul_level is the sum of all decoded tokens. It is capped to six
            * bits and combined with dc_sign_level, which occupies bits 6-7. */
    726    *res_ctx = umin(cul_level, 63) | dc_sign_level;
    727 
    728    return eob;
    729 }
    730 
    731 static void read_coef_tree(Dav1dTaskContext *const t,
    732                           const enum BlockSize bs, const Av1Block *const b,
    733                           const enum RectTxfmSize ytx, const int depth,
    734                           const uint16_t *const tx_split,
    735                           const int x_off, const int y_off, pixel *dst)
    736 {
    737    const Dav1dFrameContext *const f = t->f;
    738    Dav1dTileState *const ts = t->ts;
    739    const Dav1dDSPContext *const dsp = f->dsp;
    740    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
    741    const int txw = t_dim->w, txh = t_dim->h;
    742 
    743    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
    744     * be splitted. Aviods an undefined left shift. */
    745    if (depth < 2 && tx_split[depth] &&
    746        tx_split[depth] & (1 << (y_off * 4 + x_off)))
    747    {
    748        const enum RectTxfmSize sub = t_dim->sub;
    749        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
    750        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
    751 
    752        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
    753                       x_off * 2 + 0, y_off * 2 + 0, dst);
    754        t->bx += txsw;
    755        if (txw >= txh && t->bx < f->bw)
    756            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
    757                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
    758        t->bx -= txsw;
    759        t->by += txsh;
    760        if (txh >= txw && t->by < f->bh) {
    761            if (dst)
    762                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
    763            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
    764                           x_off * 2 + 0, y_off * 2 + 1, dst);
    765            t->bx += txsw;
    766            if (txw >= txh && t->bx < f->bw)
    767                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
    768                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
    769            t->bx -= txsw;
    770        }
    771        t->by -= txsh;
    772    } else {
    773        const int bx4 = t->bx & 31, by4 = t->by & 31;
    774        enum TxfmType txtp;
    775        uint8_t cf_ctx;
    776        int eob;
    777        coef *cf;
    778 
    779        if (t->frame_thread.pass) {
    780            const int p = t->frame_thread.pass & 1;
    781            assert(ts->frame_thread[p].cf);
    782            cf = ts->frame_thread[p].cf;
    783            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
    784        } else {
    785            cf = bitfn(t->cf);
    786        }
    787        if (t->frame_thread.pass != 2) {
    788            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
    789                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
    790            if (DEBUG_BLOCK_INFO)
    791                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
    792                       ytx, txtp, eob, ts->msac.rng);
    793            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
    794            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
    795 #define set_ctx(rep_macro) \
    796            for (int y = 0; y < txh; y++) { \
    797                rep_macro(txtp_map, 0, txtp); \
    798                txtp_map += 32; \
    799            }
    800            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
    801            case_set_upto16(t_dim->lw);
    802 #undef set_ctx
    803            if (t->frame_thread.pass == 1)
    804                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
    805        } else {
    806            const int cbi = *ts->frame_thread[0].cbi++;
    807            eob  = cbi >> 5;
    808            txtp = cbi & 0x1f;
    809        }
    810        if (!(t->frame_thread.pass & 1)) {
    811            assert(dst);
    812            if (eob >= 0) {
    813                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
    814                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
    815                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
    816                                              HIGHBD_CALL_SUFFIX);
    817                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
    818                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
    819            }
    820        }
    821    }
    822 }
    823 
    824 void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
    825                                    const enum BlockSize bs, const Av1Block *const b)
    826 {
    827    const Dav1dFrameContext *const f = t->f;
    828    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    829    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    830    const int bx4 = t->bx & 31, by4 = t->by & 31;
    831    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    832    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    833    const int bw4 = b_dim[0], bh4 = b_dim[1];
    834    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
    835    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
    836                           (bw4 > ss_hor || t->bx & 1) &&
    837                           (bh4 > ss_ver || t->by & 1);
    838 
    839    if (b->skip) {
    840        BlockContext *const a = t->a;
    841        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
    842        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
    843        if (has_chroma) {
    844            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
    845            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
    846            memset_cw(&a->ccoef[0][cbx4], 0x40);
    847            memset_cw(&a->ccoef[1][cbx4], 0x40);
    848            memset_ch(&t->l.ccoef[0][cby4], 0x40);
    849            memset_ch(&t->l.ccoef[1][cby4], 0x40);
    850        }
    851        return;
    852    }
    853 
    854    Dav1dTileState *const ts = t->ts;
    855    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    856    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    857    assert(t->frame_thread.pass == 1);
    858    assert(!b->skip);
    859    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
    860    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
    861    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
    862 
    863    for (int init_y = 0; init_y < h4; init_y += 16) {
    864        const int sub_h4 = imin(h4, 16 + init_y);
    865        for (int init_x = 0; init_x < w4; init_x += 16) {
    866            const int sub_w4 = imin(w4, init_x + 16);
    867            int y_off = !!init_y, y, x;
    868            for (y = init_y, t->by += init_y; y < sub_h4;
    869                 y += t_dim->h, t->by += t_dim->h, y_off++)
    870            {
    871                int x_off = !!init_x;
    872                for (x = init_x, t->bx += init_x; x < sub_w4;
    873                     x += t_dim->w, t->bx += t_dim->w, x_off++)
    874                {
    875                    if (!b->intra) {
    876                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
    877                                       x_off, y_off, NULL);
    878                    } else {
    879                        uint8_t cf_ctx = 0x40;
    880                        enum TxfmType txtp;
    881                        const int eob =
    882                            decode_coefs(t, &t->a->lcoef[bx4 + x],
    883                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
    884                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
    885                        if (DEBUG_BLOCK_INFO)
    886                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
    887                                   b->tx, txtp, eob, ts->msac.rng);
    888                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
    889                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
    890                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
    891                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
    892                    }
    893                }
    894                t->bx -= x;
    895            }
    896            t->by -= y;
    897 
    898            if (!has_chroma) continue;
    899 
    900            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
    901            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
    902            for (int pl = 0; pl < 2; pl++) {
    903                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
    904                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
    905                {
    906                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
    907                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
    908                    {
    909                        uint8_t cf_ctx = 0x40;
    910                        enum TxfmType txtp;
    911                        if (!b->intra)
    912                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
    913                                                        bx4 + (x << ss_hor)];
    914                        const int eob =
    915                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
    916                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
    917                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
    918                                         &txtp, &cf_ctx);
    919                        if (DEBUG_BLOCK_INFO)
    920                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
    921                                   "txtp=%d,eob=%d]: r=%d\n",
    922                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
    923                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
    924                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
    925                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
    926                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
    927                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
    928                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
    929                    }
    930                    t->bx -= x << ss_hor;
    931                }
    932                t->by -= y << ss_ver;
    933            }
    934        }
    935    }
    936 }
    937 
    938 static int mc(Dav1dTaskContext *const t,
    939              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
    940              const int bw4, const int bh4,
    941              const int bx, const int by, const int pl,
    942              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
    943              const enum Filter2d filter_2d)
    944 {
    945    assert((dst8 != NULL) ^ (dst16 != NULL));
    946    const Dav1dFrameContext *const f = t->f;
    947    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    948    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    949    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    950    const int mvx = mv.x, mvy = mv.y;
    951    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
    952    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    953    const pixel *ref;
    954 
    955    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
    956        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
    957        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
    958        int w, h;
    959 
    960        if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc
    961            w = (f->cur.p.w + ss_hor) >> ss_hor;
    962            h = (f->cur.p.h + ss_ver) >> ss_ver;
    963        } else {
    964            w = f->bw * 4 >> ss_hor;
    965            h = f->bh * 4 >> ss_ver;
    966        }
    967        if (dx < !!mx * 3 || dy < !!my * 3 ||
    968            dx + bw4 * h_mul + !!mx * 4 > w ||
    969            dy + bh4 * v_mul + !!my * 4 > h)
    970        {
    971            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
    972            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
    973                                w, h, dx - !!mx * 3, dy - !!my * 3,
    974                                emu_edge_buf, 192 * sizeof(pixel),
    975                                refp->p.data[pl], ref_stride);
    976            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
    977            ref_stride = 192 * sizeof(pixel);
    978        } else {
    979            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
    980        }
    981 
    982        if (dst8 != NULL) {
    983            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
    984                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver
    985                                     HIGHBD_CALL_SUFFIX);
    986        } else {
    987            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
    988                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver
    989                                      HIGHBD_CALL_SUFFIX);
    990        }
    991    } else {
    992        assert(refp != &f->sr_cur);
    993 
    994        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
    995        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
    996 #define scale_mv(res, val, scale) do { \
    997            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
    998            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
    999        } while (0)
   1000        int pos_y, pos_x;
   1001        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
   1002        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
   1003 #undef scale_mv
   1004        const int left = pos_x >> 10;
   1005        const int top = pos_y >> 10;
   1006        const int right =
   1007            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
   1008        const int bottom =
   1009            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
   1010 
   1011        if (DEBUG_BLOCK_INFO)
   1012            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
   1013                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
   1014                   right-left, bottom-top,
   1015                   f->svc[refidx][0].step, f->svc[refidx][1].step);
   1016 
   1017        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
   1018        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
   1019        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
   1020            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
   1021            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
   1022                                w, h, left - 3, top - 3,
   1023                                emu_edge_buf, 320 * sizeof(pixel),
   1024                                refp->p.data[pl], ref_stride);
   1025            ref = &emu_edge_buf[320 * 3 + 3];
   1026            ref_stride = 320 * sizeof(pixel);
   1027            if (DEBUG_BLOCK_INFO) printf("Emu\n");
   1028        } else {
   1029            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
   1030        }
   1031 
   1032        if (dst8 != NULL) {
   1033            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
   1034                                            bw4 * h_mul, bh4 * v_mul,
   1035                                            pos_x & 0x3ff, pos_y & 0x3ff,
   1036                                            f->svc[refidx][0].step,
   1037                                            f->svc[refidx][1].step
   1038                                            HIGHBD_CALL_SUFFIX);
   1039        } else {
   1040            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
   1041                                             bw4 * h_mul, bh4 * v_mul,
   1042                                             pos_x & 0x3ff, pos_y & 0x3ff,
   1043                                             f->svc[refidx][0].step,
   1044                                             f->svc[refidx][1].step
   1045                                             HIGHBD_CALL_SUFFIX);
   1046        }
   1047    }
   1048 
   1049    return 0;
   1050 }
   1051 
   1052 static int obmc(Dav1dTaskContext *const t,
   1053                pixel *const dst, const ptrdiff_t dst_stride,
   1054                const uint8_t *const b_dim, const int pl,
   1055                const int bx4, const int by4, const int w4, const int h4)
   1056 {
   1057    assert(!(t->bx & 1) && !(t->by & 1));
   1058    const Dav1dFrameContext *const f = t->f;
   1059    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
   1060    pixel *const lap = bitfn(t->scratch.lap);
   1061    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   1062    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
   1063    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
   1064    int res;
   1065 
   1066    if (t->by > t->ts->tiling.row_start &&
   1067        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
   1068    {
   1069        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
   1070            // only odd blocks are considered for overlap handling, hence +1
   1071            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
   1072            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
   1073            const int step4 = iclip(a_b_dim[0], 2, 16);
   1074 
   1075            if (a_r->ref.ref[0] > 0) {
   1076                const int ow4 = imin(step4, b_dim[0]);
   1077                const int oh4 = imin(b_dim[1], 16) >> 1;
   1078                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
   1079                         t->bx + x, t->by, pl, a_r->mv.mv[0],
   1080                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
   1081                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
   1082                if (res) return res;
   1083                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
   1084                                   h_mul * ow4, v_mul * oh4);
   1085                i++;
   1086            }
   1087            x += step4;
   1088        }
   1089    }
   1090 
   1091    if (t->bx > t->ts->tiling.col_start)
   1092        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
   1093            // only odd blocks are considered for overlap handling, hence +1
   1094            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
   1095            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
   1096            const int step4 = iclip(l_b_dim[1], 2, 16);
   1097 
   1098            if (l_r->ref.ref[0] > 0) {
   1099                const int ow4 = imin(b_dim[0], 16) >> 1;
   1100                const int oh4 = imin(step4, b_dim[1]);
   1101                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
   1102                         t->bx, t->by + y, pl, l_r->mv.mv[0],
   1103                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
   1104                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
   1105                if (res) return res;
   1106                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
   1107                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
   1108                i++;
   1109            }
   1110            y += step4;
   1111        }
   1112    return 0;
   1113 }
   1114 
   1115 static int warp_affine(Dav1dTaskContext *const t,
   1116                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
   1117                       const uint8_t *const b_dim, const int pl,
   1118                       const Dav1dThreadPicture *const refp,
   1119                       const Dav1dWarpedMotionParams *const wmp)
   1120 {
   1121    assert((dst8 != NULL) ^ (dst16 != NULL));
   1122    const Dav1dFrameContext *const f = t->f;
   1123    const Dav1dDSPContext *const dsp = f->dsp;
   1124    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   1125    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
   1126    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
   1127    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
   1128    const int32_t *const mat = wmp->matrix;
   1129    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
   1130    const int height = (refp->p.p.h + ss_ver) >> ss_ver;
   1131 
   1132    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
   1133        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
   1134        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
   1135        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
   1136        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
   1137            // calculate transformation relative to center of 8x8 block in
   1138            // luma pixel units
   1139            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
   1140            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
   1141            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
   1142 
   1143            const int dx = (int) (mvx >> 16) - 4;
   1144            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
   1145                                                   wmp->u.p.beta  * 7) & ~0x3f;
   1146            const int dy = (int) (mvy >> 16) - 4;
   1147            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
   1148                                                   wmp->u.p.delta * 4) & ~0x3f;
   1149 
   1150            const pixel *ref_ptr;
   1151            ptrdiff_t ref_stride = refp->p.stride[!!pl];
   1152 
   1153            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
   1154                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
   1155                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
   1156                                    emu_edge_buf, 32 * sizeof(pixel),
   1157                                    refp->p.data[pl], ref_stride);
   1158                ref_ptr = &emu_edge_buf[32 * 3 + 3];
   1159                ref_stride = 32 * sizeof(pixel);
   1160            } else {
   1161                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
   1162            }
   1163            if (dst16 != NULL)
   1164                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
   1165                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
   1166            else
   1167                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
   1168                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
   1169        }
   1170        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
   1171        else      dst16 += 8 * dstride;
   1172    }
   1173    return 0;
   1174 }
   1175 
   1176 void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
   1177                                 const enum EdgeFlags intra_edge_flags,
   1178                                 const Av1Block *const b)
   1179 {
   1180    Dav1dTileState *const ts = t->ts;
   1181    const Dav1dFrameContext *const f = t->f;
   1182    const Dav1dDSPContext *const dsp = f->dsp;
   1183    const int bx4 = t->bx & 31, by4 = t->by & 31;
   1184    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   1185    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
   1186    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
   1187    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
   1188    const int bw4 = b_dim[0], bh4 = b_dim[1];
   1189    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
   1190    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
   1191    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
   1192                           (bw4 > ss_hor || t->bx & 1) &&
   1193                           (bh4 > ss_ver || t->by & 1);
   1194    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
   1195    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
   1196 
   1197    // coefficient coding
   1198    pixel *const edge = bitfn(t->scratch.edge) + 128;
   1199    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
   1200 
   1201    const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
   1202 
   1203    for (int init_y = 0; init_y < h4; init_y += 16) {
   1204        const int sub_h4 = imin(h4, 16 + init_y);
   1205        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
   1206        for (int init_x = 0; init_x < w4; init_x += 16) {
   1207            if (b->pal_sz[0]) {
   1208                pixel *dst = ((pixel *) f->cur.data[0]) +
   1209                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
   1210                const uint8_t *pal_idx;
   1211                if (t->frame_thread.pass) {
   1212                    const int p = t->frame_thread.pass & 1;
   1213                    assert(ts->frame_thread[p].pal_idx);
   1214                    pal_idx = ts->frame_thread[p].pal_idx;
   1215                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
   1216                } else {
   1217                    pal_idx = t->scratch.pal_idx_y;
   1218                }
   1219                const pixel *const pal = t->frame_thread.pass ?
   1220                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
   1221                                        ((t->bx >> 1) + (t->by & 1))][0] :
   1222                    bytefn(t->scratch.pal)[0];
   1223                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
   1224                                       pal_idx, bw4 * 4, bh4 * 4);
   1225                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1226                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
   1227                             bw4 * 4, bh4 * 4, "y-pal-pred");
   1228            }
   1229 
   1230            const int intra_flags = (sm_flag(t->a, bx4) |
   1231                                     sm_flag(&t->l, by4) |
   1232                                     intra_edge_filter_flag);
   1233            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
   1234                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
   1235            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
   1236                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
   1237            int y, x;
   1238            const int sub_w4 = imin(w4, init_x + 16);
   1239            for (y = init_y, t->by += init_y; y < sub_h4;
   1240                 y += t_dim->h, t->by += t_dim->h)
   1241            {
   1242                pixel *dst = ((pixel *) f->cur.data[0]) +
   1243                               4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
   1244                                    t->bx + init_x);
   1245                for (x = init_x, t->bx += init_x; x < sub_w4;
   1246                     x += t_dim->w, t->bx += t_dim->w)
   1247                {
   1248                    if (b->pal_sz[0]) goto skip_y_pred;
   1249 
   1250                    int angle = b->y_angle;
   1251                    const enum EdgeFlags edge_flags =
   1252                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
   1253                             0 : EDGE_I444_TOP_HAS_RIGHT) |
   1254                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
   1255                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
   1256                    const pixel *top_sb_edge = NULL;
   1257                    if (!(t->by & (f->sb_step - 1))) {
   1258                        top_sb_edge = f->ipred_edge[0];
   1259                        const int sby = t->by >> f->sb_shift;
   1260                        top_sb_edge += f->sb128w * 128 * (sby - 1);
   1261                    }
   1262                    const enum IntraPredMode m =
   1263                        bytefn(dav1d_prepare_intra_edges)(t->bx,
   1264                                                          t->bx > ts->tiling.col_start,
   1265                                                          t->by,
   1266                                                          t->by > ts->tiling.row_start,
   1267                                                          ts->tiling.col_end,
   1268                                                          ts->tiling.row_end,
   1269                                                          edge_flags, dst,
   1270                                                          f->cur.stride[0], top_sb_edge,
   1271                                                          b->y_mode, &angle,
   1272                                                          t_dim->w, t_dim->h,
   1273                                                          f->seq_hdr->intra_edge_filter,
   1274                                                          edge HIGHBD_CALL_SUFFIX);
   1275                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
   1276                                             t_dim->w * 4, t_dim->h * 4,
   1277                                             angle | intra_flags,
   1278                                             4 * f->bw - 4 * t->bx,
   1279                                             4 * f->bh - 4 * t->by
   1280                                             HIGHBD_CALL_SUFFIX);
   1281 
   1282                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
   1283                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
   1284                                 t_dim->h * 4, 2, "l");
   1285                        hex_dump(edge, 0, 1, 1, "tl");
   1286                        hex_dump(edge + 1, t_dim->w * 4,
   1287                                 t_dim->w * 4, 2, "t");
   1288                        hex_dump(dst, f->cur.stride[0],
   1289                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
   1290                    }
   1291 
   1292                skip_y_pred: {}
   1293                    if (!b->skip) {
   1294                        coef *cf;
   1295                        int eob;
   1296                        enum TxfmType txtp;
   1297                        if (t->frame_thread.pass) {
   1298                            const int p = t->frame_thread.pass & 1;
   1299                            const int cbi = *ts->frame_thread[p].cbi++;
   1300                            cf = ts->frame_thread[p].cf;
   1301                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
   1302                            eob  = cbi >> 5;
   1303                            txtp = cbi & 0x1f;
   1304                        } else {
   1305                            uint8_t cf_ctx;
   1306                            cf = bitfn(t->cf);
   1307                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
   1308                                               &t->l.lcoef[by4 + y], b->tx, bs,
   1309                                               b, 1, 0, cf, &txtp, &cf_ctx);
   1310                            if (DEBUG_BLOCK_INFO)
   1311                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
   1312                                       b->tx, txtp, eob, ts->msac.rng);
   1313                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
   1314                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
   1315                        }
   1316                        if (eob >= 0) {
   1317                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1318                                coef_dump(cf, imin(t_dim->h, 8) * 4,
   1319                                          imin(t_dim->w, 8) * 4, 3, "dq");
   1320                            dsp->itx.itxfm_add[b->tx]
   1321                                              [txtp](dst,
   1322                                                     f->cur.stride[0],
   1323                                                     cf, eob HIGHBD_CALL_SUFFIX);
   1324                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1325                                hex_dump(dst, f->cur.stride[0],
   1326                                         t_dim->w * 4, t_dim->h * 4, "recon");
   1327                        }
   1328                    } else if (!t->frame_thread.pass) {
   1329                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
   1330                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
   1331                    }
   1332                    dst += 4 * t_dim->w;
   1333                }
   1334                t->bx -= x;
   1335            }
   1336            t->by -= y;
   1337 
   1338            if (!has_chroma) continue;
   1339 
   1340            const ptrdiff_t stride = f->cur.stride[1];
   1341 
   1342            if (b->uv_mode == CFL_PRED) {
   1343                assert(!init_x && !init_y);
   1344 
   1345                int16_t *const ac = t->scratch.ac;
   1346                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
   1347                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
   1348                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
   1349                                              (t->by >> ss_ver) * PXSTRIDE(stride));
   1350                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
   1351                                           ((pixel *) f->cur.data[2]) + uv_off };
   1352 
   1353                const int furthest_r =
   1354                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
   1355                const int furthest_b =
   1356                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
   1357                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
   1358                                                         cbw4 - (furthest_r >> ss_hor),
   1359                                                         cbh4 - (furthest_b >> ss_ver),
   1360                                                         cbw4 * 4, cbh4 * 4);
   1361                for (int pl = 0; pl < 2; pl++) {
   1362                    if (!b->cfl_alpha[pl]) continue;
   1363                    int angle = 0;
   1364                    const pixel *top_sb_edge = NULL;
   1365                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
   1366                        top_sb_edge = f->ipred_edge[pl + 1];
   1367                        const int sby = t->by >> f->sb_shift;
   1368                        top_sb_edge += f->sb128w * 128 * (sby - 1);
   1369                    }
   1370                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
   1371                    const int xstart = ts->tiling.col_start >> ss_hor;
   1372                    const int ystart = ts->tiling.row_start >> ss_ver;
   1373                    const enum IntraPredMode m =
   1374                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
   1375                                                          ypos, ypos > ystart,
   1376                                                          ts->tiling.col_end >> ss_hor,
   1377                                                          ts->tiling.row_end >> ss_ver,
   1378                                                          0, uv_dst[pl], stride,
   1379                                                          top_sb_edge, DC_PRED, &angle,
   1380                                                          uv_t_dim->w, uv_t_dim->h, 0,
   1381                                                          edge HIGHBD_CALL_SUFFIX);
   1382                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
   1383                                           uv_t_dim->w * 4,
   1384                                           uv_t_dim->h * 4,
   1385                                           ac, b->cfl_alpha[pl]
   1386                                           HIGHBD_CALL_SUFFIX);
   1387                }
   1388                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
   1389                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
   1390                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
   1391                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
   1392                }
   1393            } else if (b->pal_sz[1]) {
   1394                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
   1395                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
   1396                const pixel (*pal)[8];
   1397                const uint8_t *pal_idx;
   1398                if (t->frame_thread.pass) {
   1399                    const int p = t->frame_thread.pass & 1;
   1400                    assert(ts->frame_thread[p].pal_idx);
   1401                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
   1402                                              ((t->bx >> 1) + (t->by & 1))];
   1403                    pal_idx = ts->frame_thread[p].pal_idx;
   1404                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
   1405                } else {
   1406                    pal = bytefn(t->scratch.pal);
   1407                    pal_idx = t->scratch.pal_idx_uv;
   1408                }
   1409 
   1410                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
   1411                                       f->cur.stride[1], pal[1],
   1412                                       pal_idx, cbw4 * 4, cbh4 * 4);
   1413                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
   1414                                       f->cur.stride[1], pal[2],
   1415                                       pal_idx, cbw4 * 4, cbh4 * 4);
   1416                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
   1417                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
   1418                             PXSTRIDE(f->cur.stride[1]),
   1419                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
   1420                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
   1421                             PXSTRIDE(f->cur.stride[1]),
   1422                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
   1423                }
   1424            }
   1425 
   1426            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
   1427                                 sm_uv_flag(&t->l, cby4);
   1428            const int uv_sb_has_tr =
   1429                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
   1430                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
   1431            const int uv_sb_has_bl =
   1432                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
   1433                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
   1434            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
   1435            for (int pl = 0; pl < 2; pl++) {
   1436                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
   1437                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
   1438                {
   1439                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
   1440                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
   1441                                        ((t->bx + init_x) >> ss_hor));
   1442                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
   1443                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
   1444                    {
   1445                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
   1446                            b->pal_sz[1])
   1447                        {
   1448                            goto skip_uv_pred;
   1449                        }
   1450 
   1451                        int angle = b->uv_angle;
   1452                        // this probably looks weird because we're using
   1453                        // luma flags in a chroma loop, but that's because
   1454                        // prepare_intra_edges() expects luma flags as input
   1455                        const enum EdgeFlags edge_flags =
   1456                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
   1457                              (x + uv_t_dim->w >= sub_cw4)) ?
   1458                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
   1459                            ((x > (init_x >> ss_hor) ||
   1460                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
   1461                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
   1462                        const pixel *top_sb_edge = NULL;
   1463                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
   1464                            top_sb_edge = f->ipred_edge[1 + pl];
   1465                            const int sby = t->by >> f->sb_shift;
   1466                            top_sb_edge += f->sb128w * 128 * (sby - 1);
   1467                        }
   1468                        const enum IntraPredMode uv_mode =
   1469                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
   1470                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
   1471                        const int xstart = ts->tiling.col_start >> ss_hor;
   1472                        const int ystart = ts->tiling.row_start >> ss_ver;
   1473                        const enum IntraPredMode m =
   1474                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
   1475                                                              ypos, ypos > ystart,
   1476                                                              ts->tiling.col_end >> ss_hor,
   1477                                                              ts->tiling.row_end >> ss_ver,
   1478                                                              edge_flags, dst, stride,
   1479                                                              top_sb_edge, uv_mode,
   1480                                                              &angle, uv_t_dim->w,
   1481                                                              uv_t_dim->h,
   1482                                                              f->seq_hdr->intra_edge_filter,
   1483                                                              edge HIGHBD_CALL_SUFFIX);
   1484                        angle |= intra_edge_filter_flag;
   1485                        dsp->ipred.intra_pred[m](dst, stride, edge,
   1486                                                 uv_t_dim->w * 4,
   1487                                                 uv_t_dim->h * 4,
   1488                                                 angle | sm_uv_fl,
   1489                                                 (4 * f->bw + ss_hor -
   1490                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
   1491                                                 (4 * f->bh + ss_ver -
   1492                                                  4 * (t->by & ~ss_ver)) >> ss_ver
   1493                                                 HIGHBD_CALL_SUFFIX);
   1494                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
   1495                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
   1496                                     uv_t_dim->h * 4, 2, "l");
   1497                            hex_dump(edge, 0, 1, 1, "tl");
   1498                            hex_dump(edge + 1, uv_t_dim->w * 4,
   1499                                     uv_t_dim->w * 4, 2, "t");
   1500                            hex_dump(dst, stride, uv_t_dim->w * 4,
   1501                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
   1502                        }
   1503 
   1504                    skip_uv_pred: {}
   1505                        if (!b->skip) {
   1506                            enum TxfmType txtp;
   1507                            int eob;
   1508                            coef *cf;
   1509                            if (t->frame_thread.pass) {
   1510                                const int p = t->frame_thread.pass & 1;
   1511                                const int cbi = *ts->frame_thread[p].cbi++;
   1512                                cf = ts->frame_thread[p].cf;
   1513                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
   1514                                eob  = cbi >> 5;
   1515                                txtp = cbi & 0x1f;
   1516                            } else {
   1517                                uint8_t cf_ctx;
   1518                                cf = bitfn(t->cf);
   1519                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
   1520                                                   &t->l.ccoef[pl][cby4 + y],
   1521                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
   1522                                                   &txtp, &cf_ctx);
   1523                                if (DEBUG_BLOCK_INFO)
   1524                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
   1525                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
   1526                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
   1527                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
   1528                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
   1529                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
   1530                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
   1531                            }
   1532                            if (eob >= 0) {
   1533                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1534                                    coef_dump(cf, uv_t_dim->h * 4,
   1535                                              uv_t_dim->w * 4, 3, "dq");
   1536                                dsp->itx.itxfm_add[b->uvtx]
   1537                                                  [txtp](dst, stride,
   1538                                                         cf, eob HIGHBD_CALL_SUFFIX);
   1539                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1540                                    hex_dump(dst, stride, uv_t_dim->w * 4,
   1541                                             uv_t_dim->h * 4, "recon");
   1542                            }
   1543                        } else if (!t->frame_thread.pass) {
   1544                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
   1545                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
   1546                        }
   1547                        dst += uv_t_dim->w * 4;
   1548                    }
   1549                    t->bx -= x << ss_hor;
   1550                }
   1551                t->by -= y << ss_ver;
   1552            }
   1553        }
   1554    }
   1555 }
   1556 
   1557 int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
   1558                                const Av1Block *const b)
   1559 {
   1560    Dav1dTileState *const ts = t->ts;
   1561    const Dav1dFrameContext *const f = t->f;
   1562    const Dav1dDSPContext *const dsp = f->dsp;
   1563    const int bx4 = t->bx & 31, by4 = t->by & 31;
   1564    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   1565    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
   1566    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
   1567    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
   1568    const int bw4 = b_dim[0], bh4 = b_dim[1];
   1569    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
   1570    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
   1571                           (bw4 > ss_hor || t->bx & 1) &&
   1572                           (bh4 > ss_ver || t->by & 1);
   1573    const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
   1574                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
   1575    int res;
   1576 
   1577    // prediction
   1578    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
   1579    pixel *dst = ((pixel *) f->cur.data[0]) +
   1580        4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
   1581    const ptrdiff_t uvdstoff =
   1582        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
   1583    if (IS_KEY_OR_INTRA(f->frame_hdr)) {
   1584        // intrabc
   1585        assert(!f->frame_hdr->super_res.enabled);
   1586        res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
   1587                 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
   1588        if (res) return res;
   1589        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
   1590            res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
   1591                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
   1592                     t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
   1593                     &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
   1594            if (res) return res;
   1595        }
   1596    } else if (b->comp_type == COMP_INTER_NONE) {
   1597        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
   1598        const enum Filter2d filter_2d = b->filter2d;
   1599 
   1600        if (imin(bw4, bh4) > 1 &&
   1601            ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
   1602             (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
   1603        {
   1604            res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
   1605                              b->motion_mode == MM_WARP ? &t->warpmv :
   1606                                  &f->frame_hdr->gmv[b->ref[0]]);
   1607            if (res) return res;
   1608        } else {
   1609            res = mc(t, dst, NULL, f->cur.stride[0],
   1610                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
   1611            if (res) return res;
   1612            if (b->motion_mode == MM_OBMC) {
   1613                res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
   1614                if (res) return res;
   1615            }
   1616        }
   1617        if (b->interintra_type) {
   1618            pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
   1619            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
   1620                                   SMOOTH_PRED : b->interintra_mode;
   1621            pixel *const tmp = bitfn(t->scratch.interintra);
   1622            int angle = 0;
   1623            const pixel *top_sb_edge = NULL;
   1624            if (!(t->by & (f->sb_step - 1))) {
   1625                top_sb_edge = f->ipred_edge[0];
   1626                const int sby = t->by >> f->sb_shift;
   1627                top_sb_edge += f->sb128w * 128 * (sby - 1);
   1628            }
   1629            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
   1630                                                  t->by, t->by > ts->tiling.row_start,
   1631                                                  ts->tiling.col_end, ts->tiling.row_end,
   1632                                                  0, dst, f->cur.stride[0], top_sb_edge,
   1633                                                  m, &angle, bw4, bh4, 0, tl_edge
   1634                                                  HIGHBD_CALL_SUFFIX);
   1635            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
   1636                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
   1637                                     HIGHBD_CALL_SUFFIX);
   1638            dsp->mc.blend(dst, f->cur.stride[0], tmp,
   1639                          bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
   1640        }
   1641 
   1642        if (!has_chroma) goto skip_inter_chroma_pred;
   1643 
   1644        // sub8x8 derivation
   1645        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
   1646        refmvs_block *const *r;
   1647        if (is_sub8x8) {
   1648            assert(ss_hor == 1);
   1649            r = &t->rt.r[(t->by & 31) + 5];
   1650            if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
   1651            if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
   1652            if (bw4 == 1 && bh4 == ss_ver)
   1653                is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
   1654        }
   1655 
   1656        // chroma prediction
   1657        if (is_sub8x8) {
   1658            assert(ss_hor == 1);
   1659            ptrdiff_t h_off = 0, v_off = 0;
   1660            if (bw4 == 1 && bh4 == ss_ver) {
   1661                for (int pl = 0; pl < 2; pl++) {
   1662                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
   1663                             NULL, f->cur.stride[1],
   1664                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
   1665                             r[-1][t->bx - 1].mv.mv[0],
   1666                             &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
   1667                             r[-1][t->bx - 1].ref.ref[0] - 1,
   1668                             t->frame_thread.pass != 2 ? t->tl_4x4_filter :
   1669                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
   1670                    if (res) return res;
   1671                }
   1672                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
   1673                h_off = 2;
   1674            }
   1675            if (bw4 == 1) {
   1676                const enum Filter2d left_filter_2d =
   1677                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
   1678                for (int pl = 0; pl < 2; pl++) {
   1679                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
   1680                             f->cur.stride[1], bw4, bh4, t->bx - 1,
   1681                             t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
   1682                             &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
   1683                             r[0][t->bx - 1].ref.ref[0] - 1,
   1684                             t->frame_thread.pass != 2 ? left_filter_2d :
   1685                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
   1686                    if (res) return res;
   1687                }
   1688                h_off = 2;
   1689            }
   1690            if (bh4 == ss_ver) {
   1691                const enum Filter2d top_filter_2d =
   1692                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
   1693                for (int pl = 0; pl < 2; pl++) {
   1694                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
   1695                             f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
   1696                             1 + pl, r[-1][t->bx].mv.mv[0],
   1697                             &f->refp[r[-1][t->bx].ref.ref[0] - 1],
   1698                             r[-1][t->bx].ref.ref[0] - 1,
   1699                             t->frame_thread.pass != 2 ? top_filter_2d :
   1700                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
   1701                    if (res) return res;
   1702                }
   1703                v_off = 2 * PXSTRIDE(f->cur.stride[1]);
   1704            }
   1705            for (int pl = 0; pl < 2; pl++) {
   1706                res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
   1707                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
   1708                         refp, b->ref[0], filter_2d);
   1709                if (res) return res;
   1710            }
   1711        } else {
   1712            if (imin(cbw4, cbh4) > 1 &&
   1713                ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
   1714                 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
   1715            {
   1716                for (int pl = 0; pl < 2; pl++) {
   1717                    res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
   1718                                      f->cur.stride[1], b_dim, 1 + pl, refp,
   1719                                      b->motion_mode == MM_WARP ? &t->warpmv :
   1720                                          &f->frame_hdr->gmv[b->ref[0]]);
   1721                    if (res) return res;
   1722                }
   1723            } else {
   1724                for (int pl = 0; pl < 2; pl++) {
   1725                    res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
   1726                             NULL, f->cur.stride[1],
   1727                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
   1728                             t->bx & ~ss_hor, t->by & ~ss_ver,
   1729                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
   1730                    if (res) return res;
   1731                    if (b->motion_mode == MM_OBMC) {
   1732                        res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
   1733                                   f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
   1734                        if (res) return res;
   1735                    }
   1736                }
   1737            }
   1738            if (b->interintra_type) {
   1739                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
   1740                // the wrong thing since it will select 4x16, not 4x32, as a
   1741                // transform size...
   1742                const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
   1743 
   1744                for (int pl = 0; pl < 2; pl++) {
   1745                    pixel *const tmp = bitfn(t->scratch.interintra);
   1746                    pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
   1747                    enum IntraPredMode m =
   1748                        b->interintra_mode == II_SMOOTH_PRED ?
   1749                        SMOOTH_PRED : b->interintra_mode;
   1750                    int angle = 0;
   1751                    pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
   1752                    const pixel *top_sb_edge = NULL;
   1753                    if (!(t->by & (f->sb_step - 1))) {
   1754                        top_sb_edge = f->ipred_edge[pl + 1];
   1755                        const int sby = t->by >> f->sb_shift;
   1756                        top_sb_edge += f->sb128w * 128 * (sby - 1);
   1757                    }
   1758                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
   1759                                                          (t->bx >> ss_hor) >
   1760                                                              (ts->tiling.col_start >> ss_hor),
   1761                                                          t->by >> ss_ver,
   1762                                                          (t->by >> ss_ver) >
   1763                                                              (ts->tiling.row_start >> ss_ver),
   1764                                                          ts->tiling.col_end >> ss_hor,
   1765                                                          ts->tiling.row_end >> ss_ver,
   1766                                                          0, uvdst, f->cur.stride[1],
   1767                                                          top_sb_edge, m,
   1768                                                          &angle, cbw4, cbh4, 0, tl_edge
   1769                                                          HIGHBD_CALL_SUFFIX);
   1770                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
   1771                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
   1772                                             HIGHBD_CALL_SUFFIX);
   1773                    dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
   1774                                  cbw4 * 4, cbh4 * 4, ii_mask);
   1775                }
   1776            }
   1777        }
   1778 
   1779    skip_inter_chroma_pred: {}
   1780        t->tl_4x4_filter = filter_2d;
   1781    } else {
   1782        const enum Filter2d filter_2d = b->filter2d;
   1783        // Maximum super block size is 128x128
   1784        int16_t (*tmp)[128 * 128] = t->scratch.compinter;
   1785        int jnt_weight;
   1786        uint8_t *const seg_mask = t->scratch.seg_mask;
   1787        const uint8_t *mask;
   1788 
   1789        for (int i = 0; i < 2; i++) {
   1790            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
   1791 
   1792            if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
   1793                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
   1794                                  &f->frame_hdr->gmv[b->ref[i]]);
   1795                if (res) return res;
   1796            } else {
   1797                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
   1798                         b->mv[i], refp, b->ref[i], filter_2d);
   1799                if (res) return res;
   1800            }
   1801        }
   1802        switch (b->comp_type) {
   1803        case COMP_INTER_AVG:
   1804            dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
   1805                        bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
   1806            break;
   1807        case COMP_INTER_WEIGHTED_AVG:
   1808            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
   1809            dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
   1810                          bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
   1811            break;
   1812        case COMP_INTER_SEG:
   1813            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
   1814                                           tmp[b->mask_sign], tmp[!b->mask_sign],
   1815                                           bw4 * 4, bh4 * 4, seg_mask,
   1816                                           b->mask_sign HIGHBD_CALL_SUFFIX);
   1817            mask = seg_mask;
   1818            break;
   1819        case COMP_INTER_WEDGE:
   1820            mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
   1821            dsp->mc.mask(dst, f->cur.stride[0],
   1822                         tmp[b->mask_sign], tmp[!b->mask_sign],
   1823                         bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
   1824            if (has_chroma)
   1825                mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
   1826            break;
   1827        }
   1828 
   1829        // chroma
   1830        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
   1831            for (int i = 0; i < 2; i++) {
   1832                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
   1833                if (b->inter_mode == GLOBALMV_GLOBALMV &&
   1834                    imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
   1835                {
   1836                    res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
   1837                                      b_dim, 1 + pl,
   1838                                      refp, &f->frame_hdr->gmv[b->ref[i]]);
   1839                    if (res) return res;
   1840                } else {
   1841                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
   1842                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
   1843                    if (res) return res;
   1844                }
   1845            }
   1846            pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
   1847            switch (b->comp_type) {
   1848            case COMP_INTER_AVG:
   1849                dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
   1850                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
   1851                            HIGHBD_CALL_SUFFIX);
   1852                break;
   1853            case COMP_INTER_WEIGHTED_AVG:
   1854                dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
   1855                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
   1856                              HIGHBD_CALL_SUFFIX);
   1857                break;
   1858            case COMP_INTER_WEDGE:
   1859            case COMP_INTER_SEG:
   1860                dsp->mc.mask(uvdst, f->cur.stride[1],
   1861                             tmp[b->mask_sign], tmp[!b->mask_sign],
   1862                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
   1863                             HIGHBD_CALL_SUFFIX);
   1864                break;
   1865            }
   1866        }
   1867    }
   1868 
   1869    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
   1870        hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
   1871        if (has_chroma) {
   1872            hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
   1873                     cbw4 * 4, cbh4 * 4, "u-pred");
   1874            hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
   1875                     cbw4 * 4, cbh4 * 4, "v-pred");
   1876        }
   1877    }
   1878 
   1879    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
   1880 
   1881    if (b->skip) {
   1882        // reset coef contexts
   1883        BlockContext *const a = t->a;
   1884        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
   1885        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
   1886        if (has_chroma) {
   1887            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
   1888            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
   1889            memset_cw(&a->ccoef[0][cbx4], 0x40);
   1890            memset_cw(&a->ccoef[1][cbx4], 0x40);
   1891            memset_ch(&t->l.ccoef[0][cby4], 0x40);
   1892            memset_ch(&t->l.ccoef[1][cby4], 0x40);
   1893        }
   1894        return 0;
   1895    }
   1896 
   1897    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
   1898    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
   1899    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
   1900 
   1901    for (int init_y = 0; init_y < bh4; init_y += 16) {
   1902        for (int init_x = 0; init_x < bw4; init_x += 16) {
   1903            // coefficient coding & inverse transforms
   1904            int y_off = !!init_y, y;
   1905            dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
   1906            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
   1907                 y += ytx->h, y_off++)
   1908            {
   1909                int x, x_off = !!init_x;
   1910                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
   1911                     x += ytx->w, x_off++)
   1912                {
   1913                    read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
   1914                                   x_off, y_off, &dst[x * 4]);
   1915                    t->bx += ytx->w;
   1916                }
   1917                dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
   1918                t->bx -= x;
   1919                t->by += ytx->h;
   1920            }
   1921            dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
   1922            t->by -= y;
   1923 
   1924            // chroma coefs and inverse transform
   1925            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
   1926                pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
   1927                    (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
   1928                for (y = init_y >> ss_ver, t->by += init_y;
   1929                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
   1930                {
   1931                    int x;
   1932                    for (x = init_x >> ss_hor, t->bx += init_x;
   1933                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
   1934                    {
   1935                        coef *cf;
   1936                        int eob;
   1937                        enum TxfmType txtp;
   1938                        if (t->frame_thread.pass) {
   1939                            const int p = t->frame_thread.pass & 1;
   1940                            const int cbi = *ts->frame_thread[p].cbi++;
   1941                            cf = ts->frame_thread[p].cf;
   1942                            ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
   1943                            eob  = cbi >> 5;
   1944                            txtp = cbi & 0x1f;
   1945                        } else {
   1946                            uint8_t cf_ctx;
   1947                            cf = bitfn(t->cf);
   1948                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
   1949                                                        bx4 + (x << ss_hor)];
   1950                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
   1951                                               &t->l.ccoef[pl][cby4 + y],
   1952                                               b->uvtx, bs, b, 0, 1 + pl,
   1953                                               cf, &txtp, &cf_ctx);
   1954                            if (DEBUG_BLOCK_INFO)
   1955                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
   1956                                       "txtp=%d,eob=%d]: r=%d\n",
   1957                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
   1958                            int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor);
   1959                            int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver);
   1960                            dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
   1961                            dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
   1962                        }
   1963                        if (eob >= 0) {
   1964                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1965                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
   1966                            dsp->itx.itxfm_add[b->uvtx]
   1967                                              [txtp](&uvdst[4 * x],
   1968                                                     f->cur.stride[1],
   1969                                                     cf, eob HIGHBD_CALL_SUFFIX);
   1970                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
   1971                                hex_dump(&uvdst[4 * x], f->cur.stride[1],
   1972                                         uvtx->w * 4, uvtx->h * 4, "recon");
   1973                        }
   1974                        t->bx += uvtx->w << ss_hor;
   1975                    }
   1976                    uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
   1977                    t->bx -= x << ss_hor;
   1978                    t->by += uvtx->h << ss_ver;
   1979                }
   1980                t->by -= y << ss_ver;
   1981            }
   1982        }
   1983    }
   1984    return 0;
   1985 }
   1986 
   1987 void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
   1988    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
   1989        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
   1990    {
   1991        return;
   1992    }
   1993    const int y = sby * f->sb_step * 4;
   1994    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   1995    pixel *const p[3] = {
   1996        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
   1997        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
   1998        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
   1999    };
   2000    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
   2001    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
   2002                                        f->lf.start_of_tile_row[sby]);
   2003 }
   2004 
   2005 void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
   2006    const int y = sby * f->sb_step * 4;
   2007    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2008    pixel *const p[3] = {
   2009        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
   2010        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
   2011        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
   2012    };
   2013    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
   2014    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
   2015        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
   2016    {
   2017        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
   2018    }
   2019    if (f->seq_hdr->cdef || f->lf.restore_planes) {
   2020        // Store loop filtered pixels required by CDEF / LR
   2021        bytefn(dav1d_copy_lpf)(f, p, sby);
   2022    }
   2023 }
   2024 
   2025 void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
   2026    const Dav1dFrameContext *const f = tc->f;
   2027    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
   2028    const int sbsz = f->sb_step;
   2029    const int y = sby * sbsz * 4;
   2030    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2031    pixel *const p[3] = {
   2032        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
   2033        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
   2034        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
   2035    };
   2036    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
   2037    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
   2038    const int start = sby * sbsz;
   2039    if (sby) {
   2040        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2041        pixel *p_up[3] = {
   2042            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
   2043            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
   2044            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
   2045        };
   2046        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
   2047    }
   2048    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
   2049    const int end = imin(start + n_blks, f->bh);
   2050    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
   2051 }
   2052 
   2053 void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
   2054    const int sbsz = f->sb_step;
   2055    const int y = sby * sbsz * 4;
   2056    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2057    const pixel *const p[3] = {
   2058        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
   2059        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
   2060        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
   2061    };
   2062    pixel *const sr_p[3] = {
   2063        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
   2064        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
   2065        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
   2066    };
   2067    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
   2068    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
   2069        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2070        const int h_start = 8 * !!sby >> ss_ver;
   2071        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
   2072        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
   2073        const ptrdiff_t src_stride = f->cur.stride[!!pl];
   2074        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
   2075        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
   2076        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
   2077        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
   2078        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
   2079        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
   2080 
   2081        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
   2082                          imin(img_h, h_end) + h_start, src_w,
   2083                          f->resize_step[!!pl], f->resize_start[!!pl]
   2084                          HIGHBD_CALL_SUFFIX);
   2085    }
   2086 }
   2087 
   2088 void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
   2089    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
   2090    const int y = sby * f->sb_step * 4;
   2091    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2092    pixel *const sr_p[3] = {
   2093        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
   2094        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
   2095        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
   2096    };
   2097    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
   2098 }
   2099 
   2100 void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
   2101    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
   2102    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
   2103    if (f->seq_hdr->cdef)
   2104        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
   2105    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
   2106        bytefn(dav1d_filter_sbrow_resize)(f, sby);
   2107    if (f->lf.restore_planes)
   2108        bytefn(dav1d_filter_sbrow_lr)(f, sby);
   2109 }
   2110 
   2111 void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
   2112    const Dav1dFrameContext *const f = t->f;
   2113    Dav1dTileState *const ts = t->ts;
   2114    const int sby = t->by >> f->sb_shift;
   2115    const int sby_off = f->sb128w * 128 * sby;
   2116    const int x_off = ts->tiling.col_start;
   2117 
   2118    const pixel *const y =
   2119        ((const pixel *) f->cur.data[0]) + x_off * 4 +
   2120                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
   2121    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
   2122               4 * (ts->tiling.col_end - x_off));
   2123 
   2124    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
   2125        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
   2126        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
   2127 
   2128        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
   2129            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
   2130        for (int pl = 1; pl <= 2; pl++)
   2131            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
   2132                       &((const pixel *) f->cur.data[pl])[uv_off],
   2133                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
   2134    }
   2135 }
   2136 
   2137 void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
   2138                                    const int bx4, const int by4,
   2139                                    const int bw4, const int bh4)
   2140 
   2141 {
   2142    const Dav1dFrameContext *const f = t->f;
   2143    pixel *const pal = t->frame_thread.pass ?
   2144        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
   2145                            ((t->bx >> 1) + (t->by & 1))][0] :
   2146        bytefn(t->scratch.pal)[0];
   2147    for (int x = 0; x < bw4; x++)
   2148        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
   2149    for (int y = 0; y < bh4; y++)
   2150        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
   2151 }
   2152 
   2153 void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
   2154                                     const int bx4, const int by4,
   2155                                     const int bw4, const int bh4)
   2156 
   2157 {
   2158    const Dav1dFrameContext *const f = t->f;
   2159    const pixel (*const pal)[8] = t->frame_thread.pass ?
   2160        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
   2161                            ((t->bx >> 1) + (t->by & 1))] :
   2162        bytefn(t->scratch.pal);
   2163    // see aomedia bug 2183 for why we use luma coordinates here
   2164    for (int pl = 1; pl <= 2; pl++) {
   2165        for (int x = 0; x < bw4; x++)
   2166            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
   2167        for (int y = 0; y < bh4; y++)
   2168            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
   2169    }
   2170 }
   2171 
   2172 void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
   2173                                  const int pl, const int sz_ctx,
   2174                                  const int bx4, const int by4)
   2175 {
   2176    Dav1dTileState *const ts = t->ts;
   2177    const Dav1dFrameContext *const f = t->f;
   2178    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
   2179                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
   2180    pixel cache[16], used_cache[8];
   2181    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
   2182    int n_cache = 0;
   2183    // don't reuse above palette outside SB64 boundaries
   2184    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
   2185    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
   2186    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
   2187 
   2188    // fill/sort cache
   2189    while (l_cache && a_cache) {
   2190        if (*l < *a) {
   2191            if (!n_cache || cache[n_cache - 1] != *l)
   2192                cache[n_cache++] = *l;
   2193            l++;
   2194            l_cache--;
   2195        } else {
   2196            if (*a == *l) {
   2197                l++;
   2198                l_cache--;
   2199            }
   2200            if (!n_cache || cache[n_cache - 1] != *a)
   2201                cache[n_cache++] = *a;
   2202            a++;
   2203            a_cache--;
   2204        }
   2205    }
   2206    if (l_cache) {
   2207        do {
   2208            if (!n_cache || cache[n_cache - 1] != *l)
   2209                cache[n_cache++] = *l;
   2210            l++;
   2211        } while (--l_cache > 0);
   2212    } else if (a_cache) {
   2213        do {
   2214            if (!n_cache || cache[n_cache - 1] != *a)
   2215                cache[n_cache++] = *a;
   2216            a++;
   2217        } while (--a_cache > 0);
   2218    }
   2219 
   2220    // find reused cache entries
   2221    int i = 0;
   2222    for (int n = 0; n < n_cache && i < pal_sz; n++)
   2223        if (dav1d_msac_decode_bool_equi(&ts->msac))
   2224            used_cache[i++] = cache[n];
   2225    const int n_used_cache = i;
   2226 
   2227    // parse new entries
   2228    pixel *const pal = t->frame_thread.pass ?
   2229        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
   2230                            ((t->bx >> 1) + (t->by & 1))][pl] :
   2231        bytefn(t->scratch.pal)[pl];
   2232    if (i < pal_sz) {
   2233        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
   2234        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
   2235 
   2236        if (i < pal_sz) {
   2237            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
   2238            const int max = (1 << bpc) - 1;
   2239 
   2240            do {
   2241                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
   2242                prev = pal[i++] = imin(prev + delta + !pl, max);
   2243                if (prev + !pl >= max) {
   2244                    for (; i < pal_sz; i++)
   2245                        pal[i] = max;
   2246                    break;
   2247                }
   2248                bits = imin(bits, 1 + ulog2(max - prev - !pl));
   2249            } while (i < pal_sz);
   2250        }
   2251 
   2252        // merge cache+new entries
   2253        int n = 0, m = n_used_cache;
   2254        for (i = 0; i < pal_sz; i++) {
   2255            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
   2256                pal[i] = used_cache[n++];
   2257            } else {
   2258                assert(m < pal_sz);
   2259                pal[i] = pal[m++];
   2260            }
   2261        }
   2262    } else {
   2263        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
   2264    }
   2265 
   2266    if (DEBUG_BLOCK_INFO) {
   2267        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
   2268               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
   2269        for (int n = 0; n < n_cache; n++)
   2270            printf("%c%02x", n ? ' ' : '[', cache[n]);
   2271        printf("%s, pal=", n_cache ? "]" : "[]");
   2272        for (int n = 0; n < pal_sz; n++)
   2273            printf("%c%02x", n ? ' ' : '[', pal[n]);
   2274        printf("]\n");
   2275    }
   2276 }
   2277 
   2278 void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
   2279                               const int sz_ctx, const int bx4, const int by4)
   2280 {
   2281    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
   2282 
   2283    // V pal coding
   2284    Dav1dTileState *const ts = t->ts;
   2285    const Dav1dFrameContext *const f = t->f;
   2286    pixel *const pal = t->frame_thread.pass ?
   2287        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
   2288                            ((t->bx >> 1) + (t->by & 1))][2] :
   2289        bytefn(t->scratch.pal)[2];
   2290    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
   2291    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
   2292        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
   2293        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
   2294        const int max = (1 << bpc) - 1;
   2295        for (int i = 1; i < b->pal_sz[1]; i++) {
   2296            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
   2297            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
   2298            prev = pal[i] = (prev + delta) & max;
   2299        }
   2300    } else {
   2301        for (int i = 0; i < b->pal_sz[1]; i++)
   2302            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
   2303    }
   2304    if (DEBUG_BLOCK_INFO) {
   2305        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
   2306        for (int n = 0; n < b->pal_sz[1]; n++)
   2307            printf("%c%02x", n ? ' ' : '[', pal[n]);
   2308        printf("]\n");
   2309    }
   2310 }