recon_tmpl.c (107321B)
1 /* 2 * Copyright © 2018-2021, VideoLAN and dav1d authors 3 * Copyright © 2018, Two Orioles, LLC 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"

#include <string.h>
#include <stdio.h>

#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/dump.h"
#include "common/frame.h"
#include "common/intops.h"

#include "src/cdef_apply.h"
#include "src/ctx.h"
#include "src/ipred_prepare.h"
#include "src/lf_apply.h"
#include "src/lr_apply.h"
#include "src/recon.h"
#include "src/scan.h"
#include "src/tables.h"
#include "src/wedge.h"

/* Decode an unsigned Exp-Golomb-coded value: count leading zero bits
 * (capped at 32 to bound the subsequent shift), then read that many
 * suffix bits. Returns the decoded value minus one. */
static inline unsigned read_golomb(MsacContext *const msac) {
    int len = 0;
    unsigned val = 1;

    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);

    return val - 1;
}

/* Derive the CDF context for the per-TU "all coefficients zero" (skip)
 * flag from the above (a) and left (l) coefficient-context buffers.
 * Luma blocks whose TU covers the whole block get context 0; otherwise
 * the context is looked up from merged above/left level bytes. Chroma
 * contexts are offset by 7.
 * NOTE(review): the multi-byte loads below type-pun the ctx byte arrays
 * (uint16/32/64 reads of uint8_t storage) — an intentional dav1d-wide
 * pattern; alignment of these buffers is established elsewhere. */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        /* Does the (subsampled) block consist of more than one TU? */
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
        unsigned ca, cl;

        /* 0x40 per byte is the "no coefficients" neutral value, so any
         * deviation means a neighboring TU had non-zero coefficients. */
#define MERGE_CTX(dir, type, no_val) \
        c##dir = *(const type *) dir != no_val; \
        break

        switch (t_dim->lw) {
        /* For some reason the MSVC CRT _wassert() function is not flagged as
         * __declspec(noreturn), so when using those headers the compiler will
         * expect execution to continue after an assertion has been triggered
         * and will therefore complain about the use of uninitialized variables
         * when compiled in debug mode if we put the default case at the end. */
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
        }
#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
        /* Single-TU luma block: fixed context 0. */
        return 0;
    } else {
        unsigned la, ll;

        /* OR together the neighbor level bytes covering the TU edge,
         * folding wide reads down so the low byte reflects all of them. */
#define MERGE_CTX(dir, type, tx) \
        if (tx == TX_64X64) { \
            uint64_t tmp = *(const uint64_t *) dir; \
            tmp |= *(const uint64_t *) &dir[8]; \
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
        } else \
            l##dir = *(const type *) dir; \
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
        break

        switch (t_dim->lw) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
        }
#undef MERGE_CTX

        /* bits 0-5 of each merged value hold the level magnitude. */
        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    }
}

/* Derive the DC sign context from the above/left context buffers.
 * Bits 6-7 of each neighbor byte encode the neighbor's DC sign state;
 * the per-size cases below sum those 2-bit fields across the TU edge
 * and subtract the neutral bias (one per neighbor unit, i.e. the two
 * trailing "- w - h" constants). The result s is negative when negative
 * signs dominate, positive when positive signs dominate.
 * Returns 0 (balanced), 1 (negative-dominant) or 2 (positive-dominant). */
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
                                       const uint8_t *const a,
                                       const uint8_t *const l)
{
    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
    int s;

#if ARCH_X86_64 && defined(__GNUC__)
    /* Coerce compilers into producing better code. For some reason
     * every x86-64 compiler is awful at handling 64-bit constants. */
    __asm__("" : "+r"(mask), "+r"(mul));
#endif

    switch(tx) {
    default: assert(0); /* fall-through */
    case TX_4X4: {
        int t = *(const uint8_t *) a >> 6;
        t += *(const uint8_t *) l >> 6;
        s = t - 1 - 1;
        break;
    }
    case TX_8X8: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 2;
        break;
    }
    case TX_16X16: {
        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
        t += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
        t *= (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 4;
        break;
    }
    case TX_32X32: {
        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
        t += (*(const uint64_t *) l & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 8;
        break;
    }
    case TX_64X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t += (*(const uint64_t *) &a[8] & mask) >> 6;
        t += (*(const uint64_t *) &l[0] & mask) >> 6;
        t += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 16;
        break;
    }
    case RTX_4X8: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 1 - 2;
        break;
    }
    case RTX_8X4: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint8_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 1;
        break;
    }
    case RTX_8X16: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 2 - 4;
        break;
    }
    case RTX_16X8: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 2;
        break;
    }
    case RTX_16X32: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 4 - 8;
        break;
    }
    case RTX_32X16: {
        uint64_t t = *(const uint64_t *) a & mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 4;
        break;
    }
    case RTX_32X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t += (*(const uint64_t *) &l[0] & mask) >> 6;
        t += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 16;
        break;
    }
    case RTX_64X32: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t += (*(const uint64_t *) &a[8] & mask) >> 6;
        t += (*(const uint64_t *) &l[0] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 8;
        break;
    }
    case RTX_4X16: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 1 - 4;
        break;
    }
    case RTX_16X4: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint8_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 1;
        break;
    }
    case RTX_8X32: {
        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
        t += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 2 - 8;
        break;
    }
    case RTX_32X8: {
        uint64_t t = *(const uint64_t *) a & mask;
        t += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 2;
        break;
    }
    case RTX_16X64: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t += *(const uint64_t *) &l[0] & mask;
        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 4 - 16;
        break;
    }
    case RTX_64X16: {
        uint64_t t = *(const uint64_t *) &a[0] & mask;
        t += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 16 - 4;
        break;
    }
    }

    return (s != 0) + (s > 0);
}

/* Compute the low-token (base-range) context for the coefficient at
 * (x, y) from already-decoded neighbor levels, and return via *hi_mag
 * the partial magnitude sum reused for the high-token context. */
static inline unsigned get_lo_ctx(const uint8_t *const levels,
                                  const enum TxClass tx_class,
                                  unsigned *const hi_mag,
                                  const uint8_t (*const ctx_offsets)[5],
                                  const unsigned x, const unsigned y,
                                  const ptrdiff_t stride)
{
    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
    unsigned offset;
    if (tx_class == TX_CLASS_2D) {
        mag += levels[1 * stride + 1];
        *hi_mag = mag;
        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
    } else {
        /* 1D transform classes scan neighbors along a single row. */
        mag += levels[0 * stride + 2];
        *hi_mag = mag;
        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
        offset = 26 + (y > 1 ? 10 : y * 5);
    }
    return offset + (mag > 512 ?
                     4 : (mag + 64) >> 7);
}

/* Decode all coefficients of one transform unit from the tile's MSAC
 * entropy decoder.
 *
 * a/l: above/left coefficient-context bytes for this TU.
 * tx/bs/b: transform size, block size and block mode info.
 * intra/plane: prediction type and plane index (0 = luma).
 * cf: output coefficient buffer (dequantized, sign applied).
 * txtp: in/out transform type (read for inter chroma, written otherwise).
 * res_ctx: output context byte (bits 0-5 cumulative level, 6-7 DC sign).
 *
 * Returns the end-of-block index (>= 0), or -1 if the skip flag said the
 * TU has no coefficients. The MSAC call order below mirrors the AV1
 * bitstream layout and must not be reordered. */
static int decode_coefs(Dav1dTaskContext *const t,
                        uint8_t *const a, uint8_t *const l,
                        const enum RectTxfmSize tx, const enum BlockSize bs,
                        const Av1Block *const b, const int intra,
                        const int plane, coef *cf,
                        enum TxfmType *const txtp, uint8_t *res_ctx)
{
    Dav1dTileState *const ts = t->ts;
    const int chroma = !!plane;
    const Dav1dFrameContext *const f = t->f;
    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
    const int dbg = DEBUG_BLOCK_INFO && plane && 0;

    if (dbg)
        printf("Start: r=%d\n", ts->msac.rng);

    // does this block have any non-zero coefficients
    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
    const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
                             ts->cdf.coef.skip[t_dim->ctx][sctx]);
    if (dbg)
        printf("Post-non-zero[%d][%d][%d]: r=%d\n",
               t_dim->ctx, sctx, all_skip, ts->msac.rng);
    if (all_skip) {
        *res_ctx = 0x40; // neutral context: zero level, no DC sign
        *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
        return -1;
    }

    // transform type (chroma: derived, luma: explicitly coded)
    if (lossless) {
        assert(t_dim->max == TX_4X4);
        *txtp = WHT_WHT;
    } else if (t_dim->max + intra >= TX_64X64) {
        *txtp = DCT_DCT;
    } else if (chroma) {
        // inferred from either the luma txtp (inter) or a LUT (intra)
        *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
                        get_uv_inter_txtp(t_dim, *txtp);
    } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
        // In libaom, lossless is checked by a literal qidx == 0, but not all
        // such blocks are actually lossless. The remainder gets an implicit
        // transform type (for luma)
        *txtp = DCT_DCT;
    } else {
        unsigned idx;
        if (intra) {
            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
            if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
                *txtp = dav1d_tx_types_per_set[idx + 0];
            } else {
                idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                          ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
                *txtp = dav1d_tx_types_per_set[idx + 5];
            }
            if (dbg)
                printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
        } else {
            if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
                idx = dav1d_msac_decode_bool_adapt(&ts->msac,
                          ts->cdf.m.txtp_inter3[t_dim->min]);
                *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
            } else if (t_dim->min == TX_16X16) {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter2, 11);
                *txtp = dav1d_tx_types_per_set[idx + 12];
            } else {
                idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
                          ts->cdf.m.txtp_inter1[t_dim->min], 15);
                *txtp = dav1d_tx_types_per_set[idx + 24];
            }
            if (dbg)
                printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
                       tx, t_dim->min, idx, *txtp, ts->msac.rng);
        }
    }

    // find end-of-block (eob)
    int eob;
    // dimensions clamped to 32x32: larger TUs only code a 32x32 sub-area
    const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32);
    const int tx2dszctx = slw + slh;
    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
    const int is_1d = tx_class != TX_CLASS_2D;
    switch (tx2dszctx) {
#define case_sz(sz, bin, ns, is_1d) \
    case sz: { \
        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
        eob = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
        break; \
    }
    case_sz(0,   16,  8, [is_1d]);
    case_sz(1,   32,  8, [is_1d]);
    case_sz(2,   64,  8, [is_1d]);
    case_sz(3,  128,  8, [is_1d]);
    case_sz(4,  256, 16, [is_1d]);
    case_sz(5,  512, 16,         );
    case_sz(6, 1024, 16,         );
#undef case_sz
    }
    if (dbg)
        printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
               16 << tx2dszctx, chroma, is_1d, eob, ts->msac.rng);
    if (eob > 1) {
        // refine the coarse eob bin with an extra bit plus eob_bin raw bits
        const int eob_bin = eob - 2;
        uint16_t *const eob_hi_bit_cdf =
            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
        const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
        if (dbg)
            printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
        eob = ((eob_hi_bit | 2) << eob_bin) | dav1d_msac_decode_bools(&ts->msac, eob_bin);
        if (dbg)
            printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
    }
    assert(eob >= 0);

    // base tokens
    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
    unsigned rc, dc_tok;

    if (eob) {
        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok

        /* eob */
        unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx);
        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
        int tok = eob_tok + 1;
        int level_tok = tok * 0x41;
        unsigned mag;

        /* Decode tokens from eob back to DC. During the loop, cf[] holds
         * intermediate values: bits 11+ = token, bits 0-10 = scan position
         * (rc) of the previously decoded non-zero coefficient, forming a
         * linked list that the dequant loop below follows. */
#define DECODE_COEFS_CLASS(tx_class) \
        unsigned x, y; \
        uint8_t *level; \
        if (tx_class == TX_CLASS_2D) \
            rc = scan[eob], x = rc >> shift, y = rc & mask; \
        else if (tx_class == TX_CLASS_H) \
            /* Transposing reduces the stride and padding requirements */ \
            x = eob & mask, y = eob >> shift, rc = eob; \
        else /* tx_class == TX_CLASS_V */ \
            x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
        if (dbg) \
            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
        if (eob_tok == 2) { \
            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            level_tok = tok + (3 << 6); \
            if (dbg) \
                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
                       ts->msac.rng); \
        } \
        cf[rc] = tok << 11; \
        if (tx_class == TX_CLASS_2D) \
            level = levels + rc; \
        else \
            level = levels + x * stride + y; \
        *level = (uint8_t) level_tok; \
        for (int i = eob - 1; i > 0; i--) { /* ac */ \
            unsigned rc_i; \
            if (tx_class == TX_CLASS_2D) \
                rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
            else if (tx_class == TX_CLASS_H) \
                x = i & mask, y = i >> shift, rc_i = i; \
            else /* tx_class == TX_CLASS_V */ \
                x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
            assert(x < 32 && y < 32); \
            if (tx_class == TX_CLASS_2D) \
                level = levels + rc_i; \
            else \
                level = levels + x * stride + y; \
            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
            if (tx_class == TX_CLASS_2D) \
                y |= x; \
            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
            if (dbg) \
                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                       t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
            if (tok == 3) { \
                mag &= 63; \
                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
                      (mag > 12 ? 6 : (mag + 1) >> 1); \
                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
                if (dbg) \
                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
                           imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
                           ts->msac.rng); \
                *level = (uint8_t) (tok + (3 << 6)); \
                cf[rc_i] = (tok << 11) | rc; \
                rc = rc_i; \
            } else { \
                /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
                tok *= 0x17ff41; \
                *level = (uint8_t) tok; \
                /* tok ? (tok << 11) | rc : 0 */ \
                tok = (tok >> 9) & (rc + ~0x7ffu); \
                if (tok) rc = rc_i; \
                cf[rc_i] = tok; \
            } \
        } \
        /* dc */ \
        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
        if (dbg) \
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
        if (dc_tok == 3) { \
            if (tx_class == TX_CLASS_2D) \
                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
                      levels[1 * stride + 1]; \
            mag &= 63; \
            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
            if (dbg) \
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
        } \
        break

        const uint16_t *scan;
        switch (tx_class) {
        case TX_CLASS_2D: {
            const unsigned nonsquare_tx = tx >= RTX_4X8;
            const uint8_t (*const lo_ctx_offsets)[5] =
                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
            scan = dav1d_scans[tx];
            const ptrdiff_t stride = 4 << slh;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_2D);
        }
        case TX_CLASS_H: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slh + 2, shift2 = 0;
            const unsigned mask = (4 << slh) - 1;
            memset(levels, 0, stride * ((4 << slh) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_H);
        }
        case TX_CLASS_V: {
            const uint8_t (*const lo_ctx_offsets)[5] = NULL;
            const ptrdiff_t stride = 16;
            const unsigned shift = slw + 2, shift2 = slh + 2;
            const unsigned mask = (4 << slw) - 1;
            memset(levels, 0, stride * ((4 << slw) + 2));
            DECODE_COEFS_CLASS(TX_CLASS_V);
        }
#undef DECODE_COEFS_CLASS
        default: assert(0);
        }
    } else { // dc-only
        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
        dc_tok = 1 + tok_br;
        if (dbg)
            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
                   t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
        if (tok_br == 2) {
            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
            if (dbg)
                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
        }
        rc = 0;
    }

    // residual and sign
    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
    const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
    const int dq_shift = imax(0, t_dim->ctx - 2);
    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
    unsigned cul_level, dc_sign_level;

    if (!dc_tok) {
        cul_level = 0;
        dc_sign_level = 1 << 6;
        // DC is zero but AC coefficients may remain; jump into the
        // matching AC loop below (labels are inside the if (rc) bodies).
        if (qm_tbl) goto ac_qm;
        goto ac_noqm;
    }

    const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
    uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
    const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
    if (dbg)
        printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
               chroma, dc_sign_ctx, dc_sign, ts->msac.rng);

    int dc_dq = dq_tbl[0];
    dc_sign_level = (dc_sign - 1) & (2 << 6);

    if (qm_tbl) {
        dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;

        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = (dc_dq * dc_tok) & 0xffffff;
        } else {
            dc_dq *= dc_tok;
            assert(dc_dq <= 0xffffff);
        }
        cul_level = dc_tok;
        dc_dq >>= dq_shift;
        dc_dq = umin(dc_dq, cf_max + dc_sign);
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_qm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
                int dq_sat;

                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    tok &= 0xfffff;
                    dq = (dq * tok) & 0xffffff;
                } else {
                    tok = rc_tok >> 11;
                    dq *= tok;
                    assert(dq <= 0xffffff);
                }
                cul_level += tok;
                dq >>= dq_shift;
                dq_sat = umin(dq, cf_max + sign);
                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);

                rc = rc_tok & 0x3ff;
            } while (rc);
        }
    } else {
        // non-qmatrix is the common case and allows for additional optimizations
        if (dc_tok == 15) {
            dc_tok = read_golomb(&ts->msac) + 15;
            if (dbg)
                printf("Post-dc_residual[%d->%d]: r=%d\n",
                       dc_tok - 15, dc_tok, ts->msac.rng);

            dc_tok &= 0xfffff;
            dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
            dc_dq = umin(dc_dq, cf_max + dc_sign);
        } else {
            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
            assert(dc_dq <= cf_max);
        }
        cul_level = dc_tok;
        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);

        if (rc) ac_noqm: {
            const unsigned ac_dq = dq_tbl[1];
            do {
                const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
                if (dbg)
                    printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                const unsigned rc_tok = cf[rc];
                unsigned tok;
                int dq;

                // residual
                if (rc_tok >= (15 << 11)) {
                    tok = read_golomb(&ts->msac) + 15;
                    if (dbg)
                        printf("Post-residual[%d=%d->%d]: r=%d\n",
                               rc, tok - 15, tok, ts->msac.rng);

                    // coefficient parsing, see 5.11.39
                    tok &= 0xfffff;

                    // dequant, see 7.12.3
                    dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
                    dq = umin(dq, cf_max + sign);
                } else {
                    // cannot exceed cf_max, so we can avoid the clipping
                    tok = rc_tok >> 11;
                    dq = ((ac_dq * tok) >> dq_shift);
                    assert(dq <= cf_max);
                }
                cul_level += tok;
                cf[rc] = (coef) (sign ? -dq : dq);

                rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
            } while (rc);
        }
    }

    // context
    *res_ctx = umin(cul_level, 63) | dc_sign_level;

    return eob;
}

/* Recursively walk the TU split tree of an inter block, decoding (and,
 * outside frame-thread pass 1, reconstructing) each leaf TU's luma
 * coefficients. t->bx/t->by are temporarily advanced while recursing and
 * restored before returning. dst may be NULL during coefficient-only
 * passes. */
static void read_coef_tree(Dav1dTaskContext *const t,
                           const enum BlockSize bs, const Av1Block *const b,
                           const enum RectTxfmSize ytx, const int depth,
                           const uint16_t *const tx_split,
                           const int x_off, const int y_off, pixel *dst)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const Dav1dDSPContext *const dsp = f->dsp;
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
    const int txw = t_dim->w, txh = t_dim->h;

    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
     * be split. Avoids an undefined left shift.
     */
    if (depth < 2 && tx_split[depth] &&
        tx_split[depth] & (1 << (y_off * 4 + x_off)))
    {
        /* This node is split: recurse into up to four sub-TUs, skipping
         * those that fall outside the frame. */
        const enum RectTxfmSize sub = t_dim->sub;
        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;

        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                       x_off * 2 + 0, y_off * 2 + 0, dst);
        t->bx += txsw;
        if (txw >= txh && t->bx < f->bw)
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
        t->bx -= txsw;
        t->by += txsh;
        if (txh >= txw && t->by < f->bh) {
            if (dst)
                dst += 4 * txsh * PXSTRIDE(f->cur.stride[0]);
            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                           x_off * 2 + 0, y_off * 2 + 1, dst);
            t->bx += txsw;
            if (txw >= txh && t->bx < f->bw)
                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
            t->bx -= txsw;
        }
        t->by -= txsh;
    } else {
        /* Leaf TU: decode coefficients (passes 0/1) and reconstruct
         * (passes 0/2). */
        const int bx4 = t->bx & 31, by4 = t->by & 31;
        enum TxfmType txtp;
        uint8_t cf_ctx;
        int eob;
        coef *cf;

        if (t->frame_thread.pass) {
            const int p = t->frame_thread.pass & 1;
            assert(ts->frame_thread[p].cf);
            cf = ts->frame_thread[p].cf;
            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
        } else {
            cf = bitfn(t->cf);
        }
        if (t->frame_thread.pass != 2) {
            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
            if (DEBUG_BLOCK_INFO)
                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                       ytx, txtp, eob, ts->msac.rng);
            dav1d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
            dav1d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
            /* Record the decoded luma txtp so chroma can derive its
             * transform type from it later. */
#define set_ctx(rep_macro) \
            for (int y = 0; y < txh; y++) { \
                rep_macro(txtp_map, 0, txtp); \
                txtp_map += 32; \
            }
            uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4];
            case_set_upto16(t_dim->lw);
#undef set_ctx
            if (t->frame_thread.pass == 1)
                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
        } else {
            // pass 2: replay eob/txtp recorded by pass 1 (cbi packs
            // eob in the high bits and txtp in the low 5 bits)
            const int cbi = *ts->frame_thread[0].cbi++;
            eob = cbi >> 5;
            txtp = cbi & 0x1f;
        }
        if (!(t->frame_thread.pass & 1)) {
            assert(dst);
            if (eob >= 0) {
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.stride[0], cf, eob
                                              HIGHBD_CALL_SUFFIX);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
            }
        }
    }
}

/* Frame-thread pass-1 entry point: decode all coefficient blocks (luma
 * and chroma) for one coded block into the frame-thread buffers, without
 * reconstruction, and update the above/left coefficient contexts. For
 * skipped blocks only the contexts are reset. */
void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                                    const enum BlockSize bs, const Av1Block *const b)
{
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int bx4 = t->bx & 31, by4 = t->by & 31;
    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
    /* Sub-4x4 blocks only carry chroma on the bottom/right position of a
     * subsampled pair. */
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                           (bw4 > ss_hor || t->bx & 1) &&
                           (bh4 > ss_ver || t->by & 1);

    if (b->skip) {
        /* No coefficients: reset contexts to the neutral 0x40 value. */
        BlockContext *const a = t->a;
        dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40);
        dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40);
        if (has_chroma) {
            dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)];
            dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)];
            memset_cw(&a->ccoef[0][cbx4], 0x40);
            memset_cw(&a->ccoef[1][cbx4], 0x40);
            memset_ch(&t->l.ccoef[0][cby4], 0x40);
            memset_ch(&t->l.ccoef[1][cby4], 0x40);
        }
        return;
    }

    Dav1dTileState *const ts = t->ts;
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
    assert(t->frame_thread.pass == 1);
    assert(!b->skip);
    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
    const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };

    /* Iterate in 64x64 (16 4px units) chunks so luma and chroma of the
     * same area stay adjacent in the frame-thread buffers. */
    for (int init_y = 0; init_y < h4; init_y += 16) {
        const int sub_h4 = imin(h4, 16 + init_y);
        for (int init_x = 0; init_x < w4; init_x += 16) {
            const int sub_w4 = imin(w4, init_x + 16);
            int y_off = !!init_y, y, x;
            for (y = init_y, t->by += init_y; y < sub_h4;
                 y += t_dim->h, t->by += t_dim->h, y_off++)
            {
                int x_off = !!init_x;
                for (x = init_x, t->bx += init_x; x < sub_w4;
                     x += t_dim->w, t->bx += t_dim->w, x_off++)
                {
                    if (!b->intra) {
                        read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
                                       x_off, y_off, NULL);
                    } else {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        const int eob =
                            decode_coefs(t, &t->a->lcoef[bx4 + x],
                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
                                         0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                   b->tx, txtp, eob, ts->msac.rng);
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                        dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
                        dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                    }
                }
                t->bx -= x;
            }
            t->by -= y;

            if (!has_chroma) continue;

            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
            for (int pl = 0; pl < 2; pl++) {
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
                {
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
                    {
                        uint8_t cf_ctx = 0x40;
                        enum TxfmType txtp;
                        if (!b->intra)
                            // inter chroma derives txtp from the co-located
                            // luma txtp recorded in read_coef_tree()
                            txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
                                                        bx4 + (x << ss_hor)];
                        const int eob =
                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
                                         b, b->intra, 1 + pl, ts->frame_thread[1].cf,
                                         &txtp, &cf_ctx);
                        if (DEBUG_BLOCK_INFO)
                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                   "txtp=%d,eob=%d]: r=%d\n",
                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
                        *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                        ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
                        int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                        int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
                        dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                        dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                    }
                    t->bx -= x << ss_hor;
                }
                t->by -= y << ss_ver;
            }
        }
    }
}

/* Motion compensation for one plane of one block. Exactly one of dst8
 * (pixel output) and dst16 (intermediate 16-bit output for compound
 * prediction) is non-NULL. Handles edge emulation when the reference
 * window overlaps the frame border, and (in the branch continuing below)
 * scaled references. */
static int mc(Dav1dTaskContext *const t,
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
              const int bw4, const int bh4,
              const int bx, const int by, const int pl,
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
              const enum Filter2d filter_2d)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    const int mvx = mv.x, mvy = mv.y;
    // sub-pel fraction of the mv (1/8-pel luma, 1/16-pel effective chroma)
    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    if (refp->p.p.w == f->cur.p.w && refp->p.p.h ==
f->cur.p.h) { 956 const int dx = bx * h_mul + (mvx >> (3 + ss_hor)); 957 const int dy = by * v_mul + (mvy >> (3 + ss_ver)); 958 int w, h; 959 960 if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc 961 w = (f->cur.p.w + ss_hor) >> ss_hor; 962 h = (f->cur.p.h + ss_ver) >> ss_ver; 963 } else { 964 w = f->bw * 4 >> ss_hor; 965 h = f->bh * 4 >> ss_ver; 966 } 967 if (dx < !!mx * 3 || dy < !!my * 3 || 968 dx + bw4 * h_mul + !!mx * 4 > w || 969 dy + bh4 * v_mul + !!my * 4 > h) 970 { 971 pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); 972 f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, 973 w, h, dx - !!mx * 3, dy - !!my * 3, 974 emu_edge_buf, 192 * sizeof(pixel), 975 refp->p.data[pl], ref_stride); 976 ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3]; 977 ref_stride = 192 * sizeof(pixel); 978 } else { 979 ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx; 980 } 981 982 if (dst8 != NULL) { 983 f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul, 984 bh4 * v_mul, mx << !ss_hor, my << !ss_ver 985 HIGHBD_CALL_SUFFIX); 986 } else { 987 f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul, 988 bh4 * v_mul, mx << !ss_hor, my << !ss_ver 989 HIGHBD_CALL_SUFFIX); 990 } 991 } else { 992 assert(refp != &f->sr_cur); 993 994 const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver); 995 const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor); 996 #define scale_mv(res, val, scale) do { \ 997 const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \ 998 res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \ 999 } while (0) 1000 int pos_y, pos_x; 1001 scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale); 1002 scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale); 1003 #undef scale_mv 1004 const int left = pos_x >> 10; 1005 const int top = pos_y >> 10; 1006 const int right = 1007 ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1; 1008 const int bottom = 1009 
((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1; 1010 1011 if (DEBUG_BLOCK_INFO) 1012 printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n", 1013 left, top, orig_pos_x, f->svc[refidx][0].scale, refidx, 1014 right-left, bottom-top, 1015 f->svc[refidx][0].step, f->svc[refidx][1].step); 1016 1017 const int w = (refp->p.p.w + ss_hor) >> ss_hor; 1018 const int h = (refp->p.p.h + ss_ver) >> ss_ver; 1019 if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) { 1020 pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); 1021 f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7, 1022 w, h, left - 3, top - 3, 1023 emu_edge_buf, 320 * sizeof(pixel), 1024 refp->p.data[pl], ref_stride); 1025 ref = &emu_edge_buf[320 * 3 + 3]; 1026 ref_stride = 320 * sizeof(pixel); 1027 if (DEBUG_BLOCK_INFO) printf("Emu\n"); 1028 } else { 1029 ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left; 1030 } 1031 1032 if (dst8 != NULL) { 1033 f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride, 1034 bw4 * h_mul, bh4 * v_mul, 1035 pos_x & 0x3ff, pos_y & 0x3ff, 1036 f->svc[refidx][0].step, 1037 f->svc[refidx][1].step 1038 HIGHBD_CALL_SUFFIX); 1039 } else { 1040 f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride, 1041 bw4 * h_mul, bh4 * v_mul, 1042 pos_x & 0x3ff, pos_y & 0x3ff, 1043 f->svc[refidx][0].step, 1044 f->svc[refidx][1].step 1045 HIGHBD_CALL_SUFFIX); 1046 } 1047 } 1048 1049 return 0; 1050 } 1051 1052 static int obmc(Dav1dTaskContext *const t, 1053 pixel *const dst, const ptrdiff_t dst_stride, 1054 const uint8_t *const b_dim, const int pl, 1055 const int bx4, const int by4, const int w4, const int h4) 1056 { 1057 assert(!(t->bx & 1) && !(t->by & 1)); 1058 const Dav1dFrameContext *const f = t->f; 1059 /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5]; 1060 pixel *const lap = bitfn(t->scratch.lap); 1061 const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; 1062 const int ss_hor = !!pl && f->cur.p.layout != 
        DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    int res;

    // blend with predictions from above neighbours (skipped for small
    // chroma blocks and for the top tile row)
    if (t->by > t->ts->tiling.row_start &&
        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
    {
        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
            const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
            const int step4 = iclip(a_b_dim[0], 2, 16);

            if (a_r->ref.ref[0] > 0) {
                const int ow4 = imin(step4, b_dim[0]);
                // overlap covers (roughly) the top half of the block
                const int oh4 = imin(b_dim[1], 16) >> 1;
                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
                         t->bx + x, t->by, pl, a_r->mv.mv[0],
                         &f->refp[a_r->ref.ref[0] - 1], a_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                if (res) return res;
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
                                   h_mul * ow4, v_mul * oh4);
                i++;
            }
            x += step4;
        }
    }

    // blend with predictions from left neighbours (skipped in the leftmost
    // tile column)
    if (t->bx > t->ts->tiling.col_start)
        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
            const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
            const int step4 = iclip(l_b_dim[1], 2, 16);

            if (l_r->ref.ref[0] > 0) {
                // overlap covers (roughly) the left half of the block
                const int ow4 = imin(b_dim[0], 16) >> 1;
                const int oh4 = imin(step4, b_dim[1]);
                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
                         t->bx, t->by + y, pl, l_r->mv.mv[0],
                         &f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                if (res) return res;
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                i++;
            }
            y += step4;
        }

    return 0;
}

/* Affine warp prediction: processes the block in 8x8 output tiles,
 * deriving each tile's source position from the warp matrix (wmp->matrix)
 * and invoking the warp8x8 (pixel output) or warp8x8t (16-bit intermediate
 * output) DSP kernel. dst8/dst16 are mutually exclusive (asserted);
 * block dimensions must be multiples of 8 px (asserted). Returns 0. */
static int warp_affine(Dav1dTaskContext *const t,
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
                       const uint8_t *const b_dim, const int pl,
                       const Dav1dThreadPicture *const refp,
                       const Dav1dWarpedMotionParams *const wmp)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
    const int32_t *const mat = wmp->matrix;
    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
    const int height = (refp->p.p.h + ss_ver) >> ss_ver;

    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            // split the 16.16 fixed-point position into integer top-left
            // and subpel fraction, pre-compensating for the per-row/column
            // alpha/beta/gamma/delta increments applied inside the kernel
            const int dx = (int) (mvx >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                            wmp->u.p.beta * 7) & ~0x3f;
            const int dy = (int) (mvy >> 16) - 4;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                            wmp->u.p.delta * 4) & ~0x3f;

            const pixel *ref_ptr;
            ptrdiff_t ref_stride = refp->p.stride[!!pl];

            // the 8x8 warp reads a 15x15 source footprint (3 px before,
            // 4 px after in each dimension); pad via emu_edge when it
            // crosses the reference frame boundary
            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
                f->dsp->mc.emu_edge(15, 15, width,
height, dx - 3, dy - 3, 1156 emu_edge_buf, 32 * sizeof(pixel), 1157 refp->p.data[pl], ref_stride); 1158 ref_ptr = &emu_edge_buf[32 * 3 + 3]; 1159 ref_stride = 32 * sizeof(pixel); 1160 } else { 1161 ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx; 1162 } 1163 if (dst16 != NULL) 1164 dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride, 1165 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); 1166 else 1167 dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride, 1168 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); 1169 } 1170 if (dst8) dst8 += 8 * PXSTRIDE(dstride); 1171 else dst16 += 8 * dstride; 1172 } 1173 return 0; 1174 } 1175 1176 void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs, 1177 const enum EdgeFlags intra_edge_flags, 1178 const Av1Block *const b) 1179 { 1180 Dav1dTileState *const ts = t->ts; 1181 const Dav1dFrameContext *const f = t->f; 1182 const Dav1dDSPContext *const dsp = f->dsp; 1183 const int bx4 = t->bx & 31, by4 = t->by & 31; 1184 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; 1185 const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; 1186 const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; 1187 const uint8_t *const b_dim = dav1d_block_dimensions[bs]; 1188 const int bw4 = b_dim[0], bh4 = b_dim[1]; 1189 const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); 1190 const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; 1191 const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && 1192 (bw4 > ss_hor || t->bx & 1) && 1193 (bh4 > ss_ver || t->by & 1); 1194 const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx]; 1195 const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; 1196 1197 // coefficient coding 1198 pixel *const edge = bitfn(t->scratch.edge) + 128; 1199 const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; 1200 1201 const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10; 1202 
    // process the block in 64x64-px sub-tiles (16 units of 4 px)
    for (int init_y = 0; init_y < h4; init_y += 16) {
        const int sub_h4 = imin(h4, 16 + init_y);
        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
        for (int init_x = 0; init_x < w4; init_x += 16) {
            // luma palette prediction covers the whole block at once
            if (b->pal_sz[0]) {
                pixel *dst = ((pixel *) f->cur.data[0]) +
                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
                const uint8_t *pal_idx;
                if (t->frame_thread.pass) {
                    const int p = t->frame_thread.pass & 1;
                    assert(ts->frame_thread[p].pal_idx);
                    pal_idx = ts->frame_thread[p].pal_idx;
                    ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
                } else {
                    pal_idx = t->scratch.pal_idx_y;
                }
                const pixel *const pal = t->frame_thread.pass ?
                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                        ((t->bx >> 1) + (t->by & 1))][0] :
                    bytefn(t->scratch.pal)[0];
                f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
                                       pal_idx, bw4 * 4, bh4 * 4);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                    hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
                             bw4 * 4, bh4 * 4, "y-pal-pred");
            }

            const int intra_flags = (sm_flag(t->a, bx4) |
                                     sm_flag(&t->l, by4) |
                                     intra_edge_filter_flag);
            // availability of top-right / bottom-left edges for this sub-tile
            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
                                  intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
                                  intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
            int y, x;
            const int sub_w4 = imin(w4, init_x + 16);
            // luma: per-transform-unit prediction + reconstruction
            for (y = init_y, t->by += init_y; y < sub_h4;
                 y += t_dim->h, t->by += t_dim->h)
            {
                pixel *dst = ((pixel *) f->cur.data[0]) +
                             4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
                                  t->bx + init_x);
                for (x = init_x, t->bx += init_x; x < sub_w4;
                     x += t_dim->w, t->bx += t_dim->w)
                {
                    // palette blocks were fully predicted above
                    if (b->pal_sz[0]) goto skip_y_pred;

                    int angle = b->y_angle;
                    const enum EdgeFlags edge_flags =
                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
                         0 : EDGE_I444_TOP_HAS_RIGHT) |
                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
                         0 : EDGE_I444_LEFT_HAS_BOTTOM);
                    // at superblock-row boundaries, neighbouring pixels come
                    // from the saved ipred_edge row of the superblock above
                    const pixel *top_sb_edge = NULL;
                    if (!(t->by & (f->sb_step - 1))) {
                        top_sb_edge = f->ipred_edge[0];
                        const int sby = t->by >> f->sb_shift;
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
                    }
                    const enum IntraPredMode m =
                        bytefn(dav1d_prepare_intra_edges)(t->bx,
                                                          t->bx > ts->tiling.col_start,
                                                          t->by,
                                                          t->by > ts->tiling.row_start,
                                                          ts->tiling.col_end,
                                                          ts->tiling.row_end,
                                                          edge_flags, dst,
                                                          f->cur.stride[0], top_sb_edge,
                                                          b->y_mode, &angle,
                                                          t_dim->w, t_dim->h,
                                                          f->seq_hdr->intra_edge_filter,
                                                          edge HIGHBD_CALL_SUFFIX);
                    dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
                                             t_dim->w * 4, t_dim->h * 4,
                                             angle | intra_flags,
                                             4 * f->bw - 4 * t->bx,
                                             4 * f->bh - 4 * t->by
                                             HIGHBD_CALL_SUFFIX);

                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
                                 t_dim->h * 4, 2, "l");
                        hex_dump(edge, 0, 1, 1, "tl");
                        hex_dump(edge + 1, t_dim->w * 4,
                                 t_dim->w * 4, 2, "t");
                        hex_dump(dst, f->cur.stride[0],
                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
                    }

                skip_y_pred: {}
                    // decode (single-threaded / pass 1) or fetch (pass 2)
                    // coefficients, then apply the inverse transform
                    if (!b->skip) {
                        coef *cf;
                        int eob;
                        enum TxfmType txtp;
                        if (t->frame_thread.pass) {
                            const int p = t->frame_thread.pass & 1;
                            const int cbi = *ts->frame_thread[p].cbi++;
                            cf = ts->frame_thread[p].cf;
                            ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                            // cbi packs eob (upper bits) and txtp (low 5 bits)
                            eob = cbi >> 5;
                            txtp = cbi & 0x1f;
                        } else {
                            uint8_t cf_ctx;
                            cf = bitfn(t->cf);
                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
                                               &t->l.lcoef[by4 + y], b->tx, bs,
                                               b, 1, 0, cf, &txtp, &cf_ctx);
                            if (DEBUG_BLOCK_INFO)
                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                       b->tx, txtp, eob, ts->msac.rng);
                            dav1d_memset_likely_pow2(&t->a->lcoef[bx4 + x], cf_ctx, imin(t_dim->w, f->bw - t->bx));
                            dav1d_memset_likely_pow2(&t->l.lcoef[by4 + y], cf_ctx, imin(t_dim->h, f->bh - t->by));
                        }
                        if (eob >= 0) {
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                coef_dump(cf, imin(t_dim->h, 8) * 4,
                                          imin(t_dim->w, 8) * 4, 3, "dq");
                            dsp->itx.itxfm_add[b->tx]
                                              [txtp](dst,
                                                     f->cur.stride[0],
                                                     cf, eob HIGHBD_CALL_SUFFIX);
                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                hex_dump(dst, f->cur.stride[0],
                                         t_dim->w * 4, t_dim->h * 4, "recon");
                        }
                    } else if (!t->frame_thread.pass) {
                        // skipped block: reset coefficient contexts
                        dav1d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4 + x], 0x40);
                        dav1d_memset_pow2[t_dim->lh](&t->l.lcoef[by4 + y], 0x40);
                    }
                    dst += 4 * t_dim->w;
                }
                t->bx -= x;
            }
            t->by -= y;

            if (!has_chroma) continue;

            const ptrdiff_t stride = f->cur.stride[1];

            // chroma-from-luma: derive the AC contribution from the
            // just-reconstructed luma pixels, then predict both chroma planes
            if (b->uv_mode == CFL_PRED) {
                assert(!init_x && !init_y);

                int16_t *const ac = t->scratch.ac;
                pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
                               4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
                                              (t->by >> ss_ver) * PXSTRIDE(stride));
                pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
                                           ((pixel *) f->cur.data[2]) + uv_off };

                const int furthest_r =
                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
                const int furthest_b =
                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
                dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
                                                       cbw4 - (furthest_r >> ss_hor),
                                                       cbh4 - (furthest_b >> ss_ver),
                                                       cbw4 * 4, cbh4 * 4);
                for (int pl = 0; pl < 2; pl++) {
                    if (!b->cfl_alpha[pl]) continue;
                    int angle = 0;
                    const pixel *top_sb_edge = NULL;
                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
                        top_sb_edge = f->ipred_edge[pl + 1];
                        const int sby = t->by >> f->sb_shift;
                        top_sb_edge += f->sb128w * 128 * (sby - 1);
                    }
                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
                    const int xstart = ts->tiling.col_start >> ss_hor;
                    const int ystart = ts->tiling.row_start >> ss_ver;
                    const enum IntraPredMode m =
                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
                                                          ypos, ypos > ystart,
                                                          ts->tiling.col_end >> ss_hor,
                                                          ts->tiling.row_end >> ss_ver,
                                                          0, uv_dst[pl], stride,
                                                          top_sb_edge, DC_PRED, &angle,
                                                          uv_t_dim->w, uv_t_dim->h, 0,
                                                          edge HIGHBD_CALL_SUFFIX);
                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
                                           uv_t_dim->w * 4,
                                           uv_t_dim->h * 4,
                                           ac, b->cfl_alpha[pl]
                                           HIGHBD_CALL_SUFFIX);
                }
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
                }
            } else if (b->pal_sz[1]) {
                // chroma palette prediction (shared pal_idx for both planes)
                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
                                                 (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
                const pixel (*pal)[8];
                const uint8_t *pal_idx;
                if (t->frame_thread.pass) {
                    const int p = t->frame_thread.pass & 1;
                    assert(ts->frame_thread[p].pal_idx);
                    pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                              ((t->bx >> 1) + (t->by & 1))];
                    pal_idx = ts->frame_thread[p].pal_idx;
                    ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
                } else {
                    pal = bytefn(t->scratch.pal);
                    pal_idx = t->scratch.pal_idx_uv;
                }

                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
                                       f->cur.stride[1], pal[1],
                                       pal_idx, cbw4 * 4, cbh4 * 4);
                f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
                                       f->cur.stride[1], pal[2],
                                       pal_idx, cbw4 * 4, cbh4 * 4);
                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                    hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
                             PXSTRIDE(f->cur.stride[1]),
                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
                    hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
                             PXSTRIDE(f->cur.stride[1]),
                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
                }
            }

            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
                                 sm_uv_flag(&t->l, cby4);
            const int uv_sb_has_tr =
                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
            const int uv_sb_has_bl =
                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
            // chroma: per-transform-unit prediction + reconstruction
            for (int pl = 0; pl < 2; pl++) {
                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
                {
                    pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
                                 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
                                      ((t->bx + init_x) >> ss_hor));
                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
                    {
                        // CFL/palette blocks were already predicted above
                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
                            b->pal_sz[1])
                        {
                            goto skip_uv_pred;
                        }

                        int angle = b->uv_angle;
                        // this probably looks weird because we're using
                        // luma flags in a chroma loop, but that's because
                        // prepare_intra_edges() expects luma flags as input
                        const enum EdgeFlags edge_flags =
                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
                              (x + uv_t_dim->w >= sub_cw4)) ?
                             0 : EDGE_I444_TOP_HAS_RIGHT) |
                            ((x > (init_x >> ss_hor) ||
                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
                        const pixel *top_sb_edge = NULL;
                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
                            top_sb_edge = f->ipred_edge[1 + pl];
                            const int sby = t->by >> f->sb_shift;
                            top_sb_edge += f->sb128w * 128 * (sby - 1);
                        }
                        // CFL with zero alpha degenerates to plain DC
                        const enum IntraPredMode uv_mode =
                            b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
                        const int xstart = ts->tiling.col_start >> ss_hor;
                        const int ystart = ts->tiling.row_start >> ss_ver;
                        const enum IntraPredMode m =
                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
                                                              ypos, ypos > ystart,
                                                              ts->tiling.col_end >> ss_hor,
                                                              ts->tiling.row_end >> ss_ver,
                                                              edge_flags, dst, stride,
                                                              top_sb_edge, uv_mode,
                                                              &angle, uv_t_dim->w,
                                                              uv_t_dim->h,
                                                              f->seq_hdr->intra_edge_filter,
                                                              edge HIGHBD_CALL_SUFFIX);
                        angle |= intra_edge_filter_flag;
                        dsp->ipred.intra_pred[m](dst, stride, edge,
                                                 uv_t_dim->w * 4,
                                                 uv_t_dim->h * 4,
                                                 angle | sm_uv_fl,
                                                 (4 * f->bw + ss_hor -
                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
                                                 (4 * f->bh + ss_ver -
                                                  4 * (t->by & ~ss_ver)) >> ss_ver
                                                 HIGHBD_CALL_SUFFIX);
                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
                                     uv_t_dim->h * 4, 2, "l");
                            hex_dump(edge, 0, 1, 1, "tl");
                            hex_dump(edge + 1, uv_t_dim->w * 4,
                                     uv_t_dim->w * 4, 2, "t");
                            hex_dump(dst, stride, uv_t_dim->w * 4,
                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
                        }

                    skip_uv_pred: {}
                        // decode or fetch chroma coefficients, then apply
                        // the inverse transform
                        if (!b->skip) {
                            enum TxfmType txtp;
                            int eob;
                            coef *cf;
                            if (t->frame_thread.pass) {
                                const int p = t->frame_thread.pass & 1;
                                const int cbi = *ts->frame_thread[p].cbi++;
                                cf = ts->frame_thread[p].cf;
                                ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
                                eob = cbi >> 5;
                                txtp = cbi & 0x1f;
                            } else {
                                uint8_t cf_ctx;
                                cf = bitfn(t->cf);
                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                                   &t->l.ccoef[pl][cby4 + y],
                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
                                                   &txtp, &cf_ctx);
                                if (DEBUG_BLOCK_INFO)
                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
                                int ctw = imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor);
                                int cth = imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver);
                                dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw);
                                dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth);
                            }
                            if (eob >= 0) {
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                    coef_dump(cf, uv_t_dim->h * 4,
                                              uv_t_dim->w * 4, 3, "dq");
                                dsp->itx.itxfm_add[b->uvtx]
                                                  [txtp](dst, stride,
                                                         cf, eob HIGHBD_CALL_SUFFIX);
                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                    hex_dump(dst, stride, uv_t_dim->w * 4,
                                             uv_t_dim->h * 4, "recon");
                            }
                        } else if (!t->frame_thread.pass) {
                            // skipped block: reset coefficient contexts
                            dav1d_memset_pow2[uv_t_dim->lw](&t->a->ccoef[pl][cbx4 + x], 0x40);
                            dav1d_memset_pow2[uv_t_dim->lh](&t->l.ccoef[pl][cby4 + y], 0x40);
                        }
                        dst += uv_t_dim->w * 4;
                    }
                    t->bx -= x << ss_hor;
                }
                t->by -= y << ss_ver;
            }
        }
    }
}

int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
                                const Av1Block *const b)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp
= f->dsp; 1563 const int bx4 = t->bx & 31, by4 = t->by & 31; 1564 const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; 1565 const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; 1566 const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; 1567 const uint8_t *const b_dim = dav1d_block_dimensions[bs]; 1568 const int bw4 = b_dim[0], bh4 = b_dim[1]; 1569 const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); 1570 const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 && 1571 (bw4 > ss_hor || t->bx & 1) && 1572 (bh4 > ss_ver || t->by & 1); 1573 const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 : 1574 DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout; 1575 int res; 1576 1577 // prediction 1578 const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor; 1579 pixel *dst = ((pixel *) f->cur.data[0]) + 1580 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); 1581 const ptrdiff_t uvdstoff = 1582 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); 1583 if (IS_KEY_OR_INTRA(f->frame_hdr)) { 1584 // intrabc 1585 assert(!f->frame_hdr->super_res.enabled); 1586 res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0, 1587 b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); 1588 if (res) return res; 1589 if (has_chroma) for (int pl = 1; pl < 3; pl++) { 1590 res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1], 1591 bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), 1592 t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0], 1593 &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR); 1594 if (res) return res; 1595 } 1596 } else if (b->comp_type == COMP_INTER_NONE) { 1597 const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]]; 1598 const enum Filter2d filter_2d = b->filter2d; 1599 1600 if (imin(bw4, bh4) > 1 && 1601 ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || 1602 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) 
1603 { 1604 res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp, 1605 b->motion_mode == MM_WARP ? &t->warpmv : 1606 &f->frame_hdr->gmv[b->ref[0]]); 1607 if (res) return res; 1608 } else { 1609 res = mc(t, dst, NULL, f->cur.stride[0], 1610 bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d); 1611 if (res) return res; 1612 if (b->motion_mode == MM_OBMC) { 1613 res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4); 1614 if (res) return res; 1615 } 1616 } 1617 if (b->interintra_type) { 1618 pixel *const tl_edge = bitfn(t->scratch.edge) + 32; 1619 enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ? 1620 SMOOTH_PRED : b->interintra_mode; 1621 pixel *const tmp = bitfn(t->scratch.interintra); 1622 int angle = 0; 1623 const pixel *top_sb_edge = NULL; 1624 if (!(t->by & (f->sb_step - 1))) { 1625 top_sb_edge = f->ipred_edge[0]; 1626 const int sby = t->by >> f->sb_shift; 1627 top_sb_edge += f->sb128w * 128 * (sby - 1); 1628 } 1629 m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start, 1630 t->by, t->by > ts->tiling.row_start, 1631 ts->tiling.col_end, ts->tiling.row_end, 1632 0, dst, f->cur.stride[0], top_sb_edge, 1633 m, &angle, bw4, bh4, 0, tl_edge 1634 HIGHBD_CALL_SUFFIX); 1635 dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), 1636 tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 1637 HIGHBD_CALL_SUFFIX); 1638 dsp->mc.blend(dst, f->cur.stride[0], tmp, 1639 bw4 * 4, bh4 * 4, II_MASK(0, bs, b)); 1640 } 1641 1642 if (!has_chroma) goto skip_inter_chroma_pred; 1643 1644 // sub8x8 derivation 1645 int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; 1646 refmvs_block *const *r; 1647 if (is_sub8x8) { 1648 assert(ss_hor == 1); 1649 r = &t->rt.r[(t->by & 31) + 5]; 1650 if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; 1651 if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; 1652 if (bw4 == 1 && bh4 == ss_ver) 1653 is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; 1654 } 1655 1656 // chroma prediction 1657 if 
(is_sub8x8) { 1658 assert(ss_hor == 1); 1659 ptrdiff_t h_off = 0, v_off = 0; 1660 if (bw4 == 1 && bh4 == ss_ver) { 1661 for (int pl = 0; pl < 2; pl++) { 1662 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, 1663 NULL, f->cur.stride[1], 1664 bw4, bh4, t->bx - 1, t->by - 1, 1 + pl, 1665 r[-1][t->bx - 1].mv.mv[0], 1666 &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1], 1667 r[-1][t->bx - 1].ref.ref[0] - 1, 1668 t->frame_thread.pass != 2 ? t->tl_4x4_filter : 1669 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d); 1670 if (res) return res; 1671 } 1672 v_off = 2 * PXSTRIDE(f->cur.stride[1]); 1673 h_off = 2; 1674 } 1675 if (bw4 == 1) { 1676 const enum Filter2d left_filter_2d = 1677 dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]]; 1678 for (int pl = 0; pl < 2; pl++) { 1679 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL, 1680 f->cur.stride[1], bw4, bh4, t->bx - 1, 1681 t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0], 1682 &f->refp[r[0][t->bx - 1].ref.ref[0] - 1], 1683 r[0][t->bx - 1].ref.ref[0] - 1, 1684 t->frame_thread.pass != 2 ? left_filter_2d : 1685 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d); 1686 if (res) return res; 1687 } 1688 h_off = 2; 1689 } 1690 if (bh4 == ss_ver) { 1691 const enum Filter2d top_filter_2d = 1692 dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]]; 1693 for (int pl = 0; pl < 2; pl++) { 1694 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL, 1695 f->cur.stride[1], bw4, bh4, t->bx, t->by - 1, 1696 1 + pl, r[-1][t->bx].mv.mv[0], 1697 &f->refp[r[-1][t->bx].ref.ref[0] - 1], 1698 r[-1][t->bx].ref.ref[0] - 1, 1699 t->frame_thread.pass != 2 ? 
top_filter_2d : 1700 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d); 1701 if (res) return res; 1702 } 1703 v_off = 2 * PXSTRIDE(f->cur.stride[1]); 1704 } 1705 for (int pl = 0; pl < 2; pl++) { 1706 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1], 1707 bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], 1708 refp, b->ref[0], filter_2d); 1709 if (res) return res; 1710 } 1711 } else { 1712 if (imin(cbw4, cbh4) > 1 && 1713 ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || 1714 (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) 1715 { 1716 for (int pl = 0; pl < 2; pl++) { 1717 res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL, 1718 f->cur.stride[1], b_dim, 1 + pl, refp, 1719 b->motion_mode == MM_WARP ? &t->warpmv : 1720 &f->frame_hdr->gmv[b->ref[0]]); 1721 if (res) return res; 1722 } 1723 } else { 1724 for (int pl = 0; pl < 2; pl++) { 1725 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, 1726 NULL, f->cur.stride[1], 1727 bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver), 1728 t->bx & ~ss_hor, t->by & ~ss_ver, 1729 1 + pl, b->mv[0], refp, b->ref[0], filter_2d); 1730 if (res) return res; 1731 if (b->motion_mode == MM_OBMC) { 1732 res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, 1733 f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4); 1734 if (res) return res; 1735 } 1736 } 1737 } 1738 if (b->interintra_type) { 1739 // FIXME for 8x32 with 4:2:2 subsampling, this probably does 1740 // the wrong thing since it will select 4x16, not 4x32, as a 1741 // transform size... 1742 const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b); 1743 1744 for (int pl = 0; pl < 2; pl++) { 1745 pixel *const tmp = bitfn(t->scratch.interintra); 1746 pixel *const tl_edge = bitfn(t->scratch.edge) + 32; 1747 enum IntraPredMode m = 1748 b->interintra_mode == II_SMOOTH_PRED ? 
1749 SMOOTH_PRED : b->interintra_mode; 1750 int angle = 0; 1751 pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; 1752 const pixel *top_sb_edge = NULL; 1753 if (!(t->by & (f->sb_step - 1))) { 1754 top_sb_edge = f->ipred_edge[pl + 1]; 1755 const int sby = t->by >> f->sb_shift; 1756 top_sb_edge += f->sb128w * 128 * (sby - 1); 1757 } 1758 m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor, 1759 (t->bx >> ss_hor) > 1760 (ts->tiling.col_start >> ss_hor), 1761 t->by >> ss_ver, 1762 (t->by >> ss_ver) > 1763 (ts->tiling.row_start >> ss_ver), 1764 ts->tiling.col_end >> ss_hor, 1765 ts->tiling.row_end >> ss_ver, 1766 0, uvdst, f->cur.stride[1], 1767 top_sb_edge, m, 1768 &angle, cbw4, cbh4, 0, tl_edge 1769 HIGHBD_CALL_SUFFIX); 1770 dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel), 1771 tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0 1772 HIGHBD_CALL_SUFFIX); 1773 dsp->mc.blend(uvdst, f->cur.stride[1], tmp, 1774 cbw4 * 4, cbh4 * 4, ii_mask); 1775 } 1776 } 1777 } 1778 1779 skip_inter_chroma_pred: {} 1780 t->tl_4x4_filter = filter_2d; 1781 } else { 1782 const enum Filter2d filter_2d = b->filter2d; 1783 // Maximum super block size is 128x128 1784 int16_t (*tmp)[128 * 128] = t->scratch.compinter; 1785 int jnt_weight; 1786 uint8_t *const seg_mask = t->scratch.seg_mask; 1787 const uint8_t *mask; 1788 1789 for (int i = 0; i < 2; i++) { 1790 const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; 1791 1792 if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) { 1793 res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp, 1794 &f->frame_hdr->gmv[b->ref[i]]); 1795 if (res) return res; 1796 } else { 1797 res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0, 1798 b->mv[i], refp, b->ref[i], filter_2d); 1799 if (res) return res; 1800 } 1801 } 1802 switch (b->comp_type) { 1803 case COMP_INTER_AVG: 1804 dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1], 1805 bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX); 1806 break; 1807 case COMP_INTER_WEIGHTED_AVG: 1808 
jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]]; 1809 dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1], 1810 bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX); 1811 break; 1812 case COMP_INTER_SEG: 1813 dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0], 1814 tmp[b->mask_sign], tmp[!b->mask_sign], 1815 bw4 * 4, bh4 * 4, seg_mask, 1816 b->mask_sign HIGHBD_CALL_SUFFIX); 1817 mask = seg_mask; 1818 break; 1819 case COMP_INTER_WEDGE: 1820 mask = WEDGE_MASK(0, bs, 0, b->wedge_idx); 1821 dsp->mc.mask(dst, f->cur.stride[0], 1822 tmp[b->mask_sign], tmp[!b->mask_sign], 1823 bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); 1824 if (has_chroma) 1825 mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx); 1826 break; 1827 } 1828 1829 // chroma 1830 if (has_chroma) for (int pl = 0; pl < 2; pl++) { 1831 for (int i = 0; i < 2; i++) { 1832 const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]]; 1833 if (b->inter_mode == GLOBALMV_GLOBALMV && 1834 imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) 1835 { 1836 res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, 1837 b_dim, 1 + pl, 1838 refp, &f->frame_hdr->gmv[b->ref[i]]); 1839 if (res) return res; 1840 } else { 1841 res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 1842 1 + pl, b->mv[i], refp, b->ref[i], filter_2d); 1843 if (res) return res; 1844 } 1845 } 1846 pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff; 1847 switch (b->comp_type) { 1848 case COMP_INTER_AVG: 1849 dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], 1850 bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver 1851 HIGHBD_CALL_SUFFIX); 1852 break; 1853 case COMP_INTER_WEIGHTED_AVG: 1854 dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1], 1855 bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight 1856 HIGHBD_CALL_SUFFIX); 1857 break; 1858 case COMP_INTER_WEDGE: 1859 case COMP_INTER_SEG: 1860 dsp->mc.mask(uvdst, f->cur.stride[1], 1861 tmp[b->mask_sign], tmp[!b->mask_sign], 1862 bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask 1863 HIGHBD_CALL_SUFFIX); 
1864 break; 1865 } 1866 } 1867 } 1868 1869 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) { 1870 hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred"); 1871 if (has_chroma) { 1872 hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1], 1873 cbw4 * 4, cbh4 * 4, "u-pred"); 1874 hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1], 1875 cbw4 * 4, cbh4 * 4, "v-pred"); 1876 } 1877 } 1878 1879 const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; 1880 1881 if (b->skip) { 1882 // reset coef contexts 1883 BlockContext *const a = t->a; 1884 dav1d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); 1885 dav1d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); 1886 if (has_chroma) { 1887 dav1d_memset_pow2_fn memset_cw = dav1d_memset_pow2[ulog2(cbw4)]; 1888 dav1d_memset_pow2_fn memset_ch = dav1d_memset_pow2[ulog2(cbh4)]; 1889 memset_cw(&a->ccoef[0][cbx4], 0x40); 1890 memset_cw(&a->ccoef[1][cbx4], 0x40); 1891 memset_ch(&t->l.ccoef[0][cby4], 0x40); 1892 memset_ch(&t->l.ccoef[1][cby4], 0x40); 1893 } 1894 return 0; 1895 } 1896 1897 const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx]; 1898 const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx]; 1899 const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; 1900 1901 for (int init_y = 0; init_y < bh4; init_y += 16) { 1902 for (int init_x = 0; init_x < bw4; init_x += 16) { 1903 // coefficient coding & inverse transforms 1904 int y_off = !!init_y, y; 1905 dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y; 1906 for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16); 1907 y += ytx->h, y_off++) 1908 { 1909 int x, x_off = !!init_x; 1910 for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16); 1911 x += ytx->w, x_off++) 1912 { 1913 read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split, 1914 x_off, y_off, &dst[x * 4]); 1915 t->bx += ytx->w; 1916 } 1917 dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h; 1918 t->bx -= x; 1919 t->by += ytx->h; 1920 } 1921 dst -= PXSTRIDE(f->cur.stride[0]) 
* 4 * y; 1922 t->by -= y; 1923 1924 // chroma coefs and inverse transform 1925 if (has_chroma) for (int pl = 0; pl < 2; pl++) { 1926 pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff + 1927 (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver); 1928 for (y = init_y >> ss_ver, t->by += init_y; 1929 y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h) 1930 { 1931 int x; 1932 for (x = init_x >> ss_hor, t->bx += init_x; 1933 x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w) 1934 { 1935 coef *cf; 1936 int eob; 1937 enum TxfmType txtp; 1938 if (t->frame_thread.pass) { 1939 const int p = t->frame_thread.pass & 1; 1940 const int cbi = *ts->frame_thread[p].cbi++; 1941 cf = ts->frame_thread[p].cf; 1942 ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16; 1943 eob = cbi >> 5; 1944 txtp = cbi & 0x1f; 1945 } else { 1946 uint8_t cf_ctx; 1947 cf = bitfn(t->cf); 1948 txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + 1949 bx4 + (x << ss_hor)]; 1950 eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], 1951 &t->l.ccoef[pl][cby4 + y], 1952 b->uvtx, bs, b, 0, 1 + pl, 1953 cf, &txtp, &cf_ctx); 1954 if (DEBUG_BLOCK_INFO) 1955 printf("Post-uv-cf-blk[pl=%d,tx=%d," 1956 "txtp=%d,eob=%d]: r=%d\n", 1957 pl, b->uvtx, txtp, eob, ts->msac.rng); 1958 int ctw = imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor); 1959 int cth = imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver); 1960 dav1d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx, ctw); 1961 dav1d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx, cth); 1962 } 1963 if (eob >= 0) { 1964 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) 1965 coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq"); 1966 dsp->itx.itxfm_add[b->uvtx] 1967 [txtp](&uvdst[4 * x], 1968 f->cur.stride[1], 1969 cf, eob HIGHBD_CALL_SUFFIX); 1970 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) 1971 hex_dump(&uvdst[4 * x], f->cur.stride[1], 1972 uvtx->w * 4, uvtx->h * 4, "recon"); 1973 } 1974 t->bx += uvtx->w << ss_hor; 1975 } 1976 uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * 
                        uvtx->h;
                    t->bx -= x << ss_hor;       // undo the per-x advance, back to block origin
                    t->by += uvtx->h << ss_ver;
                }
                t->by -= y << ss_ver;           // undo the per-y advance, back to block origin
            }
        }
    }
    return 0;
}

// Deblock the vertical (column) edges of one superblock row.
// Does nothing when deblocking is disabled via the inloop_filters mask or
// when both luma loop-filter levels are zero.
void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
        (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
    {
        return;
    }
    const int y = sby * f->sb_step * 4; // luma pixel row of this sb row
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    // per-plane pointers to the top-left of this superblock row
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    // one mask row per 128-pixel unit; for 64x64 superblocks two sb rows
    // share a mask row, hence the shift by !sb128
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
                                        f->lf.start_of_tile_row[sby]);
}

// Deblock the horizontal (row) edges of one superblock row, then stash the
// loop-filtered pixels that CDEF / loop restoration will need later.
void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
        (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
    {
        bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
    }
    if (f->seq_hdr->cdef || f->lf.restore_planes) {
        // Store loop filtered pixels required by CDEF / LR
        bytefn(dav1d_copy_lpf)(f, p, sby);
    }
}

// Apply CDEF to one superblock row. The bottom two block rows of each sb row
// are excluded (see n_blks below) and filtered here as part of the *next*
// row's call, via the p_up pointers 8 luma rows above.
void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
    const Dav1dFrameContext *const f = tc->f;
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF))
        return;
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
    const int start = sby * sbsz; // first 4x4 block row of this sb row
    if (sby) {
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        // finish the deferred bottom strip (2 block rows = 8 luma rows)
        // of the previous superblock row
        pixel *p_up[3] = {
            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        };
        bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
    }
    // hold back the last 2 block rows unless this is the final sb row
    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
    const int end = imin(start + n_blks, f->bh); // clamp to frame height
    bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
}

// Horizontally scale one superblock row from the coded resolution into the
// super-resolution output picture, per plane.
void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
    const int sbsz = f->sb_step;
    const int y = sby * sbsz * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const pixel *const p[3] = {
        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
    };
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
    for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
        const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        // extend 8 luma rows above this sb row, except on the first row
        const int h_start = 8 * !!sby
                            >> ss_ver;
        const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
        pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
        const ptrdiff_t src_stride = f->cur.stride[!!pl];
        const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
        const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
        const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
        const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
        const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
        // remaining picture height below this sb row, in this plane's units
        const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;

        f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
                          imin(img_h, h_end) + h_start, src_w,
                          f->resize_step[!!pl], f->resize_start[!!pl]
                          HIGHBD_CALL_SUFFIX);
    }
}

// Apply loop restoration (LR) to one superblock row of the
// (super-resolution) output picture.
void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
    if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
    const int y = sby * f->sb_step * 4;
    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    pixel *const sr_p[3] = {
        f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
        f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
        f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
    };
    bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
}

// Run the full post-filter chain for one superblock row, in bitstream order:
// deblock (cols, then rows), CDEF, super-resolution resize, loop restoration.
void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
    bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
    bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
    if (f->seq_hdr->cdef)
        bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
    // widths differing between coded and output signals super-resolution
    if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
        bytefn(dav1d_filter_sbrow_resize)(f, sby);
    if (f->lf.restore_planes)
        bytefn(dav1d_filter_sbrow_lr)(f, sby);
}

// Save the bottom pixel row of this superblock row (per plane, within the
// tile's column range) into f->ipred_edge, to serve as the top edge for
// intra prediction in the next superblock row.
void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const int sby = t->by >> f->sb_shift;
    const int sby_off = f->sb128w * 128 * sby;
    const int x_off = ts->tiling.col_start;

    // luma: last pixel row of the superblock row
    const pixel *const y =
        ((const pixel *) f->cur.data[0]) + x_off * 4 +
        ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
               4 * (ts->tiling.col_end - x_off));

    if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;

        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
        for (int pl = 1; pl <= 2; pl++)
            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
                       &((const pixel *) f->cur.data[pl])[uv_off],
                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
    }
}

// Propagate this block's luma palette into the above (al_pal[0], indexed by
// bx4) and left (al_pal[1], indexed by by4) palette context arrays that
// neighboring blocks read in dav1d_read_pal_plane().
void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
                                    const int bx4, const int by4,
                                    const int bw4, const int bh4)

{
    const Dav1dFrameContext *const f = t->f;
    // frame-threading keeps palettes in a per-frame stash keyed on block
    // position; otherwise use the task-local scratch palette
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][0] :
        bytefn(t->scratch.pal)[0];
    for (int x = 0; x < bw4; x++)
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
    for (int y = 0; y < bh4; y++)
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
}

// Same as dav1d_copy_pal_block_y(), but for both chroma planes.
void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
                                     const int bx4, const int by4,
                                     const int bw4, const int bh4)

{
    const Dav1dFrameContext *const f = t->f;
    const pixel (*const pal)[8] = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))] :
        bytefn(t->scratch.pal);
    // see aomedia bug 2183 for why we use luma coordinates here
    for (int pl = 1; pl <= 2; pl++) {
        for (int x = 0; x < bw4; x++)
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
        for (int y = 0; y < bh4; y++)
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
    }
}

// Decode the palette for one plane of block b: the palette size, which
// entries are reused from the merged above/left neighbor palette cache, and
// any newly coded entries. Writes the final sorted palette into the
// frame-thread stash or task-local scratch.
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
                                  const int pl, const int sz_ctx,
                                  const int bx4, const int by4)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    // palette size is coded as a symbol in [0,6], biased by the minimum of 2
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
    pixel cache[16], used_cache[8];
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
    int n_cache = 0;
    // don't reuse above palette outside SB64 boundaries
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
    const pixel *l = bytefn(t->al_pal)[1][by4][pl]; // left neighbor palette
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl]; // above neighbor palette

    // fill/sort cache: merge the two (sorted) neighbor palettes into one
    // ascending, duplicate-free list
    while (l_cache && a_cache) {
        if (*l < *a) {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
            l_cache--;
        } else {
            if (*a == *l) { // equal entries: consume the left one too
                l++;
                l_cache--;
            }
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
            a_cache--;
        }
    }
    // drain whichever neighbor list still has entries
    if (l_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
        } while (--l_cache > 0);
    } else if (a_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
        } while (--a_cache > 0);
    }

    // find reused cache entries: one reuse flag per cache entry
    int i = 0;
    for (int n = 0; n < n_cache && i < pal_sz; n++)
        if (dav1d_msac_decode_bool_equi(&ts->msac))
            used_cache[i++] = cache[n];
    const int n_used_cache = i;

    // parse new entries
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][pl] :
        bytefn(t->scratch.pal)[pl];
    if (i < pal_sz) {
        const int bpc = BITDEPTH == 8 ?
                        8 : f->cur.p.bpc;
        // first new entry is coded raw; the rest as ascending deltas
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);

        if (i < pal_sz) {
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
            const int max = (1 << bpc) - 1;

            do {
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
                // luma (!pl) deltas are strictly increasing (+1 bias)
                prev = pal[i++] = imin(prev + delta + !pl, max);
                if (prev + !pl >= max) {
                    // no headroom left: saturate the remaining entries
                    for (; i < pal_sz; i++)
                        pal[i] = max;
                    break;
                }
                // shrink delta width to what the remaining range can need
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
            } while (i < pal_sz);
        }

        // merge cache+new entries (both ascending) into sorted order;
        // new entries currently sit at pal[n_used_cache..pal_sz)
        int n = 0, m = n_used_cache;
        for (i = 0; i < pal_sz; i++) {
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
                pal[i] = used_cache[n++];
            } else {
                assert(m < pal_sz);
                pal[i] = pal[m++];
            }
        }
    } else {
        // every entry was reused from the cache
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
    }

    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
        for (int n = 0; n < n_cache; n++)
            printf("%c%02x", n ? ' ' : '[', cache[n]);
        printf("%s, pal=", n_cache ? "]" : "[]");
        for (int n = 0; n < pal_sz; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}

// Decode the chroma palettes: U goes through the generic plane reader
// (with cache reuse); V uses its own scheme — either signed wrapping deltas
// or raw bpc-bit values. Both chroma palettes share the U palette size.
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
                               const int sz_ctx, const int bx4, const int by4)
{
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);

    // V pal coding
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][2] :
        bytefn(t->scratch.pal)[2];
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
        // delta mode: raw first entry, then signed deltas wrapping at bpc bits
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
        const int max = (1 << bpc) - 1;
        for (int i = 1; i < b->pal_sz[1]; i++) {
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
            prev = pal[i] = (prev + delta) & max; // wraparound, not clamp
        }
    } else {
        // raw mode: each entry coded as bpc bits
        for (int i = 0; i < b->pal_sz[1]; i++)
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
    }
    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
        for (int n = 0; n < b->pal_sz[1]; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}