ipred_tmpl.c (27732B)
1 /* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Two Orioles, LLC 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 30 #include <stdlib.h> 31 #include <string.h> 32 33 #include "common/attributes.h" 34 #include "common/intops.h" 35 36 #include "src/ipred.h" 37 #include "src/tables.h" 38 39 static NOINLINE void 40 splat_dc(pixel *dst, const ptrdiff_t stride, 41 const int width, const int height, const int dc HIGHBD_DECL_SUFFIX) 42 { 43 #if BITDEPTH == 8 44 assert(dc <= 0xff); 45 if (width > 4) { 46 const uint64_t dcN = dc * 0x0101010101010101ULL; 47 for (int y = 0; y < height; y++) { 48 for (int x = 0; x < width; x += sizeof(dcN)) 49 *((uint64_t *) &dst[x]) = dcN; 50 dst += PXSTRIDE(stride); 51 } 52 } else { 53 const unsigned dcN = dc * 0x01010101U; 54 for (int y = 0; y < height; y++) { 55 for (int x = 0; x < width; x += sizeof(dcN)) 56 *((unsigned *) &dst[x]) = dcN; 57 dst += PXSTRIDE(stride); 58 } 59 } 60 #else 61 assert(dc <= bitdepth_max); 62 const uint64_t dcN = dc * 0x0001000100010001ULL; 63 for (int y = 0; y < height; y++) { 64 for (int x = 0; x < width; x += sizeof(dcN) >> 1) 65 *((uint64_t *) &dst[x]) = dcN; 66 dst += PXSTRIDE(stride); 67 } 68 #endif 69 } 70 71 static NOINLINE void 72 cfl_pred(pixel *dst, const ptrdiff_t stride, 73 const int width, const int height, const int dc, 74 const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX) 75 { 76 for (int y = 0; y < height; y++) { 77 for (int x = 0; x < width; x++) { 78 const int diff = alpha * ac[x]; 79 dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff)); 80 } 81 ac += width; 82 dst += PXSTRIDE(stride); 83 } 84 } 85 86 static unsigned dc_gen_top(const pixel *const topleft, const int width) { 87 unsigned dc = width >> 1; 88 for (int i = 0; i < width; i++) 89 dc += topleft[1 + i]; 90 return dc >> ctz(width); 91 } 92 93 static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride, 94 const pixel *const topleft, 95 const int width, const int height, const int a, 96 const int max_width, const int max_height 97 HIGHBD_DECL_SUFFIX) 98 { 99 splat_dc(dst, stride, width, height, dc_gen_top(topleft, width) 100 HIGHBD_TAIL_SUFFIX); 101 } 102 103 static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride, 104 const pixel *const topleft, 105 const int width, const int height, 106 const int16_t *ac, const int alpha 107 HIGHBD_DECL_SUFFIX) 108 { 109 cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha 110 HIGHBD_TAIL_SUFFIX); 111 } 112 113 static unsigned dc_gen_left(const pixel *const topleft, const int height) { 114 unsigned dc = height >> 1; 115 for (int i = 0; i < height; i++) 116 dc += topleft[-(1 + i)]; 117 return dc >> ctz(height); 118 } 119 120 static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride, 121 const pixel *const topleft, 122 const int width, const int height, const int a, 123 const int max_width, const int max_height 124 HIGHBD_DECL_SUFFIX) 125 { 126 splat_dc(dst, stride, width, height, dc_gen_left(topleft, height) 127 HIGHBD_TAIL_SUFFIX); 128 } 129 130 static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride, 131 const pixel *const topleft, 132 const int width, const int height, 133 const int16_t *ac, const int alpha 134 HIGHBD_DECL_SUFFIX) 135 { 136 const unsigned dc = dc_gen_left(topleft, height); 137 cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX); 138 } 139 140 #if BITDEPTH == 8 141 #define MULTIPLIER_1x2 0x5556 142 #define MULTIPLIER_1x4 0x3334 143 #define BASE_SHIFT 16 144 #else 145 #define MULTIPLIER_1x2 0xAAAB 146 #define MULTIPLIER_1x4 0x6667 147 #define BASE_SHIFT 17 148 #endif 149 150 static unsigned dc_gen(const pixel *const topleft, 151 const int width, const int height) 152 { 153 unsigned dc = (width + height) >> 1; 154 for (int i = 0; i < width; i++) 155 dc += topleft[i + 1]; 156 for (int i = 0; i < height; i++) 157 dc += topleft[-(i + 1)]; 158 dc >>= ctz(width + height); 159 160 if (width != height) { 161 dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 : 162 MULTIPLIER_1x2; 163 dc >>= BASE_SHIFT; 164 } 165 return dc; 166 } 167 168 static void ipred_dc_c(pixel *dst, const ptrdiff_t stride, 169 const pixel *const topleft, 170 const int width, const int height, const int a, 171 const int max_width, const int max_height 172 HIGHBD_DECL_SUFFIX) 173 { 174 splat_dc(dst, stride, width, height, dc_gen(topleft, width, height) 175 HIGHBD_TAIL_SUFFIX); 176 } 177 178 static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride, 179 const pixel *const topleft, 180 const int width, const int height, 181 const int16_t *ac, const int alpha 182 HIGHBD_DECL_SUFFIX) 183 { 184 unsigned dc = dc_gen(topleft, width, height); 185 cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX); 186 } 187 188 #undef MULTIPLIER_1x2 189 #undef MULTIPLIER_1x4 190 #undef BASE_SHIFT 191 192 static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride, 193 const pixel *const topleft, 194 const int width, const int height, const int a, 195 const int max_width, const int max_height 196 HIGHBD_DECL_SUFFIX) 197 { 198 #if BITDEPTH == 16 199 const int dc = (bitdepth_max + 1) >> 1; 200 #else 201 const int dc = 128; 202 #endif 203 splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX); 204 } 205 206 static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride, 207 const pixel *const topleft, 208 const int width, const int height, 209 const int16_t *ac, const int alpha 210 HIGHBD_DECL_SUFFIX) 211 { 212 #if BITDEPTH == 16 213 const int dc = (bitdepth_max + 1) >> 1; 214 #else 215 const int dc = 128; 216 #endif 217 cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX); 218 } 219 220 static void ipred_v_c(pixel *dst, const ptrdiff_t stride, 221 const pixel *const topleft, 222 const int width, const int height, const int a, 223 const int max_width, const int max_height 224 HIGHBD_DECL_SUFFIX) 225 { 226 for (int y = 0; y < height; y++) { 227 pixel_copy(dst, topleft + 1, width); 228 dst += PXSTRIDE(stride); 229 } 230 } 231 232 static void ipred_h_c(pixel *dst, const ptrdiff_t stride, 233 const pixel *const topleft, 234 const int width, const int height, const int a, 235 const int max_width, const int max_height 236 HIGHBD_DECL_SUFFIX) 237 { 238 for (int y = 0; y < height; y++) { 239 pixel_set(dst, topleft[-(1 + y)], width); 240 dst += PXSTRIDE(stride); 241 } 242 } 243 244 static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride, 245 const pixel *const tl_ptr, 246 const int width, const int height, const int a, 247 const int max_width, const int max_height 248 HIGHBD_DECL_SUFFIX) 249 { 250 const int topleft = tl_ptr[0]; 251 for (int y = 0; y < height; y++) { 252 const int left = tl_ptr[-(y + 1)]; 253 for (int x = 0; x < width; x++) { 254 const int top = tl_ptr[1 + x]; 255 const int base = left + top - topleft; 256 const int ldiff = abs(left - base); 257 const int tdiff = abs(top - base); 258 const int tldiff = abs(topleft - base); 259 260 dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left : 261 tdiff <= tldiff ? top : topleft; 262 } 263 dst += PXSTRIDE(stride); 264 } 265 } 266 267 static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride, 268 const pixel *const topleft, 269 const int width, const int height, const int a, 270 const int max_width, const int max_height 271 HIGHBD_DECL_SUFFIX) 272 { 273 const uint8_t *const weights_hor = &dav1d_sm_weights[width]; 274 const uint8_t *const weights_ver = &dav1d_sm_weights[height]; 275 const int right = topleft[width], bottom = topleft[-height]; 276 277 for (int y = 0; y < height; y++) { 278 for (int x = 0; x < width; x++) { 279 const int pred = weights_ver[y] * topleft[1 + x] + 280 (256 - weights_ver[y]) * bottom + 281 weights_hor[x] * topleft[-(1 + y)] + 282 (256 - weights_hor[x]) * right; 283 dst[x] = (pred + 256) >> 9; 284 } 285 dst += PXSTRIDE(stride); 286 } 287 } 288 289 static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride, 290 const pixel *const topleft, 291 const int width, const int height, const int a, 292 const int max_width, const int max_height 293 HIGHBD_DECL_SUFFIX) 294 { 295 const uint8_t *const weights_ver = &dav1d_sm_weights[height]; 296 const int bottom = topleft[-height]; 297 298 for (int y = 0; y < height; y++) { 299 for (int x = 0; x < width; x++) { 300 const int pred = weights_ver[y] * topleft[1 + x] + 301 (256 - weights_ver[y]) * bottom; 302 dst[x] = (pred + 128) >> 8; 303 } 304 dst += PXSTRIDE(stride); 305 } 306 } 307 308 static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride, 309 const pixel *const topleft, 310 const int width, const int height, const int a, 311 const int max_width, const int max_height 312 HIGHBD_DECL_SUFFIX) 313 { 314 const uint8_t *const weights_hor = &dav1d_sm_weights[width]; 315 const int right = topleft[width]; 316 317 for (int y = 0; y < height; y++) { 318 for (int x = 0; x < width; x++) { 319 const int pred = weights_hor[x] * topleft[-(y + 1)] + 320 (256 - weights_hor[x]) * right; 321 dst[x] = (pred + 128) >> 8; 322 } 323 dst += PXSTRIDE(stride); 324 } 325 } 326 327 static NOINLINE int get_filter_strength(const int wh, const int angle, 328 const int is_sm) 329 { 330 if (is_sm) { 331 if (wh <= 8) { 332 if (angle >= 64) return 2; 333 if (angle >= 40) return 1; 334 } else if (wh <= 16) { 335 if (angle >= 48) return 2; 336 if (angle >= 20) return 1; 337 } else if (wh <= 24) { 338 if (angle >= 4) return 3; 339 } else { 340 return 3; 341 } 342 } else { 343 if (wh <= 8) { 344 if (angle >= 56) return 1; 345 } else if (wh <= 16) { 346 if (angle >= 40) return 1; 347 } else if (wh <= 24) { 348 if (angle >= 32) return 3; 349 if (angle >= 16) return 2; 350 if (angle >= 8) return 1; 351 } else if (wh <= 32) { 352 if (angle >= 32) return 3; 353 if (angle >= 4) return 2; 354 return 1; 355 } else { 356 return 3; 357 } 358 } 359 return 0; 360 } 361 362 static NOINLINE void filter_edge(pixel *const out, const int sz, 363 const int lim_from, const int lim_to, 364 const pixel *const in, const int from, 365 const int to, const int strength) 366 { 367 static const uint8_t kernel[3][5] = { 368 { 0, 4, 8, 4, 0 }, 369 { 0, 5, 6, 5, 0 }, 370 { 2, 4, 4, 4, 2 } 371 }; 372 373 assert(strength > 0); 374 int i = 0; 375 for (; i < imin(sz, lim_from); i++) 376 out[i] = in[iclip(i, from, to - 1)]; 377 for (; i < imin(lim_to, sz); i++) { 378 int s = 0; 379 for (int j = 0; j < 5; j++) 380 s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j]; 381 out[i] = (s + 8) >> 4; 382 } 383 for (; i < sz; i++) 384 out[i] = in[iclip(i, from, to - 1)]; 385 } 386 387 static inline int get_upsample(const int wh, const int angle, const int is_sm) { 388 return angle < 40 && wh <= 16 >> is_sm; 389 } 390 391 static NOINLINE void upsample_edge(pixel *const out, const int hsz, 392 const pixel *const in, const int from, 393 const int to HIGHBD_DECL_SUFFIX) 394 { 395 static const int8_t kernel[4] = { -1, 9, 9, -1 }; 396 int i; 397 for (i = 0; i < hsz - 1; i++) { 398 out[i * 2] = in[iclip(i, from, to - 1)]; 399 400 int s = 0; 401 for (int j = 0; j < 4; j++) 402 s += in[iclip(i + j - 1, from, to - 1)] * kernel[j]; 403 out[i * 2 + 1] = iclip_pixel((s + 8) >> 4); 404 } 405 out[i * 2] = in[iclip(i, from, to - 1)]; 406 } 407 408 static void ipred_z1_c(pixel *dst, const ptrdiff_t stride, 409 const pixel *const topleft_in, 410 const int width, const int height, int angle, 411 const int max_width, const int max_height 412 HIGHBD_DECL_SUFFIX) 413 { 414 const int is_sm = (angle >> 9) & 0x1; 415 const int enable_intra_edge_filter = angle >> 10; 416 angle &= 511; 417 assert(angle < 90); 418 int dx = dav1d_dr_intra_derivative[angle >> 1]; 419 pixel top_out[64 + 64]; 420 const pixel *top; 421 int max_base_x; 422 const int upsample_above = enable_intra_edge_filter ? 423 get_upsample(width + height, 90 - angle, is_sm) : 0; 424 if (upsample_above) { 425 upsample_edge(top_out, width + height, &topleft_in[1], -1, 426 width + imin(width, height) HIGHBD_TAIL_SUFFIX); 427 top = top_out; 428 max_base_x = 2 * (width + height) - 2; 429 dx <<= 1; 430 } else { 431 const int filter_strength = enable_intra_edge_filter ? 432 get_filter_strength(width + height, 90 - angle, is_sm) : 0; 433 if (filter_strength) { 434 filter_edge(top_out, width + height, 0, width + height, 435 &topleft_in[1], -1, width + imin(width, height), 436 filter_strength); 437 top = top_out; 438 max_base_x = width + height - 1; 439 } else { 440 top = &topleft_in[1]; 441 max_base_x = width + imin(width, height) - 1; 442 } 443 } 444 const int base_inc = 1 + upsample_above; 445 for (int y = 0, xpos = dx; y < height; 446 y++, dst += PXSTRIDE(stride), xpos += dx) 447 { 448 const int frac = xpos & 0x3E; 449 450 for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) { 451 if (base < max_base_x) { 452 const int v = top[base] * (64 - frac) + top[base + 1] * frac; 453 dst[x] = (v + 32) >> 6; 454 } else { 455 pixel_set(&dst[x], top[max_base_x], width - x); 456 break; 457 } 458 } 459 } 460 } 461 462 static void ipred_z2_c(pixel *dst, const ptrdiff_t stride, 463 const pixel *const topleft_in, 464 const int width, const int height, int angle, 465 const int max_width, const int max_height 466 HIGHBD_DECL_SUFFIX) 467 { 468 const int is_sm = (angle >> 9) & 0x1; 469 const int enable_intra_edge_filter = angle >> 10; 470 angle &= 511; 471 assert(angle > 90 && angle < 180); 472 int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1]; 473 int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1]; 474 const int upsample_left = enable_intra_edge_filter ? 475 get_upsample(width + height, 180 - angle, is_sm) : 0; 476 const int upsample_above = enable_intra_edge_filter ? 477 get_upsample(width + height, angle - 90, is_sm) : 0; 478 pixel edge[64 + 64 + 1]; 479 pixel *const topleft = &edge[64]; 480 481 if (upsample_above) { 482 upsample_edge(topleft, width + 1, topleft_in, 0, width + 1 483 HIGHBD_TAIL_SUFFIX); 484 dx <<= 1; 485 } else { 486 const int filter_strength = enable_intra_edge_filter ? 487 get_filter_strength(width + height, angle - 90, is_sm) : 0; 488 489 if (filter_strength) { 490 filter_edge(&topleft[1], width, 0, max_width, 491 &topleft_in[1], -1, width, 492 filter_strength); 493 } else { 494 pixel_copy(&topleft[1], &topleft_in[1], width); 495 } 496 } 497 if (upsample_left) { 498 upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height], 499 0, height + 1 HIGHBD_TAIL_SUFFIX); 500 dy <<= 1; 501 } else { 502 const int filter_strength = enable_intra_edge_filter ? 503 get_filter_strength(width + height, 180 - angle, is_sm) : 0; 504 505 if (filter_strength) { 506 filter_edge(&topleft[-height], height, height - max_height, height, 507 &topleft_in[-height], 508 0, height + 1, filter_strength); 509 } else { 510 pixel_copy(&topleft[-height], &topleft_in[-height], height); 511 } 512 } 513 *topleft = *topleft_in; 514 515 const int base_inc_x = 1 + upsample_above; 516 const pixel *const left = &topleft[-(1 + upsample_left)]; 517 for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height; 518 y++, xpos -= dx, dst += PXSTRIDE(stride)) 519 { 520 int base_x = xpos >> 6; 521 const int frac_x = xpos & 0x3E; 522 523 for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width; 524 x++, base_x += base_inc_x, ypos -= dy) 525 { 526 int v; 527 if (base_x >= 0) { 528 v = topleft[base_x] * (64 - frac_x) + 529 topleft[base_x + 1] * frac_x; 530 } else { 531 const int base_y = ypos >> 6; 532 assert(base_y >= -(1 + upsample_left)); 533 const int frac_y = ypos & 0x3E; 534 v = left[-base_y] * (64 - frac_y) + 535 left[-(base_y + 1)] * frac_y; 536 } 537 dst[x] = (v + 32) >> 6; 538 } 539 } 540 } 541 542 static void ipred_z3_c(pixel *dst, const ptrdiff_t stride, 543 const pixel *const topleft_in, 544 const int width, const int height, int angle, 545 const int max_width, const int max_height 546 HIGHBD_DECL_SUFFIX) 547 { 548 const int is_sm = (angle >> 9) & 0x1; 549 const int enable_intra_edge_filter = angle >> 10; 550 angle &= 511; 551 assert(angle > 180); 552 int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1]; 553 pixel left_out[64 + 64]; 554 const pixel *left; 555 int max_base_y; 556 const int upsample_left = enable_intra_edge_filter ? 557 get_upsample(width + height, angle - 180, is_sm) : 0; 558 if (upsample_left) { 559 upsample_edge(left_out, width + height, 560 &topleft_in[-(width + height)], 561 imax(width - height, 0), width + height + 1 562 HIGHBD_TAIL_SUFFIX); 563 left = &left_out[2 * (width + height) - 2]; 564 max_base_y = 2 * (width + height) - 2; 565 dy <<= 1; 566 } else { 567 const int filter_strength = enable_intra_edge_filter ? 568 get_filter_strength(width + height, angle - 180, is_sm) : 0; 569 570 if (filter_strength) { 571 filter_edge(left_out, width + height, 0, width + height, 572 &topleft_in[-(width + height)], 573 imax(width - height, 0), width + height + 1, 574 filter_strength); 575 left = &left_out[width + height - 1]; 576 max_base_y = width + height - 1; 577 } else { 578 left = &topleft_in[-1]; 579 max_base_y = height + imin(width, height) - 1; 580 } 581 } 582 const int base_inc = 1 + upsample_left; 583 for (int x = 0, ypos = dy; x < width; x++, ypos += dy) { 584 const int frac = ypos & 0x3E; 585 586 for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) { 587 if (base < max_base_y) { 588 const int v = left[-base] * (64 - frac) + 589 left[-(base + 1)] * frac; 590 dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6; 591 } else { 592 do { 593 dst[y * PXSTRIDE(stride) + x] = left[-max_base_y]; 594 } while (++y < height); 595 break; 596 } 597 } 598 } 599 } 600 601 #if ARCH_X86 602 #define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \ 603 flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 + \ 604 flt_ptr[16] * p2 + flt_ptr[17] * p3 + \ 605 flt_ptr[32] * p4 + flt_ptr[33] * p5 + \ 606 flt_ptr[48] * p6 607 #define FLT_INCR 2 608 #else 609 #define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \ 610 flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 + \ 611 flt_ptr[16] * p2 + flt_ptr[24] * p3 + \ 612 flt_ptr[32] * p4 + flt_ptr[40] * p5 + \ 613 flt_ptr[48] * p6 614 #define FLT_INCR 1 615 #endif 616 617 /* Up to 32x32 only */ 618 static void ipred_filter_c(pixel *dst, const ptrdiff_t stride, 619 const pixel *const topleft_in, 620 const int width, const int height, int filt_idx, 621 const int max_width, const int max_height 622 HIGHBD_DECL_SUFFIX) 623 { 624 filt_idx &= 511; 625 assert(filt_idx < 5); 626 627 const int8_t *const filter = dav1d_filter_intra_taps[filt_idx]; 628 const pixel *top = &topleft_in[1]; 629 for (int y = 0; y < height; y += 2) { 630 const pixel *topleft = &topleft_in[-y]; 631 const pixel *left = &topleft[-1]; 632 ptrdiff_t left_stride = -1; 633 for (int x = 0; x < width; x += 4) { 634 const int p0 = *topleft; 635 const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3]; 636 const int p5 = left[0 * left_stride], p6 = left[1 * left_stride]; 637 pixel *ptr = &dst[x]; 638 const int8_t *flt_ptr = filter; 639 640 for (int yy = 0; yy < 2; yy++) { 641 for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) { 642 const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6); 643 ptr[xx] = iclip_pixel((acc + 8) >> 4); 644 } 645 ptr += PXSTRIDE(stride); 646 } 647 left = &dst[x + 4 - 1]; 648 left_stride = PXSTRIDE(stride); 649 top += 4; 650 topleft = &top[-1]; 651 } 652 top = &dst[PXSTRIDE(stride)]; 653 dst = &dst[PXSTRIDE(stride) * 2]; 654 } 655 } 656 657 static NOINLINE void 658 cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride, 659 const int w_pad, const int h_pad, const int width, const int height, 660 const int ss_hor, const int ss_ver) 661 { 662 int y, x; 663 int16_t *const ac_orig = ac; 664 665 assert(w_pad >= 0 && w_pad * 4 < width); 666 assert(h_pad >= 0 && h_pad * 4 < height); 667 668 for (y = 0; y < height - 4 * h_pad; y++) { 669 for (x = 0; x < width - 4 * w_pad; x++) { 670 int ac_sum = ypx[x << ss_hor]; 671 if (ss_hor) ac_sum += ypx[x * 2 + 1]; 672 if (ss_ver) { 673 ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)]; 674 if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)]; 675 } 676 ac[x] = ac_sum << (1 + !ss_ver + !ss_hor); 677 } 678 for (; x < width; x++) 679 ac[x] = ac[x - 1]; 680 ac += width; 681 ypx += PXSTRIDE(stride) << ss_ver; 682 } 683 for (; y < height; y++) { 684 memcpy(ac, &ac[-width], width * sizeof(*ac)); 685 ac += width; 686 } 687 688 const int log2sz = ctz(width) + ctz(height); 689 int sum = (1 << log2sz) >> 1; 690 for (ac = ac_orig, y = 0; y < height; y++) { 691 for (x = 0; x < width; x++) 692 sum += ac[x]; 693 ac += width; 694 } 695 sum >>= log2sz; 696 697 // subtract DC 698 for (ac = ac_orig, y = 0; y < height; y++) { 699 for (x = 0; x < width; x++) 700 ac[x] -= sum; 701 ac += width; 702 } 703 } 704 705 #define cfl_ac_fn(fmt, ss_hor, ss_ver) \ 706 static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \ 707 const ptrdiff_t stride, const int w_pad, \ 708 const int h_pad, const int cw, const int ch) \ 709 { \ 710 cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \ 711 } 712 713 cfl_ac_fn(420, 1, 1) 714 cfl_ac_fn(422, 1, 0) 715 cfl_ac_fn(444, 0, 0) 716 717 static void pal_pred_c(pixel *dst, const ptrdiff_t stride, 718 const pixel *const pal, const uint8_t *idx, 719 const int w, const int h) 720 { 721 for (int y = 0; y < h; y++) { 722 for (int x = 0; x < w; x += 2) { 723 const int i = *idx++; 724 assert(!(i & 0x88)); 725 dst[x + 0] = pal[i & 7]; 726 dst[x + 1] = pal[i >> 4]; 727 } 728 dst += PXSTRIDE(stride); 729 } 730 } 731 732 #if HAVE_ASM 733 #if ARCH_AARCH64 || ARCH_ARM 734 #include "src/arm/ipred.h" 735 #elif ARCH_RISCV 736 #include "src/riscv/ipred.h" 737 #elif ARCH_X86 738 #include "src/x86/ipred.h" 739 #elif ARCH_LOONGARCH64 740 #include "src/loongarch/ipred.h" 741 #endif 742 #endif 743 744 COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { 745 c->intra_pred[DC_PRED ] = ipred_dc_c; 746 c->intra_pred[DC_128_PRED ] = ipred_dc_128_c; 747 c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c; 748 c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c; 749 c->intra_pred[HOR_PRED ] = ipred_h_c; 750 c->intra_pred[VERT_PRED ] = ipred_v_c; 751 c->intra_pred[PAETH_PRED ] = ipred_paeth_c; 752 c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c; 753 c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c; 754 c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c; 755 c->intra_pred[Z1_PRED ] = ipred_z1_c; 756 c->intra_pred[Z2_PRED ] = ipred_z2_c; 757 c->intra_pred[Z3_PRED ] = ipred_z3_c; 758 c->intra_pred[FILTER_PRED ] = ipred_filter_c; 759 760 c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c; 761 c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c; 762 c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c; 763 764 c->cfl_pred[DC_PRED ] = ipred_cfl_c; 765 c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c; 766 c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c; 767 c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c; 768 769 c->pal_pred = pal_pred_c; 770 771 #if HAVE_ASM 772 #if ARCH_AARCH64 || ARCH_ARM 773 intra_pred_dsp_init_arm(c); 774 #elif ARCH_RISCV 775 intra_pred_dsp_init_riscv(c); 776 #elif ARCH_X86 777 intra_pred_dsp_init_x86(c); 778 #elif ARCH_LOONGARCH64 779 intra_pred_dsp_init_loongarch(c); 780 #endif 781 #endif 782 }