/* mc_tmpl.c (35339B) */
1 /* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Two Orioles, LLC 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 28 #include "config.h" 29 30 #include <stdlib.h> 31 #include <string.h> 32 33 #include "common/attributes.h" 34 #include "common/intops.h" 35 36 #include "src/mc.h" 37 #include "src/tables.h" 38 39 #if BITDEPTH == 8 40 #define get_intermediate_bits(bitdepth_max) 4 41 // Output in interval [-5132, 9212], fits in int16_t as is 42 #define PREP_BIAS 0 43 #else 44 // 4 for 10 bits/component, 2 for 12 bits/component 45 #define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max)) 46 // Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit) 47 // Subtract a bias to ensure the output fits in int16_t 48 #define PREP_BIAS 8192 49 #endif 50 51 static NOINLINE void 52 put_c(pixel *dst, const ptrdiff_t dst_stride, 53 const pixel *src, const ptrdiff_t src_stride, const int w, int h) 54 { 55 do { 56 pixel_copy(dst, src, w); 57 58 dst += dst_stride; 59 src += src_stride; 60 } while (--h); 61 } 62 63 static NOINLINE void 64 prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride, 65 const int w, int h HIGHBD_DECL_SUFFIX) 66 { 67 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 68 do { 69 for (int x = 0; x < w; x++) 70 tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS; 71 72 tmp += w; 73 src += src_stride; 74 } while (--h); 75 } 76 77 #define FILTER_8TAP(src, x, F, stride) \ 78 (F[0] * src[x + -3 * stride] + \ 79 F[1] * src[x + -2 * stride] + \ 80 F[2] * src[x + -1 * stride] + \ 81 F[3] * src[x + +0 * stride] + \ 82 F[4] * src[x + +1 * stride] + \ 83 F[5] * src[x + +2 * stride] + \ 84 F[6] * src[x + +3 * stride] + \ 85 F[7] * src[x + +4 * stride]) 86 87 #define FILTER_8TAP2(src, x, F) \ 88 (F[0] * src[0][x] + \ 89 F[1] * src[1][x] + \ 90 F[2] * src[2][x] + \ 91 F[3] * src[3][x] + \ 92 F[4] * src[4][x] + \ 93 F[5] * src[5][x] + \ 94 F[6] * src[6][x] + \ 95 F[7] * src[7][x]) 96 97 #define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \ 98 ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) 99 100 
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \ 101 ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh)) 102 103 #define DAV1D_FILTER_8TAP_RND3(src, x, F, sh) \ 104 ((FILTER_8TAP2(src, x, F) + ((1 << (sh)) >> 1)) >> (sh)) 105 106 #define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \ 107 iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh)) 108 109 #define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \ 110 iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh)) 111 112 #define DAV1D_FILTER_8TAP_CLIP3(src, x, F, sh) \ 113 iclip_pixel(DAV1D_FILTER_8TAP_RND3(src, x, F, sh)) 114 115 #define GET_H_FILTER(mx) \ 116 const int8_t *const fh = !(mx) ? NULL : w > 4 ? \ 117 dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \ 118 dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1] 119 120 #define GET_V_FILTER(my) \ 121 const int8_t *const fv = !(my) ? NULL : h > 4 ? \ 122 dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \ 123 dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1] 124 125 #define GET_FILTERS() \ 126 GET_H_FILTER(mx); \ 127 GET_V_FILTER(my) 128 129 static NOINLINE void 130 put_8tap_c(pixel *dst, ptrdiff_t dst_stride, 131 const pixel *src, ptrdiff_t src_stride, 132 const int w, int h, const int mx, const int my, 133 const int filter_type HIGHBD_DECL_SUFFIX) 134 { 135 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 136 const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1); 137 138 GET_FILTERS(); 139 dst_stride = PXSTRIDE(dst_stride); 140 src_stride = PXSTRIDE(src_stride); 141 142 if (fh) { 143 if (fv) { 144 int tmp_h = h + 7; 145 int16_t mid[128 * 135], *mid_ptr = mid; 146 147 src -= src_stride * 3; 148 do { 149 for (int x = 0; x < w; x++) 150 mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 151 6 - intermediate_bits); 152 153 mid_ptr += 128; 154 src += src_stride; 155 } while (--tmp_h); 156 157 mid_ptr = mid + 128 * 3; 158 do { 159 for (int x = 0; x < w; x++) 160 dst[x] = 
DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 161 6 + intermediate_bits); 162 163 mid_ptr += 128; 164 dst += dst_stride; 165 } while (--h); 166 } else { 167 do { 168 for (int x = 0; x < w; x++) { 169 dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1, 170 intermediate_rnd, 6); 171 } 172 173 dst += dst_stride; 174 src += src_stride; 175 } while (--h); 176 } 177 } else if (fv) { 178 do { 179 for (int x = 0; x < w; x++) 180 dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6); 181 182 dst += dst_stride; 183 src += src_stride; 184 } while (--h); 185 } else 186 put_c(dst, dst_stride, src, src_stride, w, h); 187 } 188 189 static NOINLINE void 190 put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride, 191 const pixel *src, ptrdiff_t src_stride, 192 const int w, int h, const int mx, int my, 193 const int dx, const int dy, const int filter_type 194 HIGHBD_DECL_SUFFIX) 195 { 196 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 197 const int intermediate_rnd = (1 << intermediate_bits) >> 1; 198 int16_t mid[128 * 8]; 199 int16_t *mid_ptrs[8]; 200 int in_y = -8; 201 src_stride = PXSTRIDE(src_stride); 202 203 for (int i = 0; i < 8; i++) 204 mid_ptrs[i] = &mid[128 * i]; 205 206 src -= src_stride * 3; 207 208 for (int y = 0; y < h; y++) { 209 int x; 210 int src_y = my >> 10; 211 GET_V_FILTER((my & 0x3ff) >> 6); 212 213 while (in_y < src_y) { 214 int imx = mx, ioff = 0; 215 int16_t *mid_ptr = mid_ptrs[0]; 216 217 for (int i = 0; i < 7; i++) 218 mid_ptrs[i] = mid_ptrs[i + 1]; 219 mid_ptrs[7] = mid_ptr; 220 221 for (x = 0; x < w; x++) { 222 GET_H_FILTER(imx >> 6); 223 mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1, 224 6 - intermediate_bits) : 225 src[ioff] << intermediate_bits; 226 imx += dx; 227 ioff += imx >> 10; 228 imx &= 0x3ff; 229 } 230 231 src += src_stride; 232 in_y++; 233 } 234 235 for (x = 0; x < w; x++) 236 dst[x] = fv ? 
DAV1D_FILTER_8TAP_CLIP3(mid_ptrs, x, fv, 237 6 + intermediate_bits) : 238 iclip_pixel((mid_ptrs[3][x] + intermediate_rnd) >> 239 intermediate_bits); 240 241 my += dy; 242 dst += PXSTRIDE(dst_stride); 243 } 244 } 245 246 static NOINLINE void 247 prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, 248 const int w, int h, const int mx, const int my, 249 const int filter_type HIGHBD_DECL_SUFFIX) 250 { 251 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 252 GET_FILTERS(); 253 src_stride = PXSTRIDE(src_stride); 254 255 if (fh) { 256 if (fv) { 257 int tmp_h = h + 7; 258 int16_t mid[128 * 135], *mid_ptr = mid; 259 260 src -= src_stride * 3; 261 do { 262 for (int x = 0; x < w; x++) 263 mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 264 6 - intermediate_bits); 265 266 mid_ptr += 128; 267 src += src_stride; 268 } while (--tmp_h); 269 270 mid_ptr = mid + 128 * 3; 271 do { 272 for (int x = 0; x < w; x++) { 273 int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) - 274 PREP_BIAS; 275 assert(t >= INT16_MIN && t <= INT16_MAX); 276 tmp[x] = t; 277 } 278 279 mid_ptr += 128; 280 tmp += w; 281 } while (--h); 282 } else { 283 do { 284 for (int x = 0; x < w; x++) 285 tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, 286 6 - intermediate_bits) - 287 PREP_BIAS; 288 289 tmp += w; 290 src += src_stride; 291 } while (--h); 292 } 293 } else if (fv) { 294 do { 295 for (int x = 0; x < w; x++) 296 tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride, 297 6 - intermediate_bits) - 298 PREP_BIAS; 299 300 tmp += w; 301 src += src_stride; 302 } while (--h); 303 } else 304 prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX); 305 } 306 307 static NOINLINE void 308 prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, 309 const int w, int h, const int mx, int my, 310 const int dx, const int dy, const int filter_type 311 HIGHBD_DECL_SUFFIX) 312 { 313 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 314 int16_t mid[128 * 8]; 315 
int16_t *mid_ptrs[8]; 316 int in_y = -8; 317 src_stride = PXSTRIDE(src_stride); 318 319 for (int i = 0; i < 8; i++) 320 mid_ptrs[i] = &mid[128 * i]; 321 322 src -= src_stride * 3; 323 324 for (int y = 0; y < h; y++) { 325 int x; 326 int src_y = my >> 10; 327 GET_V_FILTER((my & 0x3ff) >> 6); 328 329 while (in_y < src_y) { 330 int imx = mx, ioff = 0; 331 int16_t *mid_ptr = mid_ptrs[0]; 332 333 for (int i = 0; i < 7; i++) 334 mid_ptrs[i] = mid_ptrs[i + 1]; 335 mid_ptrs[7] = mid_ptr; 336 337 for (x = 0; x < w; x++) { 338 GET_H_FILTER(imx >> 6); 339 mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1, 340 6 - intermediate_bits) : 341 src[ioff] << intermediate_bits; 342 imx += dx; 343 ioff += imx >> 10; 344 imx &= 0x3ff; 345 } 346 347 src += src_stride; 348 in_y++; 349 } 350 351 for (x = 0; x < w; x++) 352 tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND3(mid_ptrs, x, fv, 6) 353 : mid_ptrs[3][x]) - PREP_BIAS; 354 355 my += dy; 356 tmp += w; 357 } 358 } 359 360 #define filter_fns(type, type_h, type_v) \ 361 static void put_8tap_##type##_c(pixel *const dst, \ 362 const ptrdiff_t dst_stride, \ 363 const pixel *const src, \ 364 const ptrdiff_t src_stride, \ 365 const int w, const int h, \ 366 const int mx, const int my \ 367 HIGHBD_DECL_SUFFIX) \ 368 { \ 369 put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \ 370 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ 371 } \ 372 static void put_8tap_##type##_scaled_c(pixel *const dst, \ 373 const ptrdiff_t dst_stride, \ 374 const pixel *const src, \ 375 const ptrdiff_t src_stride, \ 376 const int w, const int h, \ 377 const int mx, const int my, \ 378 const int dx, const int dy \ 379 HIGHBD_DECL_SUFFIX) \ 380 { \ 381 put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \ 382 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ 383 } \ 384 static void prep_8tap_##type##_c(int16_t *const tmp, \ 385 const pixel *const src, \ 386 const ptrdiff_t src_stride, \ 387 const int w, const int h, \ 388 const int mx, const int my 
\ 389 HIGHBD_DECL_SUFFIX) \ 390 { \ 391 prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \ 392 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ 393 } \ 394 static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \ 395 const pixel *const src, \ 396 const ptrdiff_t src_stride, \ 397 const int w, const int h, \ 398 const int mx, const int my, \ 399 const int dx, const int dy \ 400 HIGHBD_DECL_SUFFIX) \ 401 { \ 402 prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \ 403 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \ 404 } 405 406 filter_fns(regular, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR) 407 filter_fns(regular_sharp, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP) 408 filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH) 409 filter_fns(smooth, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH) 410 filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR) 411 filter_fns(smooth_sharp, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP) 412 filter_fns(sharp, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP) 413 filter_fns(sharp_regular, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR) 414 filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH) 415 416 #define FILTER_BILIN(src, x, mxy, stride) \ 417 (16 * src[x] + ((mxy) * (src[x + stride] - src[x]))) 418 419 #define FILTER_BILIN_RND(src, x, mxy, stride, sh) \ 420 ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh)) 421 422 #define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \ 423 iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh)) 424 425 #define FILTER_BILIN2(src1, src2, x, mxy) \ 426 (16 * src1[x] + ((mxy) * (src2[x] - src1[x]))) 427 428 #define FILTER_BILIN_RND2(src1, src2, x, mxy, sh) \ 429 ((FILTER_BILIN2(src1, src2, x, mxy) + ((1 << (sh)) >> 1)) >> (sh)) 430 431 #define FILTER_BILIN_CLIP2(src1, src2, x, mxy, sh) \ 432 iclip_pixel(FILTER_BILIN_RND2(src1, src2, x, mxy, sh)) 433 434 static void 
put_bilin_c(pixel *dst, ptrdiff_t dst_stride, 435 const pixel *src, ptrdiff_t src_stride, 436 const int w, int h, const int mx, const int my 437 HIGHBD_DECL_SUFFIX) 438 { 439 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 440 const int intermediate_rnd = (1 << intermediate_bits) >> 1; 441 dst_stride = PXSTRIDE(dst_stride); 442 src_stride = PXSTRIDE(src_stride); 443 444 if (mx) { 445 if (my) { 446 int16_t mid[128 * 129], *mid_ptr = mid; 447 int tmp_h = h + 1; 448 449 do { 450 for (int x = 0; x < w; x++) 451 mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1, 452 4 - intermediate_bits); 453 454 mid_ptr += 128; 455 src += src_stride; 456 } while (--tmp_h); 457 458 mid_ptr = mid; 459 do { 460 for (int x = 0; x < w; x++) 461 dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, 462 4 + intermediate_bits); 463 464 mid_ptr += 128; 465 dst += dst_stride; 466 } while (--h); 467 } else { 468 do { 469 for (int x = 0; x < w; x++) { 470 const int px = FILTER_BILIN_RND(src, x, mx, 1, 471 4 - intermediate_bits); 472 dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits); 473 } 474 475 dst += dst_stride; 476 src += src_stride; 477 } while (--h); 478 } 479 } else if (my) { 480 do { 481 for (int x = 0; x < w; x++) 482 dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4); 483 484 dst += dst_stride; 485 src += src_stride; 486 } while (--h); 487 } else 488 put_c(dst, dst_stride, src, src_stride, w, h); 489 } 490 491 static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride, 492 const pixel *src, ptrdiff_t src_stride, 493 const int w, int h, const int mx, int my, 494 const int dx, const int dy 495 HIGHBD_DECL_SUFFIX) 496 { 497 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 498 int16_t mid[128 * 2]; 499 int in_y = -2; 500 501 do { 502 int x; 503 int y = my >> 10; 504 int16_t *mid1 = &mid[(y & 1) * 128]; 505 int16_t *mid2 = &mid[((y + 1) & 1) * 128]; 506 int dmy = my & 0x3ff; 507 508 while (in_y < y) { 509 int imx = mx, ioff = 0; 510 int16_t 
*mid_ptr = &mid[(in_y & 1) * 128]; 511 512 for (x = 0; x < w; x++) { 513 mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1, 514 4 - intermediate_bits); 515 imx += dx; 516 ioff += imx >> 10; 517 imx &= 0x3ff; 518 } 519 520 src += PXSTRIDE(src_stride); 521 in_y++; 522 } 523 524 for (x = 0; x < w; x++) 525 dst[x] = FILTER_BILIN_CLIP2(mid1, mid2, x, dmy >> 6, 526 4 + intermediate_bits); 527 528 my += dy; 529 dst += PXSTRIDE(dst_stride); 530 } while (--h); 531 } 532 533 static void prep_bilin_c(int16_t *tmp, 534 const pixel *src, ptrdiff_t src_stride, 535 const int w, int h, const int mx, const int my 536 HIGHBD_DECL_SUFFIX) 537 { 538 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 539 src_stride = PXSTRIDE(src_stride); 540 541 if (mx) { 542 if (my) { 543 int16_t mid[128 * 129], *mid_ptr = mid; 544 int tmp_h = h + 1; 545 546 do { 547 for (int x = 0; x < w; x++) 548 mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1, 549 4 - intermediate_bits); 550 551 mid_ptr += 128; 552 src += src_stride; 553 } while (--tmp_h); 554 555 mid_ptr = mid; 556 do { 557 for (int x = 0; x < w; x++) 558 tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) - 559 PREP_BIAS; 560 561 mid_ptr += 128; 562 tmp += w; 563 } while (--h); 564 } else { 565 do { 566 for (int x = 0; x < w; x++) 567 tmp[x] = FILTER_BILIN_RND(src, x, mx, 1, 568 4 - intermediate_bits) - 569 PREP_BIAS; 570 571 tmp += w; 572 src += src_stride; 573 } while (--h); 574 } 575 } else if (my) { 576 do { 577 for (int x = 0; x < w; x++) 578 tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride, 579 4 - intermediate_bits) - PREP_BIAS; 580 581 tmp += w; 582 src += src_stride; 583 } while (--h); 584 } else 585 prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX); 586 } 587 588 static void prep_bilin_scaled_c(int16_t *tmp, 589 const pixel *src, ptrdiff_t src_stride, 590 const int w, int h, const int mx, int my, 591 const int dx, const int dy HIGHBD_DECL_SUFFIX) 592 { 593 const int intermediate_bits = 
get_intermediate_bits(bitdepth_max); 594 int16_t mid[128 * 2]; 595 int in_y = -2; 596 597 do { 598 int x; 599 int y = my >> 10; 600 int16_t *mid1 = &mid[(y & 1) * 128]; 601 int16_t *mid2 = &mid[((y + 1) & 1) * 128]; 602 int dmy = my & 0x3ff; 603 604 while (in_y < y) { 605 int imx = mx, ioff = 0; 606 int16_t *mid_ptr = &mid[(in_y & 1) * 128]; 607 608 for (x = 0; x < w; x++) { 609 mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1, 610 4 - intermediate_bits); 611 imx += dx; 612 ioff += imx >> 10; 613 imx &= 0x3ff; 614 } 615 616 src += PXSTRIDE(src_stride); 617 in_y++; 618 } 619 620 for (x = 0; x < w; x++) 621 tmp[x] = FILTER_BILIN_RND2(mid1, mid2, x, dmy >> 6, 4) - PREP_BIAS; 622 623 my += dy; 624 tmp += w; 625 } while (--h); 626 } 627 628 static void avg_c(pixel *dst, const ptrdiff_t dst_stride, 629 const int16_t *tmp1, const int16_t *tmp2, const int w, int h 630 HIGHBD_DECL_SUFFIX) 631 { 632 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 633 const int sh = intermediate_bits + 1; 634 const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2; 635 do { 636 for (int x = 0; x < w; x++) 637 dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh); 638 639 tmp1 += w; 640 tmp2 += w; 641 dst += PXSTRIDE(dst_stride); 642 } while (--h); 643 } 644 645 static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, 646 const int16_t *tmp1, const int16_t *tmp2, const int w, int h, 647 const int weight HIGHBD_DECL_SUFFIX) 648 { 649 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 650 const int sh = intermediate_bits + 4; 651 const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16; 652 do { 653 for (int x = 0; x < w; x++) 654 dst[x] = iclip_pixel((tmp1[x] * weight + 655 tmp2[x] * (16 - weight) + rnd) >> sh); 656 657 tmp1 += w; 658 tmp2 += w; 659 dst += PXSTRIDE(dst_stride); 660 } while (--h); 661 } 662 663 static void mask_c(pixel *dst, const ptrdiff_t dst_stride, 664 const int16_t *tmp1, const int16_t *tmp2, const int w, int h, 665 const uint8_t 
*mask HIGHBD_DECL_SUFFIX) 666 { 667 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 668 const int sh = intermediate_bits + 6; 669 const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; 670 do { 671 for (int x = 0; x < w; x++) 672 dst[x] = iclip_pixel((tmp1[x] * mask[x] + 673 tmp2[x] * (64 - mask[x]) + rnd) >> sh); 674 675 tmp1 += w; 676 tmp2 += w; 677 mask += w; 678 dst += PXSTRIDE(dst_stride); 679 } while (--h); 680 } 681 682 #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6) 683 static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 684 const int w, int h, const uint8_t *mask) 685 { 686 do { 687 for (int x = 0; x < w; x++) { 688 dst[x] = blend_px(dst[x], tmp[x], mask[x]); 689 } 690 dst += PXSTRIDE(dst_stride); 691 tmp += w; 692 mask += w; 693 } while (--h); 694 } 695 696 static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 697 const int w, int h) 698 { 699 const uint8_t *const mask = &dav1d_obmc_masks[w]; 700 do { 701 for (int x = 0; x < (w * 3) >> 2; x++) { 702 dst[x] = blend_px(dst[x], tmp[x], mask[x]); 703 } 704 dst += PXSTRIDE(dst_stride); 705 tmp += w; 706 } while (--h); 707 } 708 709 static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, 710 const int w, int h) 711 { 712 const uint8_t *mask = &dav1d_obmc_masks[h]; 713 h = (h * 3) >> 2; 714 do { 715 const int m = *mask++; 716 for (int x = 0; x < w; x++) { 717 dst[x] = blend_px(dst[x], tmp[x], m); 718 } 719 dst += PXSTRIDE(dst_stride); 720 tmp += w; 721 } while (--h); 722 } 723 724 static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, 725 const int16_t *tmp1, const int16_t *tmp2, const int w, int h, 726 uint8_t *mask, const int sign, 727 const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) 728 { 729 // store mask at 2x2 resolution, i.e. 
store 2x1 sum for even rows, 730 // and then load this intermediate to calculate final value for odd rows 731 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 732 const int bitdepth = bitdepth_from_max(bitdepth_max); 733 const int sh = intermediate_bits + 6; 734 const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; 735 const int mask_sh = bitdepth + intermediate_bits - 4; 736 const int mask_rnd = 1 << (mask_sh - 5); 737 do { 738 for (int x = 0; x < w; x++) { 739 const int tmpdiff = tmp1[x] - tmp2[x]; 740 const int m = imin(38 + ((abs(tmpdiff) + mask_rnd) >> mask_sh), 64); 741 dst[x] = iclip_pixel((tmpdiff * m + tmp2[x] * 64 + rnd) >> sh); 742 743 if (ss_hor) { 744 x++; 745 746 const int tmpdiff = tmp1[x] - tmp2[x]; 747 const int n = imin(38 + ((abs(tmpdiff) + mask_rnd) >> mask_sh), 64); 748 dst[x] = iclip_pixel((tmpdiff * n + tmp2[x] * 64 + rnd) >> sh); 749 750 if (h & ss_ver) { 751 mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2; 752 } else if (ss_ver) { 753 mask[x >> 1] = m + n; 754 } else { 755 mask[x >> 1] = (m + n + 1 - sign) >> 1; 756 } 757 } else { 758 mask[x] = m; 759 } 760 } 761 762 tmp1 += w; 763 tmp2 += w; 764 dst += PXSTRIDE(dst_stride); 765 if (!ss_ver || (h & 1)) mask += w >> ss_hor; 766 } while (--h); 767 } 768 769 #define w_mask_fns(ssn, ss_hor, ss_ver) \ 770 static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \ 771 const int16_t *const tmp1, const int16_t *const tmp2, \ 772 const int w, const int h, uint8_t *mask, \ 773 const int sign HIGHBD_DECL_SUFFIX) \ 774 { \ 775 w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \ 776 HIGHBD_TAIL_SUFFIX); \ 777 } 778 779 w_mask_fns(444, 0, 0); 780 w_mask_fns(422, 1, 0); 781 w_mask_fns(420, 1, 1); 782 783 #undef w_mask_fns 784 785 #define FILTER_WARP_RND(src, x, F, stride, sh) \ 786 ((F[0] * src[x - 3 * stride] + \ 787 F[1] * src[x - 2 * stride] + \ 788 F[2] * src[x - 1 * stride] + \ 789 F[3] * src[x + 0 * stride] + \ 790 F[4] * src[x + 1 
* stride] + \ 791 F[5] * src[x + 2 * stride] + \ 792 F[6] * src[x + 3 * stride] + \ 793 F[7] * src[x + 4 * stride] + \ 794 ((1 << (sh)) >> 1)) >> (sh)) 795 796 #define FILTER_WARP_CLIP(src, x, F, stride, sh) \ 797 iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh)) 798 799 static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, 800 const pixel *src, const ptrdiff_t src_stride, 801 const int16_t *const abcd, int mx, int my 802 HIGHBD_DECL_SUFFIX) 803 { 804 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 805 int16_t mid[15 * 8], *mid_ptr = mid; 806 807 src -= 3 * PXSTRIDE(src_stride); 808 for (int y = 0; y < 15; y++, mx += abcd[1]) { 809 for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) { 810 const int8_t *const filter = 811 dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)]; 812 813 mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, 814 7 - intermediate_bits); 815 } 816 src += PXSTRIDE(src_stride); 817 mid_ptr += 8; 818 } 819 820 mid_ptr = &mid[3 * 8]; 821 for (int y = 0; y < 8; y++, my += abcd[3]) { 822 for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) { 823 const int8_t *const filter = 824 dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)]; 825 826 dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8, 827 7 + intermediate_bits); 828 } 829 mid_ptr += 8; 830 dst += PXSTRIDE(dst_stride); 831 } 832 } 833 834 static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride, 835 const pixel *src, const ptrdiff_t src_stride, 836 const int16_t *const abcd, int mx, int my 837 HIGHBD_DECL_SUFFIX) 838 { 839 const int intermediate_bits = get_intermediate_bits(bitdepth_max); 840 int16_t mid[15 * 8], *mid_ptr = mid; 841 842 src -= 3 * PXSTRIDE(src_stride); 843 for (int y = 0; y < 15; y++, mx += abcd[1]) { 844 for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) { 845 const int8_t *const filter = 846 dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)]; 847 848 mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, 849 7 - intermediate_bits); 850 } 851 src += 
PXSTRIDE(src_stride); 852 mid_ptr += 8; 853 } 854 855 mid_ptr = &mid[3 * 8]; 856 for (int y = 0; y < 8; y++, my += abcd[3]) { 857 for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) { 858 const int8_t *const filter = 859 dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)]; 860 861 tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS; 862 } 863 mid_ptr += 8; 864 tmp += tmp_stride; 865 } 866 } 867 868 static void emu_edge_c(const intptr_t bw, const intptr_t bh, 869 const intptr_t iw, const intptr_t ih, 870 const intptr_t x, const intptr_t y, 871 pixel *dst, const ptrdiff_t dst_stride, 872 const pixel *ref, const ptrdiff_t ref_stride) 873 { 874 // find offset in reference of visible block to copy 875 ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) + 876 iclip((int) x, 0, (int) iw - 1); 877 878 // number of pixels to extend (left, right, top, bottom) 879 const int left_ext = iclip((int) -x, 0, (int) bw - 1); 880 const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1); 881 assert(left_ext + right_ext < bw); 882 const int top_ext = iclip((int) -y, 0, (int) bh - 1); 883 const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1); 884 assert(top_ext + bottom_ext < bh); 885 886 // copy visible portion first 887 pixel *blk = dst + top_ext * PXSTRIDE(dst_stride); 888 const int center_w = (int) (bw - left_ext - right_ext); 889 const int center_h = (int) (bh - top_ext - bottom_ext); 890 for (int y = 0; y < center_h; y++) { 891 pixel_copy(blk + left_ext, ref, center_w); 892 // extend left edge for this line 893 if (left_ext) 894 pixel_set(blk, blk[left_ext], left_ext); 895 // extend right edge for this line 896 if (right_ext) 897 pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1], 898 right_ext); 899 ref += PXSTRIDE(ref_stride); 900 blk += PXSTRIDE(dst_stride); 901 } 902 903 // copy top 904 blk = dst + top_ext * PXSTRIDE(dst_stride); 905 for (int y = 0; y < top_ext; y++) { 906 pixel_copy(dst, blk, bw); 907 dst += 
PXSTRIDE(dst_stride); 908 } 909 910 // copy bottom 911 dst += center_h * PXSTRIDE(dst_stride); 912 for (int y = 0; y < bottom_ext; y++) { 913 pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw); 914 dst += PXSTRIDE(dst_stride); 915 } 916 } 917 918 static void resize_c(pixel *dst, const ptrdiff_t dst_stride, 919 const pixel *src, const ptrdiff_t src_stride, 920 const int dst_w, int h, const int src_w, 921 const int dx, const int mx0 HIGHBD_DECL_SUFFIX) 922 { 923 do { 924 int mx = mx0, src_x = -1; 925 for (int x = 0; x < dst_w; x++) { 926 const int8_t *const F = dav1d_resize_filter[mx >> 8]; 927 dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] + 928 F[1] * src[iclip(src_x - 2, 0, src_w - 1)] + 929 F[2] * src[iclip(src_x - 1, 0, src_w - 1)] + 930 F[3] * src[iclip(src_x + 0, 0, src_w - 1)] + 931 F[4] * src[iclip(src_x + 1, 0, src_w - 1)] + 932 F[5] * src[iclip(src_x + 2, 0, src_w - 1)] + 933 F[6] * src[iclip(src_x + 3, 0, src_w - 1)] + 934 F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) + 935 64) >> 7); 936 mx += dx; 937 src_x += mx >> 14; 938 mx &= 0x3fff; 939 } 940 941 dst += PXSTRIDE(dst_stride); 942 src += PXSTRIDE(src_stride); 943 } while (--h); 944 } 945 946 #if HAVE_ASM 947 #if ARCH_AARCH64 || ARCH_ARM 948 #include "src/arm/mc.h" 949 #elif ARCH_LOONGARCH64 950 #include "src/loongarch/mc.h" 951 #elif ARCH_PPC64LE 952 #include "src/ppc/mc.h" 953 #elif ARCH_RISCV 954 #include "src/riscv/mc.h" 955 #elif ARCH_X86 956 #include "src/x86/mc.h" 957 #endif 958 #endif 959 960 COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { 961 #define init_mc_fns(type, name) do { \ 962 c->mc [type] = put_##name##_c; \ 963 c->mc_scaled [type] = put_##name##_scaled_c; \ 964 c->mct [type] = prep_##name##_c; \ 965 c->mct_scaled[type] = prep_##name##_scaled_c; \ 966 } while (0) 967 968 init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular); 969 init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth); 970 init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 
8tap_regular_sharp); 971 init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular); 972 init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth); 973 init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp); 974 init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular); 975 init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth); 976 init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp); 977 init_mc_fns(FILTER_2D_BILINEAR, bilin); 978 979 c->avg = avg_c; 980 c->w_avg = w_avg_c; 981 c->mask = mask_c; 982 c->blend = blend_c; 983 c->blend_v = blend_v_c; 984 c->blend_h = blend_h_c; 985 c->w_mask[0] = w_mask_444_c; 986 c->w_mask[1] = w_mask_422_c; 987 c->w_mask[2] = w_mask_420_c; 988 c->warp8x8 = warp_affine_8x8_c; 989 c->warp8x8t = warp_affine_8x8t_c; 990 c->emu_edge = emu_edge_c; 991 c->resize = resize_c; 992 993 #if HAVE_ASM 994 #if ARCH_AARCH64 || ARCH_ARM 995 mc_dsp_init_arm(c); 996 #elif ARCH_LOONGARCH64 997 mc_dsp_init_loongarch(c); 998 #elif ARCH_PPC64LE 999 mc_dsp_init_ppc(c); 1000 #elif ARCH_RISCV 1001 mc_dsp_init_riscv(c); 1002 #elif ARCH_X86 1003 mc_dsp_init_x86(c); 1004 #endif 1005 #endif 1006 }