/*
 * Copyright © 2019, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */ 26 27 #include "src/ppc/dav1d_types.h" 28 #include "src/ppc/cdef.h" 29 30 #if BITDEPTH == 8 31 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, 32 const uint16_t shift) 33 { 34 const i16x8 zero = vec_splat_s16(0); 35 if (!threshold) return zero; 36 const i16x8 abs_diff = vec_abs(diff); 37 const b16x8 mask = vec_cmplt(diff, zero); 38 const i16x8 thr = vec_splats(threshold); 39 const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift))); 40 const i16x8 max = vec_max(zero, sub); 41 const i16x8 min = vec_min(abs_diff, max); 42 const i16x8 neg = vec_sub(zero, min); 43 return vec_sel(min, neg, mask); 44 } 45 46 static inline void copy4xN(uint16_t *tmp, 47 const uint8_t *src, const ptrdiff_t src_stride, 48 const uint8_t (*left)[2], const uint8_t *const top, 49 const uint8_t *const bottom, const int w, const int h, 50 const enum CdefEdgeFlags edges) 51 { 52 const u16x8 fill = vec_splats((uint16_t)INT16_MAX); 53 54 u16x8 l0; 55 u16x8 l1; 56 57 int y_start = -2, y_end = h + 2; 58 59 // Copy top and bottom first 60 if (!(edges & CDEF_HAVE_TOP)) { 61 l0 = fill; 62 l1 = fill; 63 y_start = 0; 64 } else { 65 l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2)); 66 l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2)); 67 } 68 69 vec_st(l0, 0, tmp - 2 * 8); 70 vec_st(l1, 0, tmp - 1 * 8); 71 72 if (!(edges & CDEF_HAVE_BOTTOM)) { 73 l0 = fill; 74 l1 = fill; 75 y_end -= 2; 76 } else { 77 l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2)); 78 l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2)); 79 } 80 81 vec_st(l0, 0, tmp + (h + 0) * 8); 82 vec_st(l1, 0, tmp + (h + 1) * 8); 83 84 int y_with_left_edge = 0; 85 if (!(edges & CDEF_HAVE_LEFT)) { 86 u16x8 l = u8h_to_u16(vec_vsx_ld(0, src)); 87 vec_vsx_st(l, 0, tmp + 2); 88 89 y_with_left_edge = 1; 90 } 91 92 for (int y = y_with_left_edge; y < h; y++) { 93 u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride)); 94 vec_st(l, 0, tmp + y * 8); 95 } 96 97 if (!(edges & 
CDEF_HAVE_LEFT)) { 98 for (int y = y_start; y < y_end; y++) { 99 tmp[y * 8] = INT16_MAX; 100 tmp[1 + y * 8] = INT16_MAX; 101 } 102 } else { 103 for (int y = 0; y < h; y++) { 104 tmp[y * 8] = left[y][0]; 105 tmp[1 + y * 8] = left[y][1]; 106 } 107 } 108 if (!(edges & CDEF_HAVE_RIGHT)) { 109 for (int y = y_start; y < y_end; y++) { 110 tmp[- 2 + (y + 1) * 8] = INT16_MAX; 111 tmp[- 1 + (y + 1) * 8] = INT16_MAX; 112 } 113 } 114 } 115 116 static inline void copy8xN(uint16_t *tmp, 117 const uint8_t *src, const ptrdiff_t src_stride, 118 const uint8_t (*left)[2], const uint8_t *const top, 119 const uint8_t *const bottom, const int w, const int h, 120 const enum CdefEdgeFlags edges) 121 { 122 const u16x8 fill = vec_splats((uint16_t)INT16_MAX); 123 124 u16x8 l0h, l0l; 125 u16x8 l1h, l1l; 126 127 int y_start = -2, y_end = h + 2; 128 129 // Copy top and bottom first 130 if (!(edges & CDEF_HAVE_TOP)) { 131 l0h = fill; 132 l0l = fill; 133 l1h = fill; 134 l1l = fill; 135 y_start = 0; 136 } else { 137 u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2); 138 u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2); 139 l0h = u8h_to_u16(l0); 140 l0l = u8l_to_u16(l0); 141 l1h = u8h_to_u16(l1); 142 l1l = u8l_to_u16(l1); 143 } 144 145 vec_st(l0h, 0, tmp - 4 * 8); 146 vec_st(l0l, 0, tmp - 3 * 8); 147 vec_st(l1h, 0, tmp - 2 * 8); 148 vec_st(l1l, 0, tmp - 1 * 8); 149 150 if (!(edges & CDEF_HAVE_BOTTOM)) { 151 l0h = fill; 152 l0l = fill; 153 l1h = fill; 154 l1l = fill; 155 y_end -= 2; 156 } else { 157 u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2); 158 u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2); 159 l0h = u8h_to_u16(l0); 160 l0l = u8l_to_u16(l0); 161 l1h = u8h_to_u16(l1); 162 l1l = u8l_to_u16(l1); 163 } 164 165 vec_st(l0h, 0, tmp + (h + 0) * 16); 166 vec_st(l0l, 0, tmp + (h + 0) * 16 + 8); 167 vec_st(l1h, 0, tmp + (h + 1) * 16); 168 vec_st(l1l, 0, tmp + (h + 1) * 16 + 8); 169 170 int y_with_left_edge = 0; 171 if (!(edges & CDEF_HAVE_LEFT)) { 172 u8x16 l = vec_vsx_ld(0, src); 173 
u16x8 lh = u8h_to_u16(l); 174 u16x8 ll = u8l_to_u16(l); 175 vec_vsx_st(lh, 0, tmp + 2); 176 vec_vsx_st(ll, 0, tmp + 8 + 2); 177 178 y_with_left_edge = 1; 179 } 180 181 for (int y = y_with_left_edge; y < h; y++) { 182 u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride); 183 u16x8 lh = u8h_to_u16(l); 184 u16x8 ll = u8l_to_u16(l); 185 vec_st(lh, 0, tmp + y * 16); 186 vec_st(ll, 0, tmp + 8 + y * 16); 187 } 188 189 if (!(edges & CDEF_HAVE_LEFT)) { 190 for (int y = y_start; y < y_end; y++) { 191 tmp[y * 16] = INT16_MAX; 192 tmp[1 + y * 16] = INT16_MAX; 193 } 194 } else { 195 for (int y = 0; y < h; y++) { 196 tmp[y * 16] = left[y][0]; 197 tmp[1 + y * 16] = left[y][1]; 198 } 199 } 200 if (!(edges & CDEF_HAVE_RIGHT)) { 201 for (int y = y_start; y < y_end; y++) { 202 tmp[- 6 + (y + 1) * 16] = INT16_MAX; 203 tmp[- 5 + (y + 1) * 16] = INT16_MAX; 204 } 205 } 206 } 207 208 static inline i16x8 max_mask(i16x8 a, i16x8 b) { 209 const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX); 210 211 const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX); 212 213 const i16x8 val = vec_sel(a, b, mask); 214 215 return vec_max(val, b); 216 } 217 218 #define LOAD_PIX(addr) \ 219 const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \ 220 i16x8 sum = vec_splat_s16(0); 221 222 #define LOAD_PIX4(addr) \ 223 const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \ 224 const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \ 225 const i16x8 px = vec_xxpermdi(a, b, 0); \ 226 i16x8 sum = vec_splat_s16(0); 227 228 #define LOAD_DIR(p, addr, o0, o1) \ 229 const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \ 230 const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \ 231 const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \ 232 const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1); 233 234 #define LOAD_DIR4(p, addr, o0, o1) \ 235 LOAD_DIR(p ## a, addr, o0, o1) \ 236 LOAD_DIR(p ## b, addr + 8, o0, o1) \ 237 const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \ 238 const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 
1, 0); \ 239 const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \ 240 const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0); 241 242 #define CONSTRAIN(p, strength, shift) \ 243 const i16x8 p ## _d0 = vec_sub(p ## 0, px); \ 244 const i16x8 p ## _d1 = vec_sub(p ## 1, px); \ 245 const i16x8 p ## _d2 = vec_sub(p ## 2, px); \ 246 const i16x8 p ## _d3 = vec_sub(p ## 3, px); \ 247 \ 248 i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \ 249 i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \ 250 i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \ 251 i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift); 252 253 #define SETUP_MINMAX \ 254 i16x8 max = px; \ 255 i16x8 min = px; \ 256 257 #define MIN_MAX(p) \ 258 max = max_mask(p ## 0, max); \ 259 min = vec_min(p ## 0, min); \ 260 max = max_mask(p ## 1, max); \ 261 min = vec_min(p ## 1, min); \ 262 max = max_mask(p ## 2, max); \ 263 min = vec_min(p ## 2, min); \ 264 max = max_mask(p ## 3, max); \ 265 min = vec_min(p ## 3, min); 266 267 #define MAKE_TAPS \ 268 const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \ 269 const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \ 270 const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd)); 271 272 #define PRI_0_UPDATE_SUM(p) \ 273 sum = vec_madd(tap0, p ## _c0, sum); \ 274 sum = vec_madd(tap0, p ## _c1, sum); \ 275 sum = vec_madd(tap1, p ## _c2, sum); \ 276 sum = vec_madd(tap1, p ## _c3, sum); 277 278 #define UPDATE_SUM(p) \ 279 const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \ 280 const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \ 281 sum = vec_add(sum, p ## sum0); \ 282 sum = vec_add(sum, p ## sum1); 283 284 #define SEC_0_UPDATE_SUM(p) \ 285 sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \ 286 sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \ 287 sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \ 288 sum = vec_madd(vec_splat_s16(2), p ## _c3, sum); 289 290 #define BIAS \ 291 i16x8 bias = vec_and((i16x8)vec_cmplt(sum, 
vec_splat_s16(0)), vec_splat_s16(1)); \ 292 bias = vec_sub(vec_splat_s16(8), bias); \ 293 294 #define STORE4 \ 295 dst[0] = vdst[0]; \ 296 dst[1] = vdst[1]; \ 297 dst[2] = vdst[2]; \ 298 dst[3] = vdst[3]; \ 299 \ 300 tmp += 8; \ 301 dst += PXSTRIDE(dst_stride); \ 302 dst[0] = vdst[4]; \ 303 dst[1] = vdst[5]; \ 304 dst[2] = vdst[6]; \ 305 dst[3] = vdst[7]; \ 306 \ 307 tmp += 8; \ 308 dst += PXSTRIDE(dst_stride); 309 310 #define STORE4_CLAMPED \ 311 BIAS \ 312 i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ 313 i16x8 vdst = vec_max(vec_min(unclamped, max), min); \ 314 STORE4 315 316 #define STORE4_UNCLAMPED \ 317 BIAS \ 318 i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ 319 STORE4 320 321 #define STORE8 \ 322 dst[0] = vdst[0]; \ 323 dst[1] = vdst[1]; \ 324 dst[2] = vdst[2]; \ 325 dst[3] = vdst[3]; \ 326 dst[4] = vdst[4]; \ 327 dst[5] = vdst[5]; \ 328 dst[6] = vdst[6]; \ 329 dst[7] = vdst[7]; \ 330 \ 331 tmp += 16; \ 332 dst += PXSTRIDE(dst_stride); 333 334 #define STORE8_CLAMPED \ 335 BIAS \ 336 i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ 337 i16x8 vdst = vec_max(vec_min(unclamped, max), min); \ 338 STORE8 339 340 #define STORE8_UNCLAMPED \ 341 BIAS \ 342 i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ 343 STORE8 344 345 #define DIRECTIONS(w, tmp_stride) \ 346 static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \ 347 { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \ 348 { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \ 349 { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, \ 350 { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, \ 351 { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, \ 352 { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, \ 353 { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, \ 354 { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } \ 355 }; 356 357 DIRECTIONS(4, 8) 358 DIRECTIONS(8, 16) 359 360 static inline void 361 filter_4xN(pixel *dst, const ptrdiff_t 
dst_stride, 362 const pixel (*left)[2], const pixel *const top, 363 const pixel *const bottom, const int w, const int h, 364 const int pri_strength, const int sec_strength, const int dir, 365 const int pri_shift, const int sec_shift, 366 const enum CdefEdgeFlags edges, uint16_t *tmp) 367 { 368 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 369 const int off1 = cdef_directions4[dir][0]; 370 const int off1_1 = cdef_directions4[dir][1]; 371 372 const int off2 = cdef_directions4[(dir + 2) & 7][0]; 373 const int off3 = cdef_directions4[(dir + 6) & 7][0]; 374 375 const int off2_1 = cdef_directions4[(dir + 2) & 7][1]; 376 const int off3_1 = cdef_directions4[(dir + 6) & 7][1]; 377 378 MAKE_TAPS 379 380 for (int y = 0; y < h / 2; y++) { 381 LOAD_PIX4(tmp) 382 383 SETUP_MINMAX 384 385 // Primary pass 386 LOAD_DIR4(p, tmp, off1, off1_1) 387 388 CONSTRAIN(p, pri_strength, pri_shift) 389 390 MIN_MAX(p) 391 392 PRI_0_UPDATE_SUM(p) 393 394 // Secondary pass 1 395 LOAD_DIR4(s, tmp, off2, off3) 396 397 CONSTRAIN(s, sec_strength, sec_shift) 398 399 MIN_MAX(s) 400 401 SEC_0_UPDATE_SUM(s) 402 403 // Secondary pass 2 404 LOAD_DIR4(s2, tmp, off2_1, off3_1) 405 406 CONSTRAIN(s2, sec_strength, sec_shift) 407 408 MIN_MAX(s2) 409 410 UPDATE_SUM(s2) 411 412 // Store 413 STORE4_CLAMPED 414 } 415 } 416 417 static inline void 418 filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride, 419 const pixel (*left)[2], const pixel *const top, 420 const pixel *const bottom, const int w, const int h, 421 const int pri_strength, const int dir, 422 const int pri_shift, const enum CdefEdgeFlags edges, 423 uint16_t *tmp) 424 { 425 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 426 const int off1 = cdef_directions4[dir][0]; 427 const int off1_1 = cdef_directions4[dir][1]; 428 429 MAKE_TAPS 430 431 for (int y = 0; y < h / 2; y++) { 432 LOAD_PIX4(tmp) 433 434 // Primary pass 435 LOAD_DIR4(p, tmp, off1, off1_1) 436 437 CONSTRAIN(p, pri_strength, pri_shift) 438 439 
PRI_0_UPDATE_SUM(p) 440 441 STORE4_UNCLAMPED 442 } 443 } 444 445 static inline void 446 filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride, 447 const pixel (*left)[2], const pixel *const top, 448 const pixel *const bottom, const int w, const int h, 449 const int sec_strength, const int dir, 450 const int sec_shift, const enum CdefEdgeFlags edges, 451 uint16_t *tmp) 452 { 453 const int off2 = cdef_directions4[(dir + 2) & 7][0]; 454 const int off3 = cdef_directions4[(dir + 6) & 7][0]; 455 456 const int off2_1 = cdef_directions4[(dir + 2) & 7][1]; 457 const int off3_1 = cdef_directions4[(dir + 6) & 7][1]; 458 459 for (int y = 0; y < h / 2; y++) { 460 LOAD_PIX4(tmp) 461 // Secondary pass 1 462 LOAD_DIR4(s, tmp, off2, off3) 463 464 CONSTRAIN(s, sec_strength, sec_shift) 465 466 SEC_0_UPDATE_SUM(s) 467 468 // Secondary pass 2 469 LOAD_DIR4(s2, tmp, off2_1, off3_1) 470 471 CONSTRAIN(s2, sec_strength, sec_shift) 472 473 UPDATE_SUM(s2) 474 475 STORE4_UNCLAMPED 476 } 477 } 478 479 static inline void 480 filter_8xN(pixel *dst, const ptrdiff_t dst_stride, 481 const pixel (*left)[2], const pixel *const top, 482 const pixel *const bottom, const int w, const int h, 483 const int pri_strength, const int sec_strength, const int dir, 484 const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges, 485 uint16_t *tmp) 486 { 487 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 488 489 const int off1 = cdef_directions8[dir][0]; 490 const int off1_1 = cdef_directions8[dir][1]; 491 492 const int off2 = cdef_directions8[(dir + 2) & 7][0]; 493 const int off3 = cdef_directions8[(dir + 6) & 7][0]; 494 495 const int off2_1 = cdef_directions8[(dir + 2) & 7][1]; 496 const int off3_1 = cdef_directions8[(dir + 6) & 7][1]; 497 498 MAKE_TAPS 499 500 for (int y = 0; y < h; y++) { 501 LOAD_PIX(tmp) 502 503 SETUP_MINMAX 504 505 // Primary pass 506 LOAD_DIR(p, tmp, off1, off1_1) 507 508 CONSTRAIN(p, pri_strength, pri_shift) 509 510 MIN_MAX(p) 511 512 PRI_0_UPDATE_SUM(p) 513 
514 // Secondary pass 1 515 LOAD_DIR(s, tmp, off2, off3) 516 517 CONSTRAIN(s, sec_strength, sec_shift) 518 519 MIN_MAX(s) 520 521 SEC_0_UPDATE_SUM(s) 522 523 // Secondary pass 2 524 LOAD_DIR(s2, tmp, off2_1, off3_1) 525 526 CONSTRAIN(s2, sec_strength, sec_shift) 527 528 MIN_MAX(s2) 529 530 UPDATE_SUM(s2) 531 532 // Store 533 STORE8_CLAMPED 534 } 535 536 } 537 538 static inline void 539 filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride, 540 const pixel (*left)[2], const pixel *const top, 541 const pixel *const bottom, const int w, const int h, 542 const int pri_strength, const int dir, 543 const int pri_shift, const enum CdefEdgeFlags edges, 544 uint16_t *tmp) 545 { 546 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 547 const int off1 = cdef_directions8[dir][0]; 548 const int off1_1 = cdef_directions8[dir][1]; 549 550 MAKE_TAPS 551 552 for (int y = 0; y < h; y++) { 553 LOAD_PIX(tmp) 554 555 // Primary pass 556 LOAD_DIR(p, tmp, off1, off1_1) 557 558 CONSTRAIN(p, pri_strength, pri_shift) 559 560 PRI_0_UPDATE_SUM(p) 561 562 STORE8_UNCLAMPED 563 } 564 } 565 566 static inline void 567 filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride, 568 const pixel (*left)[2], const pixel *const top, 569 const pixel *const bottom, const int w, const int h, 570 const int sec_strength, const int dir, 571 const int sec_shift, const enum CdefEdgeFlags edges, 572 uint16_t *tmp) 573 { 574 const int off2 = cdef_directions8[(dir + 2) & 7][0]; 575 const int off3 = cdef_directions8[(dir + 6) & 7][0]; 576 577 const int off2_1 = cdef_directions8[(dir + 2) & 7][1]; 578 const int off3_1 = cdef_directions8[(dir + 6) & 7][1]; 579 580 for (int y = 0; y < h; y++) { 581 LOAD_PIX(tmp) 582 583 // Secondary pass 1 584 LOAD_DIR(s, tmp, off2, off3) 585 586 CONSTRAIN(s, sec_strength, sec_shift) 587 588 SEC_0_UPDATE_SUM(s) 589 590 // Secondary pass 2 591 LOAD_DIR(s2, tmp, off2_1, off3_1) 592 593 CONSTRAIN(s2, sec_strength, sec_shift) 594 595 UPDATE_SUM(s2) 596 597 STORE8_UNCLAMPED 598 } 
599 } 600 601 #define cdef_fn(w, h, tmp_stride) \ 602 void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ 603 const ptrdiff_t dst_stride, \ 604 const pixel (*left)[2], \ 605 const pixel *const top, \ 606 const pixel *const bottom, \ 607 const int pri_strength, \ 608 const int sec_strength, \ 609 const int dir, \ 610 const int damping, \ 611 const enum CdefEdgeFlags edges) \ 612 { \ 613 ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ 614 uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ 615 copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \ 616 if (pri_strength) { \ 617 const int pri_shift = imax(0, damping - ulog2(pri_strength)); \ 618 if (sec_strength) { \ 619 const int sec_shift = damping - ulog2(sec_strength); \ 620 filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ 621 sec_strength, dir, pri_shift, sec_shift, edges, tmp); \ 622 } else { \ 623 filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ 624 dir, pri_shift, edges, tmp); \ 625 } \ 626 } else { \ 627 const int sec_shift = damping - ulog2(sec_strength); \ 628 filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \ 629 dir, sec_shift, edges, tmp); \ 630 } \ 631 } 632 633 cdef_fn(4, 4, 8); 634 cdef_fn(4, 8, 8); 635 cdef_fn(8, 8, 16); 636 #endif