cdef.S
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"

// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
        tst             r7,  #1                 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2                 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        ldrh            r12, [\s1, #-2]
        vldr            \n1, [\s1]
        vdup.16         d4,  r12
        ldrh            r12, [\s1, #\w]
        vmov.16         d4[1], r12
        ldrh            r12, [\s2, #-2]
        vldr            \n2, [\s2]
        vmov.16         d4[2], r12
        ldrh            r12, [\s2, #\w]
        vmovl.u8        q0,  d0
        vmov.16         d4[3], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s8,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s10, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s11, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ldrh            r12, [\s1, #-2]
        vldr            \n1, [\s1]
        vdup.16         d4,  r12
        ldrh            r12, [\s2, #-2]
        vldr            \n2, [\s2]
        vmovl.u8        q0,  d0
        vmov.16         d4[1], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s8,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s9,  [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst             r7,  #2                 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        vldr            \n1, [\s1]
        ldrh            r12, [\s1, #\w]
        vldr            \n2, [\s2]
        vdup.16         d4,  r12
        ldrh            r12, [\s2, #\w]
        vmovl.u8        q0,  d0
        vmov.16         d4[1], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s8,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        vldr            \n1, [\s1]
        vldr            \n2, [\s2]
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
.endif
3:
.endm

.macro load_n_incr dst, src, incr, w
.if \w == 4
        vld1.32         {\dst\()[0]}, [\src, :32], \incr
.else
        vld1.8          {\dst\()},    [\src, :64], \incr
.endif
.endm

// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_8bpc_neon, export=1
        push            {r4-r8,lr}
        ldrd            r4,  r5,  [sp, #24]
        ldrd            r6,  r7,  [sp, #32]
        cmp             r7,  #0xf               // fully edged
        beq             cdef_padding\w\()_edged_8bpc_neon
        vmov.i16        q3,  #0x8000
        tst             r7,  #4                 // CDEF_HAVE_TOP
        bne             1f
        // !CDEF_HAVE_TOP
        sub             r12, r0,  #2*(2*\stride+2)
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        b               3f
1:
        // CDEF_HAVE_TOP
        add             r8,  r4,  r2
        sub             r0,  r0,  #2*(2*\stride)
        pad_top_bottom  r4,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0

        // Middle section
3:
        tst             r7,  #1                 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2                 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        vld1.16         {d2[]}, [r3, :16]!
        ldrh            r12, [r1, #\w]
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmov.16         d2[1], r12
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s4,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s5,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        vld1.16         {d2[]}, [r3, :16]!
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s4,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b
        b               3f
2:
        tst             r7,  #2                 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ldrh            r12, [r1, #\w]
        load_n_incr     d0,  r1,  r2,  \w
        vdup.16         d2,  r12
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s4,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b

3:
        tst             r7,  #8                 // CDEF_HAVE_BOTTOM
        bne             1f
        // !CDEF_HAVE_BOTTOM
        sub             r12, r0,  #4
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        pop             {r4-r8,pc}
1:
        // CDEF_HAVE_BOTTOM
        add             r8,  r5,  r2
        pad_top_bottom  r5,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
endfunc
.endm

padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8,  s0, d0, s4, d2, 64

// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

.macro padding_func_edged w, stride, reg, align
function cdef_padding\w\()_edged_8bpc_neon
        sub             r0,  r0,  #(2*\stride)

        ldrh            r12, [r4, #-2]
        vldr            \reg, [r4]
        add             r8,  r4,  r2
        strh            r12, [r0, #-2]
        ldrh            r12, [r4, #\w]
        vstr            \reg, [r0]
        strh            r12, [r0, #\w]

        ldrh            r12, [r8, #-2]
        vldr            \reg, [r8]
        strh            r12, [r0, #\stride-2]
        ldrh            r12, [r8, #\w]
        vstr            \reg, [r0, #\stride]
        strh            r12, [r0, #\stride+\w]
        add             r0,  r0,  #2*\stride

0:
        ldrh            r12, [r3], #2
        vldr            \reg, [r1]
        str             r12, [r0, #-2]
        ldrh            r12, [r1, #\w]
        add             r1,  r1,  r2
        subs            r6,  r6,  #1
        vstr            \reg, [r0]
        str             r12, [r0, #\w]
        add             r0,  r0,  #\stride
        bgt             0b

        ldrh            r12, [r5, #-2]
        vldr            \reg, [r5]
        add             r8,  r5,  r2
        strh            r12, [r0, #-2]
        ldrh            r12, [r5, #\w]
        vstr            \reg, [r0]
        strh            r12, [r0, #\w]

        ldrh            r12, [r8, #-2]
        vldr            \reg, [r8]
        strh            r12, [r0, #\stride-2]
        ldrh            r12, [r8, #\w]
        vstr            \reg, [r0, #\stride]
        strh            r12, [r0, #\stride+\w]

        pop             {r4-r8,pc}
endfunc
.endm

padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8,  s0, 32

tables

filter 8, 8
filter 4, 8

find_dir 8

.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
        add             r6,  r2,  r9            // x + off
        sub             r9,  r2,  r9            // x - off
        vld1.8          {\d11}, [r6]            // p0
        add             r6,  r6,  #16           // += stride
        vld1.8          {\d21}, [r9]            // p1
        add             r9,  r9,  #16           // += stride
        vld1.8          {\d12}, [r6]            // p0
        vld1.8          {\d22}, [r9]            // p1
.else
        add             r6,  r2,  r9            // x + off
        sub             r9,  r2,  r9            // x - off
        vld1.32         {\d11[0]}, [r6]         // p0
        add             r6,  r6,  #8            // += stride
        vld1.32         {\d21[0]}, [r9]         // p1
        add             r9,  r9,  #8            // += stride
        vld1.32         {\d11[1]}, [r6]         // p0
        add             r6,  r6,  #8            // += stride
        vld1.32         {\d21[1]}, [r9]         // p1
        add             r9,  r9,  #8            // += stride
        vld1.32         {\d12[0]}, [r6]         // p0
        add             r6,  r6,  #8            // += stride
        vld1.32         {\d22[0]}, [r9]         // p1
        add             r9,  r9,  #8            // += stride
        vld1.32         {\d12[1]}, [r6]         // p0
        vld1.32         {\d22[1]}, [r9]         // p1
.endif
.endm
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
        vmin.u8         q3,  q3,  \s1
        vmax.u8         q4,  q4,  \s1
        vmin.u8         q3,  q3,  \s2
        vmax.u8         q4,  q4,  \s2
.endif
        vabd.u8         q8,  q0,  \s1           // abs(diff)
        vabd.u8         q11, q0,  \s2           // abs(diff)
        vshl.u8         q9,  q8,  \shift        // abs(diff) >> shift
        vshl.u8         q12, q11, \shift        // abs(diff) >> shift
        vqsub.u8        q9,  \thresh_vec, q9    // clip = imax(0, threshold - (abs(diff) >> shift))
        vqsub.u8        q12, \thresh_vec, q12   // clip = imax(0, threshold - (abs(diff) >> shift))
        vcgt.u8         q10, q0,  \s1           // px > p0
        vcgt.u8         q13, q0,  \s2           // px > p1
        vmin.u8         q9,  q9,  q8            // imin(abs(diff), clip)
        vmin.u8         q12, q12, q11           // imin(abs(diff), clip)
        vneg.s8         q8,  q9                 // -imin()
        vneg.s8         q11, q12                // -imin()
        vbsl            q10, q8,  q9            // constrain() = imax(imin(diff, clip), -clip)
        vdup.8          d18, \tap               // taps[k]
        vbsl            q13, q11, q12           // constrain() = imax(imin(diff, clip), -clip)
        vmlal.s8        q1,  d20, d18           // sum += taps[k] * constrain()
        vmlal.s8        q1,  d26, d18           // sum += taps[k] * constrain()
        vmlal.s8        q2,  d21, d18           // sum += taps[k] * constrain()
        vmlal.s8        q2,  d27, d18           // sum += taps[k] * constrain()
.endm

// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
//                              const uint16_t *tmp, int pri_strength,
//                              int sec_strength, int dir, int damping,
//                              int h, size_t edges);
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_neon
.if \pri
        movrel_local    r8,  pri_taps
        and             r9,  r3,  #1
        add             r8,  r8,  r9,  lsl #1
.endif
        movrel_local    r9,  directions\w
        add             r5,  r9,  r5,  lsl #1
        vmov.u8         d17, #7
        vdup.8          d16, r6                 // damping

        vmov.8          d8[0], r3
        vmov.8          d8[1], r4
        vclz.i8         d8,  d8                 // clz(threshold)
        vsub.i8         d8,  d17, d8            // ulog2(threshold)
        vqsub.u8        d8,  d16, d8            // shift = imax(0, damping - ulog2(threshold))
        vneg.s8         d8,  d8                 // -shift
.if \sec
        vdup.8          q6,  d8[1]
.endif
.if \pri
        vdup.8          q5,  d8[0]
.endif

1:
.if \w == 8
        add             r12, r2,  #16
        vld1.8          {d0}, [r2, :64]         // px
        vld1.8          {d1}, [r12, :64]        // px
.else
        add             r12, r2,  #8
        vld1.32         {d0[0]}, [r2, :32]      // px
        add             r9,  r2,  #2*8
        vld1.32         {d0[1]}, [r12, :32]     // px
        add             r12, r12, #2*8
        vld1.32         {d1[0]}, [r9, :32]      // px
        vld1.32         {d1[1]}, [r12, :32]     // px
.endif

        vmov.u8         q1,  #0                 // sum
        vmov.u8         q2,  #0                 // sum
.if \min
        vmov.u16        q3,  q0                 // min
        vmov.u16        q4,  q0                 // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             lr,  #2                 // sec_taps[0]

2:
.if \pri
        ldrsb           r9,  [r5]               // off1

        load_px_8       d28, d29, d30, d31, \w
.endif

.if \sec
        add             r5,  r5,  #4            // +2*2
        ldrsb           r9,  [r5]               // off2
.endif

.if \pri
        ldrb            r12, [r8]               // *pri_taps
        vdup.8          q7,  r3                 // threshold

        handle_pixel_8  q14, q15, q7,  q5,  r12, \min
.endif

.if \sec
        load_px_8       d28, d29, d30, d31, \w

        add             r5,  r5,  #8            // +2*4
        ldrsb           r9,  [r5]               // off3

        vdup.8          q7,  r4                 // threshold

        handle_pixel_8  q14, q15, q7,  q6,  lr,  \min

        load_px_8       d28, d29, d30, d31, \w

        handle_pixel_8  q14, q15, q7,  q6,  lr,  \min

        sub             r5,  r5,  #11           // r5 -= 2*(2+4); r5 += 1;
.else
        add             r5,  r5,  #1            // r5 += 1
.endif
        subs            lr,  lr,  #1            // sec_tap-- (value)
.if \pri
        add             r8,  r8,  #1            // pri_taps++ (pointer)
.endif
        bne             2b

        vshr.s16        q14, q1,  #15           // -(sum < 0)
        vshr.s16        q15, q2,  #15           // -(sum < 0)
        vadd.i16        q1,  q1,  q14           // sum - (sum < 0)
        vadd.i16        q2,  q2,  q15           // sum - (sum < 0)
        vrshr.s16       q1,  q1,  #4            // (8 + sum - (sum < 0)) >> 4
        vrshr.s16       q2,  q2,  #4            // (8 + sum - (sum < 0)) >> 4
        vaddw.u8        q1,  q1,  d0            // px + (8 + sum ...) >> 4
        vaddw.u8        q2,  q2,  d1            // px + (8 + sum ...) >> 4
        vqmovun.s16     d0,  q1
        vqmovun.s16     d1,  q2
.if \min
        vmin.u8         q0,  q0,  q4
        vmax.u8         q0,  q0,  q3            // iclip(px + .., min, max)
.endif
.if \w == 8
        vst1.8          {d0}, [r0, :64], r1
        add             r2,  r2,  #2*16         // tmp += 2*tmp_stride
        subs            r7,  r7,  #2            // h -= 2
        vst1.8          {d1}, [r0, :64], r1
.else
        vst1.32         {d0[0]}, [r0, :32], r1
        add             r2,  r2,  #4*8          // tmp += 4*tmp_stride
        vst1.32         {d0[1]}, [r0, :32], r1
        subs            r7,  r7,  #4            // h -= 4
        vst1.32         {d1[0]}, [r0, :32], r1
        vst1.32         {d1[1]}, [r0, :32], r1
.endif

        // Reset pri_taps and directions back to the original point
        sub             r5,  r5,  #2
.if \pri
        sub             r8,  r8,  #2
.endif

        bgt             1b
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro filter_8 w
        filter_func_8   \w, pri=1, sec=0, min=0, suffix=_pri
        filter_func_8   \w, pri=0, sec=1, min=0, suffix=_sec
        filter_func_8   \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm

filter_8 8
filter_8 4