cdef.S (19101B)
1 /* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2019, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "src/arm/asm.S" 29 #include "util.S" 30 #include "cdef_tmpl.S" 31 32 .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret 33 tst w7, #1 // CDEF_HAVE_LEFT 34 b.eq 2f 35 // CDEF_HAVE_LEFT 36 sub \s1, \s1, #2 37 sub \s2, \s2, #2 38 tst w7, #2 // CDEF_HAVE_RIGHT 39 b.eq 1f 40 // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 41 ldr \rn\()0, [\s1] 42 ldr s1, [\s1, #\w] 43 ldr \rn\()2, [\s2] 44 ldr s3, [\s2, #\w] 45 uxtl v0.8h, v0.8b 46 uxtl v1.8h, v1.8b 47 uxtl v2.8h, v2.8b 48 uxtl v3.8h, v3.8b 49 str \rw\()0, [x0] 50 str d1, [x0, #2*\w] 51 add x0, x0, #2*\stride 52 str \rw\()2, [x0] 53 str d3, [x0, #2*\w] 54 .if \ret 55 ret 56 .else 57 add x0, x0, #2*\stride 58 b 3f 59 .endif 60 61 1: 62 // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT 63 ldr \rn\()0, [\s1] 64 ldr h1, [\s1, #\w] 65 ldr \rn\()2, [\s2] 66 ldr h3, [\s2, #\w] 67 uxtl v0.8h, v0.8b 68 uxtl v1.8h, v1.8b 69 uxtl v2.8h, v2.8b 70 uxtl v3.8h, v3.8b 71 str \rw\()0, [x0] 72 str s1, [x0, #2*\w] 73 str s31, [x0, #2*\w+4] 74 add x0, x0, #2*\stride 75 str \rw\()2, [x0] 76 str s3, [x0, #2*\w] 77 str s31, [x0, #2*\w+4] 78 .if \ret 79 ret 80 .else 81 add x0, x0, #2*\stride 82 b 3f 83 .endif 84 85 2: 86 // !CDEF_HAVE_LEFT 87 tst w7, #2 // CDEF_HAVE_RIGHT 88 b.eq 1f 89 // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 90 ldr \rn\()0, [\s1] 91 ldr h1, [\s1, #\w] 92 ldr \rn\()2, [\s2] 93 ldr h3, [\s2, #\w] 94 uxtl v0.8h, v0.8b 95 uxtl v1.8h, v1.8b 96 uxtl v2.8h, v2.8b 97 uxtl v3.8h, v3.8b 98 str s31, [x0] 99 stur \rw\()0, [x0, #4] 100 str s1, [x0, #4+2*\w] 101 add x0, x0, #2*\stride 102 str s31, [x0] 103 stur \rw\()2, [x0, #4] 104 str s3, [x0, #4+2*\w] 105 .if \ret 106 ret 107 .else 108 add x0, x0, #2*\stride 109 b 3f 110 .endif 111 112 1: 113 // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT 114 ldr \rn\()0, [\s1] 115 ldr \rn\()1, [\s2] 116 uxtl v0.8h, v0.8b 117 uxtl v1.8h, v1.8b 118 str s31, [x0] 119 stur \rw\()0, [x0, #4] 120 str s31, [x0, #4+2*\w] 121 add x0, x0, #2*\stride 122 str s31, [x0] 123 stur \rw\()1, [x0, #4] 124 str s31, [x0, #4+2*\w] 125 .if \ret 126 ret 127 .else 128 add x0, x0, #2*\stride 129 .endif 130 3: 131 .endm 132 133 .macro load_n_incr dst, src, incr, w 134 .if \w == 4 135 ld1 {\dst\().s}[0], [\src], \incr 136 .else 137 ld1 {\dst\().8b}, [\src], \incr 138 .endif 139 .endm 140 141 // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, 142 // ptrdiff_t src_stride, const pixel (*left)[2], 143 // const pixel *const top, 144 // const pixel *const bottom, int h, 145 // enum CdefEdgeFlags edges); 146 147 .macro padding_func w, stride, rn, rw 148 function cdef_padding\w\()_8bpc_neon, export=1 149 cmp w7, #0xf // fully edged 150 b.eq cdef_padding\w\()_edged_8bpc_neon 151 movi v30.8h, #0x80, lsl #8 152 mov v31.16b, v30.16b 153 sub x0, x0, #2*(2*\stride+2) 154 tst w7, #4 // CDEF_HAVE_TOP 155 b.ne 1f 156 // !CDEF_HAVE_TOP 157 st1 {v30.8h, v31.8h}, [x0], #32 158 .if \w == 8 159 st1 {v30.8h, v31.8h}, [x0], #32 160 .endif 161 b 3f 162 1: 163 // CDEF_HAVE_TOP 164 add x9, x4, x2 165 pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0 166 167 // Middle section 168 3: 169 tst w7, #1 // CDEF_HAVE_LEFT 170 b.eq 2f 171 // CDEF_HAVE_LEFT 172 tst w7, #2 // CDEF_HAVE_RIGHT 173 b.eq 1f 174 // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 175 0: 176 ld1 {v0.h}[0], [x3], #2 177 ldr h2, [x1, #\w] 178 load_n_incr v1, x1, x2, \w 179 subs w6, w6, #1 180 uxtl v0.8h, v0.8b 181 uxtl v1.8h, v1.8b 182 uxtl v2.8h, v2.8b 183 str s0, [x0] 184 stur \rw\()1, [x0, #4] 185 str s2, [x0, #4+2*\w] 186 add x0, x0, #2*\stride 187 b.gt 0b 188 b 3f 189 1: 190 // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT 191 ld1 {v0.h}[0], [x3], #2 192 load_n_incr v1, x1, x2, \w 193 subs w6, w6, #1 194 uxtl v0.8h, v0.8b 195 uxtl v1.8h, v1.8b 196 str s0, [x0] 197 stur \rw\()1, [x0, #4] 198 str s31, [x0, #4+2*\w] 199 add x0, x0, #2*\stride 200 b.gt 1b 201 b 3f 202 2: 203 tst w7, #2 // CDEF_HAVE_RIGHT 204 b.eq 1f 205 // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 206 0: 207 ldr h1, [x1, #\w] 208 load_n_incr v0, x1, x2, \w 209 subs w6, w6, #1 210 uxtl v0.8h, v0.8b 211 uxtl v1.8h, v1.8b 212 str s31, [x0] 213 stur \rw\()0, [x0, #4] 214 str s1, [x0, #4+2*\w] 215 add x0, x0, #2*\stride 216 b.gt 0b 217 b 3f 218 1: 219 // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT 220 load_n_incr v0, x1, x2, \w 221 subs w6, w6, #1 222 uxtl v0.8h, v0.8b 223 str s31, [x0] 224 stur \rw\()0, [x0, #4] 225 str s31, [x0, #4+2*\w] 226 add x0, x0, #2*\stride 227 b.gt 1b 228 229 3: 230 tst w7, #8 // CDEF_HAVE_BOTTOM 231 b.ne 1f 232 // !CDEF_HAVE_BOTTOM 233 st1 {v30.8h, v31.8h}, [x0], #32 234 .if \w == 8 235 st1 {v30.8h, v31.8h}, [x0], #32 236 .endif 237 ret 238 1: 239 // CDEF_HAVE_BOTTOM 240 add x9, x5, x2 241 pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1 242 endfunc 243 .endm 244 245 padding_func 8, 16, d, q 246 padding_func 4, 8, s, d 247 248 // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, 249 // ptrdiff_t src_stride, const pixel (*left)[2], 250 // const pixel *const top, 251 // const pixel *const bottom, int h, 252 // enum CdefEdgeFlags edges); 253 254 .macro padding_func_edged w, stride, reg 255 function cdef_padding\w\()_edged_8bpc_neon, export=1 256 sub x4, x4, #2 257 sub x5, x5, #2 258 sub x0, x0, #(2*\stride+2) 259 260 .if \w == 4 261 ldr d0, [x4] 262 ldr d1, [x4, x2] 263 st1 {v0.8b, v1.8b}, [x0], #16 264 .else 265 add x9, x4, x2 266 ldr d0, [x4] 267 ldr s1, [x4, #8] 268 ldr d2, [x9] 269 ldr s3, [x9, #8] 270 str d0, [x0] 271 str s1, [x0, #8] 272 str d2, [x0, #\stride] 273 str s3, [x0, #\stride+8] 274 add x0, x0, #2*\stride 275 .endif 276 277 0: 278 ld1 {v0.h}[0], [x3], #2 279 ldr h2, [x1, #\w] 280 load_n_incr v1, x1, x2, \w 281 subs w6, w6, #1 282 str h0, [x0] 283 stur \reg\()1, [x0, #2] 284 str h2, [x0, #2+\w] 285 add x0, x0, #\stride 286 b.gt 0b 287 288 .if \w == 4 289 ldr d0, [x5] 290 ldr d1, [x5, x2] 291 st1 {v0.8b, v1.8b}, [x0], #16 292 .else 293 add x9, x5, x2 294 ldr d0, [x5] 295 ldr s1, [x5, #8] 296 ldr d2, [x9] 297 ldr s3, [x9, #8] 298 str d0, [x0] 299 str s1, [x0, #8] 300 str d2, [x0, #\stride] 301 str s3, [x0, #\stride+8] 302 .endif 303 ret 304 endfunc 305 .endm 306 307 padding_func_edged 8, 16, d 308 padding_func_edged 4, 8, s 309 310 tables 311 312 filter 8, 8 313 filter 4, 8 314 315 find_dir 8 316 317 .macro load_px_8 d1, d2, w 318 .if \w == 8 319 add x6, x2, w9, sxtb // x + off 320 sub x9, x2, w9, sxtb // x - off 321 ld1 {\d1\().d}[0], [x6] // p0 322 add x6, x6, #16 // += stride 323 ld1 {\d2\().d}[0], [x9] // p1 324 add x9, x9, #16 // += stride 325 ld1 {\d1\().d}[1], [x6] // p0 326 ld1 {\d2\().d}[1], [x9] // p0 327 .else 328 add x6, x2, w9, sxtb // x + off 329 sub x9, x2, w9, sxtb // x - off 330 ld1 {\d1\().s}[0], [x6] // p0 331 add x6, x6, #8 // += stride 332 ld1 {\d2\().s}[0], [x9] // p1 333 add x9, x9, #8 // += stride 334 ld1 {\d1\().s}[1], [x6] // p0 335 add x6, x6, #8 // += stride 336 ld1 {\d2\().s}[1], [x9] // p1 337 add x9, x9, #8 // += stride 338 ld1 {\d1\().s}[2], [x6] // p0 339 add x6, x6, #8 // += stride 340 ld1 {\d2\().s}[2], [x9] // p1 341 add x9, x9, #8 // += stride 342 ld1 {\d1\().s}[3], [x6] // p0 343 ld1 {\d2\().s}[3], [x9] // p1 344 .endif 345 .endm 346 .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min 347 .if \min 348 umin v3.16b, v3.16b, \s1\().16b 349 umax v4.16b, v4.16b, \s1\().16b 350 umin v3.16b, v3.16b, \s2\().16b 351 umax v4.16b, v4.16b, \s2\().16b 352 .endif 353 uabd v16.16b, v0.16b, \s1\().16b // abs(diff) 354 uabd v20.16b, v0.16b, \s2\().16b // abs(diff) 355 ushl v17.16b, v16.16b, \shift // abs(diff) >> shift 356 ushl v21.16b, v20.16b, \shift // abs(diff) >> shift 357 uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift)) 358 uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift)) 359 cmhi v18.16b, v0.16b, \s1\().16b // px > p0 360 cmhi v22.16b, v0.16b, \s2\().16b // px > p1 361 umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip) 362 umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip) 363 dup v19.16b, \tap // taps[k] 364 neg v16.16b, v17.16b // -imin() 365 neg v20.16b, v21.16b // -imin() 366 bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() 367 bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() 368 mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() 369 mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() 370 .endm 371 372 // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, 373 // const uint8_t *tmp, int pri_strength, 374 // int sec_strength, int dir, int damping, 375 // int h); 376 .macro filter_func_8 w, pri, sec, min, suffix 377 function cdef_filter\w\suffix\()_edged_8bpc_neon 378 .if \pri 379 movrel x8, pri_taps 380 and w9, w3, #1 381 add x8, x8, w9, uxtw #1 382 .endif 383 movrel x9, directions\w 384 add x5, x9, w5, uxtw #1 385 movi v30.8b, #7 386 dup v28.8b, w6 // damping 387 388 .if \pri 389 dup v25.16b, w3 // threshold 390 .endif 391 .if \sec 392 dup v27.16b, w4 // threshold 393 .endif 394 trn1 v24.8b, v25.8b, v27.8b 395 clz v24.8b, v24.8b // clz(threshold) 396 sub v24.8b, v30.8b, v24.8b // ulog2(threshold) 397 uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold)) 398 neg v24.8b, v24.8b // -shift 399 .if \sec 400 dup v26.16b, v24.b[1] 401 .endif 402 .if \pri 403 dup v24.16b, v24.b[0] 404 .endif 405 406 1: 407 .if \w == 8 408 add x12, x2, #16 409 ld1 {v0.d}[0], [x2] // px 410 ld1 {v0.d}[1], [x12] // px 411 .else 412 add x12, x2, #1*8 413 add x13, x2, #2*8 414 add x14, x2, #3*8 415 ld1 {v0.s}[0], [x2] // px 416 ld1 {v0.s}[1], [x12] // px 417 ld1 {v0.s}[2], [x13] // px 418 ld1 {v0.s}[3], [x14] // px 419 .endif 420 421 // We need 9-bits or two 8-bit accululators to fit the sum. 422 // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228. 423 // Start sum at -1 instead of 0 to help handle rounding later. 424 movi v1.16b, #255 // sum 425 movi v2.16b, #0 // sum 426 .if \min 427 mov v3.16b, v0.16b // min 428 mov v4.16b, v0.16b // max 429 .endif 430 431 // Instead of loading sec_taps 2, 1 from memory, just set it 432 // to 2 initially and decrease for the second round. 433 // This is also used as loop counter. 434 mov w11, #2 // sec_taps[0] 435 436 2: 437 .if \pri 438 ldrb w9, [x5] // off1 439 440 load_px_8 v5, v6, \w 441 .endif 442 443 .if \sec 444 add x5, x5, #4 // +2*2 445 ldrb w9, [x5] // off2 446 load_px_8 v28, v29, \w 447 .endif 448 449 .if \pri 450 ldrb w10, [x8] // *pri_taps 451 452 handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min 453 .endif 454 455 .if \sec 456 add x5, x5, #8 // +2*4 457 ldrb w9, [x5] // off3 458 load_px_8 v5, v6, \w 459 460 handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min 461 462 handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min 463 464 sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; 465 .else 466 add x5, x5, #1 // x5 += 1 467 .endif 468 subs w11, w11, #1 // sec_tap-- (value) 469 .if \pri 470 add x8, x8, #1 // pri_taps++ (pointer) 471 .endif 472 b.ne 2b 473 474 // Perform halving adds since the value won't fit otherwise. 475 // To handle the offset for negative values, use both halving w/ and w/o rounding. 476 srhadd v5.16b, v1.16b, v2.16b // sum >> 1 477 shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 478 cmlt v1.16b, v5.16b, #0 // sum < 0 479 bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 480 481 srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 482 483 usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4 484 .if \min 485 umin v0.16b, v0.16b, v4.16b 486 umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) 487 .endif 488 .if \w == 8 489 st1 {v0.d}[0], [x0], x1 490 add x2, x2, #2*16 // tmp += 2*tmp_stride 491 subs w7, w7, #2 // h -= 2 492 st1 {v0.d}[1], [x0], x1 493 .else 494 st1 {v0.s}[0], [x0], x1 495 add x2, x2, #4*8 // tmp += 4*tmp_stride 496 st1 {v0.s}[1], [x0], x1 497 subs w7, w7, #4 // h -= 4 498 st1 {v0.s}[2], [x0], x1 499 st1 {v0.s}[3], [x0], x1 500 .endif 501 502 // Reset pri_taps and directions back to the original point 503 sub x5, x5, #2 504 .if \pri 505 sub x8, x8, #2 506 .endif 507 508 b.gt 1b 509 ret 510 endfunc 511 .endm 512 513 .macro filter_8 w 514 filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri 515 filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec 516 filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec 517 .endm 518 519 filter_8 8 520 filter_8 4