// cdef_tmpl.S (20656B)
1 /* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2020, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/arm/asm.S"
#include "util.S"

// Emit the CDEF direction offset table for block width \w.
// For each of the 8 directions there are two byte offsets into the tmp
// buffer (one per tap distance), each encoded as dy * \stride + dx.
// The first six entries are repeated at the end so that indexing with
// dir, dir+2 and dir+4 never needs an "& 7" wrap.
.macro dir_table w, stride
const directions\w
        .byte   -1 * \stride + 1, -2 * \stride + 2
        .byte    0 * \stride + 1, -1 * \stride + 2
        .byte    0 * \stride + 1,  0 * \stride + 2
        .byte    0 * \stride + 1,  1 * \stride + 2
        .byte    1 * \stride + 1,  2 * \stride + 2
        .byte    1 * \stride + 0,  2 * \stride + 1
        .byte    1 * \stride + 0,  2 * \stride + 0
        .byte    1 * \stride + 0,  2 * \stride - 1
// Repeated, to avoid & 7
        .byte   -1 * \stride + 1, -2 * \stride + 2
        .byte    0 * \stride + 1, -1 * \stride + 2
        .byte    0 * \stride + 1,  0 * \stride + 2
        .byte    0 * \stride + 1,  1 * \stride + 2
        .byte    1 * \stride + 1,  2 * \stride + 2
        .byte    1 * \stride + 0,  2 * \stride + 1
endconst
.endm

// Emit all constant tables used by the filters below:
// direction tables for the 8-wide and 4-wide cases, plus the two
// primary-tap pairs (selected by pri_strength & 1).
.macro tables
dir_table 8, 16
dir_table 4, 8

const pri_taps
        .byte   4, 2, 3, 3
endconst
.endm

// Load the two tap pixels p0/p1 at +/- the byte offset in w9 around the
// current tmp pointer (x2) into \d1/\d2.
// For \w == 8 a whole 8-element row is loaded per register; for \w == 4
// two 4-element rows are packed into the low/high halves of each register
// (tmp rows are 8 uint16_t apart, hence the #2*8 stride bump).
.macro load_px d1, d2, w
.if \w == 8
        add     x6,  x2, w9, sxtb #1       // x + off
        sub     x9,  x2, w9, sxtb #1       // x - off
        ld1     {\d1\().8h}, [x6]          // p0
        ld1     {\d2\().8h}, [x9]          // p1
.else
        add     x6,  x2, w9, sxtb #1       // x + off
        sub     x9,  x2, w9, sxtb #1       // x - off
        ld1     {\d1\().4h}, [x6]          // p0
        add     x6,  x6, #2*8              // += stride
        ld1     {\d2\().4h}, [x9]          // p1
        add     x9,  x9, #2*8              // += stride
        ld1     {\d1\().d}[1], [x6]        // p0
        ld1     {\d2\().d}[1], [x9]        // p1
.endif
.endm

// Accumulate the contribution of one tap pair (p0 in \s1, p1 in \s2) into
// the running sum in v1:  sum += taps[k] * constrain(p - px, threshold, shift)
// where constrain() clamps the difference to +/- clip and
// clip = imax(0, threshold - (abs(diff) >> shift)) (the uqsub saturates at 0).
// px is expected in v0. When \min is set, v2/v3 additionally track the
// running min/max of all tap pixels for the final output clipping.
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
        umin    v2.8h,  v2.8h,  \s1\().8h
        smax    v3.8h,  v3.8h,  \s1\().8h
        umin    v2.8h,  v2.8h,  \s2\().8h
        smax    v3.8h,  v3.8h,  \s2\().8h
.endif
        uabd    v16.8h, v0.8h,  \s1\().8h  // abs(diff)
        uabd    v20.8h, v0.8h,  \s2\().8h  // abs(diff)
        ushl    v17.8h, v16.8h, \shift     // abs(diff) >> shift
        ushl    v21.8h, v20.8h, \shift     // abs(diff) >> shift
        uqsub   v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
        uqsub   v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
        sub     v18.8h, \s1\().8h, v0.8h   // diff = p0 - px
        sub     v22.8h, \s2\().8h, v0.8h   // diff = p1 - px
        neg     v16.8h, v17.8h             // -clip
        neg     v20.8h, v21.8h             // -clip
        smin    v18.8h, v18.8h, v17.8h     // imin(diff, clip)
        smin    v22.8h, v22.8h, v21.8h     // imin(diff, clip)
        dup     v19.8h, \tap               // taps[k]
        smax    v18.8h, v18.8h, v16.8h     // constrain() = imax(imin(diff, clip), -clip)
        smax    v22.8h, v22.8h, v20.8h     // constrain() = imax(imin(diff, clip), -clip)
        mla     v1.8h,  v18.8h, v19.8h     // sum += taps[k] * constrain()
        mla     v1.8h,  v22.8h, v19.8h     // sum += taps[k] * constrain()
.endm

// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                   const uint16_t *tmp, int pri_strength,
//                                   int sec_strength, int dir, int damping,
//                                   int h, size_t edges);
//
// Generates one specialized filter variant per (width, bpc, taps) combination:
//   \pri / \sec select whether the primary/secondary taps are applied,
//   \min enables min/max tracking + final clipping (only needed when both
//   tap sets are active), and \suffix names the variant.
// Register roles on entry (per the C signature above):
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h,
//   [sp] = edges, and for 16 bpc [sp, #8] = bitdepth_max.
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
        // With all edges present (edges == 0xf) a faster path that works
        // directly on 8-bit pixels is used instead.
        ldr     w8,  [sp]                  // edges
        cmp     w8,  #0xf
        b.eq    cdef_filter\w\suffix\()_edged_8bpc_neon
.endif
.if \pri
.if \bpc == 16
        ldr     w9,  [sp, #8]              // bitdepth_max
        clz     w9,  w9
        sub     w9,  w9,  #24              // -bitdepth_min_8
        neg     w9,  w9                    // bitdepth_min_8
.endif
        movrel  x8,  pri_taps
.if \bpc == 16
        lsr     w9,  w3,  w9               // pri_strength >> bitdepth_min_8
        and     w9,  w9,  #1               // (pri_strength >> bitdepth_min_8) & 1
.else
        and     w9,  w3,  #1
.endif
        // Select one of the two 2-byte tap pairs in pri_taps.
        add     x8,  x8,  w9, uxtw #1
.endif
        movrel  x9,  directions\w
        add     x5,  x9,  w5, uxtw #1      // x5 = &directions[dir]
        movi    v30.4h,   #15
        dup     v28.4h,   w6               // damping

.if \pri
        dup     v25.8h,   w3               // threshold (pri)
.endif
.if \sec
        dup     v27.8h,   w4               // threshold (sec)
.endif
        // Compute both shift amounts at once: lane 0 = pri, lane 1 = sec.
        trn1    v24.4h,   v25.4h, v27.4h
        clz     v24.4h,   v24.4h           // clz(threshold)
        sub     v24.4h,   v30.4h, v24.4h   // ulog2(threshold)
        uqsub   v24.4h,   v28.4h, v24.4h   // shift = imax(0, damping - ulog2(threshold))
        neg     v24.4h,   v24.4h           // -shift (ushl with negative = right shift)
.if \sec
        dup     v26.8h,   v24.h[1]
.endif
.if \pri
        dup     v24.8h,   v24.h[0]
.endif

1:      // Per-row (w == 8) / per-row-pair (w == 4) loop.
.if \w == 8
        ld1     {v0.8h},  [x2]             // px
.else
        add     x12, x2,  #2*8
        ld1     {v0.4h},  [x2]             // px
        ld1     {v0.d}[1], [x12]           // px
.endif

        movi    v1.8h,    #0               // sum
.if \min
        mov     v2.16b,   v0.16b           // min
        mov     v3.16b,   v0.16b           // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov     w11, #2                    // sec_taps[0]

2:      // Two iterations: tap distance k = 0, then k = 1.
.if \pri
        ldrb    w9,  [x5]                  // off1

        load_px v4,  v5,  \w
.endif

.if \sec
        add     x5,  x5,  #4               // +2*2
        ldrb    w9,  [x5]                  // off2
        load_px v6,  v7,  \w
.endif

.if \pri
        ldrb    w10, [x8]                  // *pri_taps

        handle_pixel v4, v5, v25.8h, v24.8h, w10, \min
.endif

.if \sec
        add     x5,  x5,  #8               // +2*4
        ldrb    w9,  [x5]                  // off3
        load_px v4,  v5,  \w

        handle_pixel v6, v7, v27.8h, v26.8h, w11, \min

        handle_pixel v4, v5, v27.8h, v26.8h, w11, \min

        sub     x5,  x5,  #11              // x5 -= 2*(2+4); x5 += 1;
.else
        add     x5,  x5,  #1               // x5 += 1
.endif
        subs    w11, w11, #1               // sec_tap-- (value)
.if \pri
        add     x8,  x8,  #1               // pri_taps++ (pointer)
.endif
        b.ne    2b

        // Round the sum towards zero: (8 + sum - (sum < 0)) >> 4,
        // with the +8 rounding provided by srshr.
        cmlt    v4.8h,  v1.8h, #0          // -(sum < 0)
        add     v1.8h,  v1.8h, v4.8h       // sum - (sum < 0)
        srshr   v1.8h,  v1.8h, #4          // (8 + sum - (sum < 0)) >> 4
        add     v0.8h,  v0.8h, v1.8h       // px + (8 + sum ...) >> 4
.if \min
        smin    v0.8h,  v0.8h, v3.8h
        smax    v0.8h,  v0.8h, v2.8h       // iclip(px + .., min, max)
.endif
.if \bpc == 8
        xtn     v0.8b,  v0.8h              // narrow back to 8-bit pixels
.endif
.if \w == 8
        add     x2,  x2,  #2*16            // tmp += tmp_stride
        subs    w7,  w7,  #1               // h--
.if \bpc == 8
        st1     {v0.8b},  [x0], x1
.else
        st1     {v0.8h},  [x0], x1
.endif
.else
.if \bpc == 8
        st1     {v0.s}[0], [x0], x1
.else
        st1     {v0.d}[0], [x0], x1
.endif
        add     x2,  x2,  #2*16            // tmp += 2*tmp_stride
        subs    w7,  w7,  #2               // h -= 2
.if \bpc == 8
        st1     {v0.s}[1], [x0], x1
.else
        st1     {v0.d}[1], [x0], x1
.endif
.endif

        // Reset pri_taps and directions back to the original point
        sub     x5,  x5,  #2
.if \pri
        sub     x8,  x8,  #2
.endif

        b.gt    1b
        ret
endfunc
.endm

// Instantiate the three tap-combination variants for a given width/bpc and
// emit the exported entry point that dispatches on which strengths are
// nonzero (w3 = pri_strength, w4 = sec_strength).
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec

function cdef_filter\w\()_\bpc\()bpc_neon, export=1
        cbnz    w3,  1f                    // pri_strength
        b       cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
1:
        cbnz    w4,  1f                    // sec_strength
        b       cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
1:
        b       cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm

// Per-direction cost scaling factors used by cdef_find_dir.
const div_table
        .short  840, 420, 280, 210, 168, 140, 120, 105
endconst

// div_table[2*m+1] + 105, padded to three 4-element vectors (last lane 0).
const alt_fact
        .short  420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst

// Compute two "alt" direction costs at once:
//   \d1 = sum over n of sum_alt[n]^2 * fact[n] for the rows in \s1/\s2,
//   \d2 = the same for \s3/\s4.
// The per-lane factors are expected pre-widened in v29/v30/v31.
.macro cost_alt d1, d2, s1, s2, s3, s4
        smull   v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
        smull2  v23.4s, \s1\().8h, \s1\().8h
        smull   v24.4s, \s2\().4h, \s2\().4h
        smull   v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
        smull2  v26.4s, \s3\().8h, \s3\().8h
        smull   v27.4s, \s4\().4h, \s4\().4h
        mul     v22.4s, v22.4s, v29.4s       // sum_alt[n]^2*fact
        mla     v22.4s, v23.4s, v30.4s
        mla     v22.4s, v24.4s, v31.4s
        mul     v25.4s, v25.4s, v29.4s       // sum_alt[n]^2*fact
        mla     v25.4s, v26.4s, v30.4s
        mla     v25.4s, v27.4s, v31.4s
        addv    \d1,    v22.4s               // *cost_ptr
        addv    \d2,    v25.4s               // *cost_ptr
.endm

// Fold one or three cost values into the running best:
//   w0 = best_dir, w1 = best_cost, w3 = current n, w4 = cost[n].
// \s1's cost is assumed to already be in w4; when \s2/\s3 are given,
// their lane-0 costs are loaded ahead of the compares to hide latency.
.macro find_best s1, s2, s3
.ifnb \s2
        mov     w5,  \s2\().s[0]
.endif
        cmp     w4,  w1                    // cost[n] > best_cost
        csel    w0,  w3,  w0,  gt          // best_dir = n
        csel    w1,  w4,  w1,  gt          // best_cost = cost[n]
.ifnb \s2
        add     w3,  w3,  #1               // n++
        cmp     w5,  w1                    // cost[n] > best_cost
        mov     w4,  \s3\().s[0]
        csel    w0,  w3,  w0,  gt          // best_dir = n
        csel    w1,  w5,  w1,  gt          // best_cost = cost[n]
        add     w3,  w3,  #1               // n++
.endif
.endm

// Steps for loading and preparing each row
// Split into three steps so loads of the next row can be interleaved
// with the arithmetic on the current one.
.macro dir_load_step1 s1, bpc
.if \bpc == 8
        ld1     {\s1\().8b}, [x0], x1
.else
        ld1     {\s1\().8h}, [x0], x1
.endif
.endm

// 8 bpc: widen and re-center around 0 (subtract 128).
// 16 bpc: shift down to an 8-bit range (v8 holds -bitdepth_min_8).
.macro dir_load_step2 s1, bpc
.if \bpc == 8
        usubl   \s1\().8h, \s1\().8b, v31.8b
.else
        ushl    \s1\().8h, \s1\().8h, v8.8h
.endif
.endm

// 16 bpc only: re-center around 0 (subtract 128, held in v31).
.macro dir_load_step3 s1, bpc
// Nothing for \bpc == 8
.if \bpc != 8
        sub     \s1\().8h, \s1\().8h, v31.8h
.endif
.endm

// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
//                                   unsigned *const var)
//
// Accumulates directional sums over an 8x8 block, turns them into the
// 8 direction costs, picks the direction with the highest cost (returned
// in w0) and writes the cost contrast to *var.
// For 16 bpc, w3 additionally carries bitdepth_max on entry.
// Uses 32 bytes of stack for the cost[8] array; 16 bpc also spills d8
// since the low half of v8 is callee-saved under AAPCS64.
.macro find_dir bpc
function cdef_find_dir_\bpc\()bpc_neon, export=1
.if \bpc == 16
        str     d8,  [sp, #-0x10]!         // v8 (low 64 bits) is callee-saved
        clz     w3,  w3                    // clz(bitdepth_max)
        sub     w3,  w3,  #24              // -bitdepth_min_8
        dup     v8.8h,    w3
.endif
        sub     sp,  sp,  #32              // cost
        mov     w3,  #8
.if \bpc == 8
        movi    v31.16b,  #128
.else
        movi    v31.8h,   #128
.endif
        movi    v30.16b,  #0
        movi    v1.8h,    #0               // v0-v1   sum_diag[0]
        movi    v3.8h,    #0               // v2-v3   sum_diag[1]
        movi    v5.8h,    #0               // v4-v5   sum_hv[0-1]
        movi    v7.8h,    #0               // v6-v7   sum_alt[0]
        dir_load_step1 v26, \bpc           // Setup first row early
        movi    v17.8h,   #0               // v16-v17 sum_alt[1]
        movi    v18.8h,   #0               // v18-v19 sum_alt[2]
        dir_load_step2 v26, \bpc
        movi    v19.8h,   #0
        dir_load_step3 v26, \bpc
        movi    v21.8h,   #0               // v20-v21 sum_alt[3]

        // Fully unrolled over the 8 rows; \i is the row index y.
.irpc i, 01234567
        addv    h25, v26.8h                // [y]
        rev64   v27.8h, v26.8h
        addp    v28.8h, v26.8h, v30.8h     // [(x >> 1)]
        add     v5.8h,  v5.8h,  v26.8h     // sum_hv[1]
        ext     v27.16b, v27.16b, v27.16b, #8 // [-x]
        rev64   v29.4h, v28.4h             // [-(x >> 1)]
        ins     v4.h[\i], v25.h[0]         // sum_hv[0]
.if \i < 6
        ext     v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
        ext     v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
        add     v18.8h,  v18.8h,  v22.8h   // sum_alt[2]
        add     v19.4h,  v19.4h,  v23.4h   // sum_alt[2]
.else
        add     v18.8h,  v18.8h,  v26.8h   // sum_alt[2]
.endif
.if \i == 0
        mov     v20.16b, v26.16b           // sum_alt[3]
.elseif \i == 1
        add     v20.8h,  v20.8h,  v26.8h   // sum_alt[3]
.else
        ext     v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
        ext     v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
        add     v20.8h,  v20.8h,  v24.8h   // sum_alt[3]
        add     v21.4h,  v21.4h,  v25.4h   // sum_alt[3]
.endif
.if \i == 0
        // First row just initializes the accumulators (no shift needed).
        mov     v0.16b,  v26.16b           // sum_diag[0]
        dir_load_step1 v26, \bpc
        mov     v2.16b,  v27.16b           // sum_diag[1]
        dir_load_step2 v26, \bpc
        mov     v6.16b,  v28.16b           // sum_alt[0]
        dir_load_step3 v26, \bpc
        mov     v16.16b, v29.16b           // sum_alt[1]
.else
        ext     v22.16b, v30.16b, v26.16b, #(16-2*\i)
        ext     v23.16b, v26.16b, v30.16b, #(16-2*\i)
        ext     v24.16b, v30.16b, v27.16b, #(16-2*\i)
        ext     v25.16b, v27.16b, v30.16b, #(16-2*\i)
.if \i != 7 // Nothing to load for the final row
        dir_load_step1 v26, \bpc           // Start setting up the next row early.
.endif
        add     v0.8h,  v0.8h,  v22.8h     // sum_diag[0]
        add     v1.8h,  v1.8h,  v23.8h     // sum_diag[0]
        add     v2.8h,  v2.8h,  v24.8h     // sum_diag[1]
        add     v3.8h,  v3.8h,  v25.8h     // sum_diag[1]
.if \i != 7
        dir_load_step2 v26, \bpc
.endif
        ext     v22.16b, v30.16b, v28.16b, #(16-2*\i)
        ext     v23.16b, v28.16b, v30.16b, #(16-2*\i)
        ext     v24.16b, v30.16b, v29.16b, #(16-2*\i)
        ext     v25.16b, v29.16b, v30.16b, #(16-2*\i)
.if \i != 7
        dir_load_step3 v26, \bpc
.endif
        add     v6.8h,  v6.8h,  v22.8h     // sum_alt[0]
        add     v7.4h,  v7.4h,  v23.4h     // sum_alt[0]
        add     v16.8h, v16.8h, v24.8h     // sum_alt[1]
        add     v17.4h, v17.4h, v25.4h     // sum_alt[1]
.endif
.endr

        movi    v31.4s,  #105

        // Horizontal/vertical costs.
        smull   v26.4s,  v4.4h,  v4.4h     // sum_hv[0]*sum_hv[0]
        smlal2  v26.4s,  v4.8h,  v4.8h
        smull   v27.4s,  v5.4h,  v5.4h     // sum_hv[1]*sum_hv[1]
        smlal2  v27.4s,  v5.8h,  v5.8h
        mul     v26.4s,  v26.4s, v31.4s    // cost[2] *= 105
        mul     v27.4s,  v27.4s, v31.4s    // cost[6] *= 105
        addv    s4,  v26.4s                // cost[2]
        addv    s5,  v27.4s                // cost[6]

        // Mirror the upper diagonal sums so both halves share one div_table.
        rev64   v1.8h, v1.8h
        rev64   v3.8h, v3.8h
        ext     v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
        ext     v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]

        str     s4,  [sp, #2*4]            // cost[2]
        str     s5,  [sp, #6*4]            // cost[6]

        movrel  x4,  div_table
        ld1     {v31.8h}, [x4]

        // Diagonal costs.
        smull   v22.4s,  v0.4h,  v0.4h     // sum_diag[0]*sum_diag[0]
        smull2  v23.4s,  v0.8h,  v0.8h
        smlal   v22.4s,  v1.4h,  v1.4h
        smlal2  v23.4s,  v1.8h,  v1.8h
        smull   v24.4s,  v2.4h,  v2.4h     // sum_diag[1]*sum_diag[1]
        smull2  v25.4s,  v2.8h,  v2.8h
        smlal   v24.4s,  v3.4h,  v3.4h
        smlal2  v25.4s,  v3.8h,  v3.8h
        uxtl    v30.4s,  v31.4h            // div_table
        uxtl2   v31.4s,  v31.8h
        mul     v22.4s,  v22.4s, v30.4s    // cost[0]
        mla     v22.4s,  v23.4s, v31.4s    // cost[0]
        mul     v24.4s,  v24.4s, v30.4s    // cost[4]
        mla     v24.4s,  v25.4s, v31.4s    // cost[4]
        addv    s0,  v22.4s                // cost[0]
        addv    s2,  v24.4s                // cost[4]

        movrel  x5,  alt_fact
        ld1     {v29.4h, v30.4h, v31.4h}, [x5] // div_table[2*m+1] + 105

        str     s0,  [sp, #0*4]            // cost[0]
        str     s2,  [sp, #4*4]            // cost[4]

        uxtl    v29.4s,  v29.4h            // div_table[2*m+1] + 105
        uxtl    v30.4s,  v30.4h
        uxtl    v31.4s,  v31.4h

        // Off-axis ("alt") costs.
        cost_alt s6,  s16, v6,  v7,  v16, v17 // cost[1], cost[3]
        cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
        str     s6,  [sp, #1*4]            // cost[1]
        str     s16, [sp, #3*4]            // cost[3]

        // Scan the 8 costs for the maximum; start from n = 0 (cost[0]).
        mov     w0,  #0                    // best_dir
        mov     w1,  v0.s[0]               // best_cost
        mov     w3,  #1                    // n

        str     s18, [sp, #5*4]            // cost[5]
        str     s20, [sp, #7*4]            // cost[7]

        mov     w4,  v6.s[0]

        find_best v6,  v4, v16
        find_best v16, v2, v18
        find_best v18, v5, v20
        find_best v20

        // Variance estimate: contrast against the orthogonal direction.
        eor     w3,  w0,  #4               // best_dir ^4
        ldr     w4,  [sp, w3, uxtw #2]
        sub     w1,  w1,  w4               // best_cost - cost[best_dir ^ 4]
        lsr     w1,  w1,  #10
        str     w1,  [x2]                  // *var

        add     sp,  sp,  #32
.if \bpc == 16
        ldr     d8,  [sp], 0x10            // restore callee-saved v8
.endif
        ret
endfunc
.endm