filmgrain16.S (74703B)
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// AArch64 NEON implementation of film grain generation, 16 bpc
// (generate_grain_{y,uv_444}_16bpc_neon, generate_grain_uv_{420,422}_16bpc_neon,
// plus the start of fgy_32x32_16bpc_neon).

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

// Full-size (luma / 444) grain LUT dimensions, in entries.
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

// Subsampled (420/422 chroma) grain LUT dimensions, in entries.
#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

// Advance the pseudo-random state kept in w2 by \steps bits:
// combine taps (r >> 0), (r >> 1), (r >> 3) and (r >> 12), mask \steps new
// bits and fold them into the top (bit 16 and up) of the state.
// With shift=0 the state is not shifted down; the caller compensates later.
// Clobbers w11-w13.
.macro increment_seed steps, shift=1
        lsr             w11, w2,  #3
        lsr             w12, w2,  #12
        lsr             w13, w2,  #1
        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             w2,  w2,  #\steps
.endif
        and             w11, w11, #((1 << \steps) - 1)    // bit
.if \shift
        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
.else
        orr             w2,  w2,  w11, lsl #16            // *state
.endif
.endm

// \dest = \bits random bits from the state in x2, \age increments old
// (i.e. taken \age bit positions below the freshest bits).
.macro read_rand dest, bits, age
        ubfx            \dest, x2, #16 - \bits - \age, #\bits
.endm

// \dest = \bits random bits from the state in x2, then shift the state
// down by one bit.
.macro read_shift_rand dest, bits
        ubfx            \dest, x2, #17 - \bits, #\bits
        lsr             w2,  w2,  #1
.endm

// special calling convention:
// w2 holds seed
// x3 holds dav1d_gaussian_sequence
// clobbers x11-x15
// returns in v0.8h
function get_gaussian_neon
        increment_seed  4
        read_rand       x14, 11, 3
        read_rand       x15, 11, 2
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11, 1
        ld1             {v0.h}[1], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 0
        increment_seed  4
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[2], [x14]
        read_rand       x14, 11, 3
        ld1             {v0.h}[3], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 2
        ld1             {v0.h}[4], [x14]
        add             x15, x3, x15, lsl #1
        read_rand       x14, 11, 1
        ld1             {v0.h}[5], [x15]
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[6], [x14]
        ld1             {v0.h}[7], [x15]
        ret
endfunc

// Store 82 bytes (5x16 + 2) from \r0-\r5 to [x0], post-incrementing x0.
.macro store_grain_row r0, r1, r2, r3, r4, r5
        st1             {\r0\().16b,\r1\().16b}, [x0], #32
        st1             {\r2\().16b,\r3\().16b}, [x0], #32
        st1             {\r4\().16b},  [x0], #16
        st1             {\r5\().h}[0], [x0], #2
.endm

// Produce 2 grain values (gaussian_sequence lookups, scaled by the shift
// in v31) in v0.h[0..1]. Same register convention as get_gaussian_neon.
function get_grain_2_neon
        increment_seed  2
        read_rand       x14, 11, 1
        read_rand       x15, 11, 0
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        ld1             {v0.h}[1], [x15]
        srshl           v0.4h, v0.4h, v31.4h
        ret
endfunc

.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// Produce 4 grain values (gaussian_sequence lookups, scaled by the shift
// in v31) in v0.4h. Same register convention as get_gaussian_neon.
function get_grain_4_neon
        increment_seed  4
        read_rand       x14, 11, 3
        read_rand       x15, 11, 2
        add             x14, x3, x14, lsl #1
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11, 1
        ld1             {v0.h}[1], [x15]
        add             x14, x3, x14, lsl #1
        read_rand       x15, 11, 0
        add             x15, x3, x15, lsl #1
        ld1             {v0.h}[2], [x14]
        ld1             {v0.h}[3], [x15]
        srshl           v0.4h, v0.4h, v31.4h
        ret
endfunc

.macro get_grain_4 dst
        bl              get_grain_4_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// w15 holds the number of entries to produce
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
.macro output_lag n
function output_lag\n\()_neon
1:
        read_shift_rand x13, 11
        mov             w11, v1.s[0]
        ldrsh           w12, [x3, x13, lsl #1]
        ext             v0.16b, v0.16b, v0.16b, #2
.if \n == 1
        madd            w11, w14, w4,  w11 // sum (above) + *coeff * prev output
.elseif \n == 2
        madd            w11, w16, w4,  w11 // sum (above) + *coeff * prev output 1
        madd            w11, w14, w17, w11 // += *coeff * prev output 2
        mov             w16, w14
.else
        madd            w11, w17, w4,  w11 // sum (above) + *coeff * prev output 1
        madd            w11, w16, w20, w11 // sum (above) + *coeff * prev output 2
        madd            w11, w14, w21, w11 // += *coeff * prev output 3
        mov             w17, w16
        mov             w16, w14
.endif
        add             w14, w11, w8  // 1 << (ar_coeff_shift - 1)
        add             w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             w14, w14, w7  // >> ar_coeff_shift
        asr             w12, w12, w9  // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             w14, w14, w12
        // Clamp the new entry to [w6, w5] (grain min/max).
        cmp             w14, w5
        csel            w14, w14, w5, le
        cmp             w14, w6
        csel            w14, w14, w6, ge
        subs            w15, w15, #1
        ext             v1.16b, v1.16b, v1.16b, #4
        ins             v0.h[7], w14
        b.gt            1b
        ret
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3


// Accumulate the lag1 contributions of the row above (window v16/v17, with
// the next 8 entries loaded from [x0 - GRAIN_WIDTH*2 + 16]) into v4/v5
// (.4s sums), using the coefficients in v27-v29; then slide the window.
function sum_lag1_above_neon
        sub             x12, x0, #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right

        ext             v0.16b, v16.16b, v17.16b, #14 // top left, top mid
        ext             v1.16b, v17.16b, v18.16b, #2  // top mid, top right

        smull           v4.4s, v17.4h, v28.4h
        smlal           v4.4s, v0.4h,  v27.4h
        smlal           v4.4s, v1.4h,  v29.4h
        smull2          v5.4s, v17.8h, v28.8h
        smlal2          v5.4s, v0.8h,  v27.8h
        smlal2          v5.4s, v1.8h,  v29.8h

        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        ret
endfunc

// Shared body of the sum_*_lag{1,2,3}_{left,mid,right}_neon functions:
// sum up the above-row contributions, for chroma optionally fold in the
// (possibly subsampled) luma row at x19 scaled by \uv_coeff (or v30),
// then run the recursive filter via output_\lag\()_neon for \elems
// entries and store the finished 8 entries at [x0].
// Identical tails are shared by branching into the equivalent variant.
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
        bl              sum_\lag\()_above_neon
.ifc \type, uv_420
        // Average 2x2 luma pixels per chroma position.
        add             x12, x19, #GRAIN_WIDTH*2
        ld1             {v22.8h, v23.8h}, [x19], #32
        ld1             {v24.8h, v25.8h}, [x12]
        addp            v22.8h, v22.8h, v23.8h
        addp            v23.8h, v24.8h, v25.8h
        add             v22.8h, v22.8h, v23.8h
        srshr           v0.8h,  v22.8h, #2
.endif
.ifc \type, uv_422
        // Average 2x1 luma pixels per chroma position.
        ld1             {v22.8h, v23.8h}, [x19], #32
        addp            v22.8h, v22.8h, v23.8h
        srshr           v0.8h,  v22.8h, #1
.endif
.ifc \type, uv_444
        ld1             {v0.8h}, [x19], #16
.endif
.if \uv_layout
.ifnb \uv_coeff
        dup             v1.8b,  \uv_coeff
        sxtl            v1.8h,  v1.8b
        smlal           v4.4s,  v0.4h, v1.4h
        smlal2          v5.4s,  v0.8h, v1.8h
.else
        smlal           v4.4s,  v0.4h, v30.4h
        smlal2          v5.4s,  v0.8h, v30.8h
.endif
.endif
.if \uv_layout && \elems == 8
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 7
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 1
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
.if \elems > 4
.ifc \edge, left
        // Left edge: produce 3 fresh random entries for lanes 5-7, then
        // only 1 filtered output.
        increment_seed  4
        read_rand       x12, 11, 3
        read_rand       x13, 11, 2
        read_rand       x14, 11, 1
        add             x12, x3,  x12, lsl #1
        add             x13, x3,  x13, lsl #1
        add             x14, x3,  x14, lsl #1
        ld1             {v0.h}[5], [x12]
        ld1             {v0.h}[6], [x13]
        ld1             {v0.h}[7], [x14]
        lsl             x2,  x2,  #1 // shift back the state as if we'd done increment_seed with shift=0
        srshl           v0.8h, v0.8h, v31.8h
        ext             v4.16b, v4.16b, v4.16b, #12
.ifc \lag, lag3
        smov            w17, v0.h[5]
.endif
.ifnc \lag, lag1
        smov            w16, v0.h[6]
.endif
        smov            w14, v0.h[7]

        mov             v1.16b, v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        mov             v1.16b, v4.16b
        mov             w15, #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        mov             v1.16b, v5.16b
.ifc \edge, right
        // Right edge: 3 filtered outputs plus 1 fresh random entry.
        mov             w15, #3
        bl              output_\lag\()_neon
        read_shift_rand x15, 11
        add             x15, x3, x15, lsl #1
        ld1             {v1.h}[0], [x15]
        srshl           v1.4h, v1.4h, v31.4h
        ext             v0.16b, v0.16b, v1.16b, #2
.else
        mov             w15, #4
        bl              output_\lag\()_neon
.endif
.else
        // elems == 1
        increment_seed  4, shift=0
        mov             v1.16b, v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
        lsr             w2,  w2,  #3

        read_rand       x12, 11, 2
        read_rand       x13, 11, 1
        read_rand       x14, 11, 0
        add             x12, x3,  x12, lsl #1
        add             x13, x3,  x13, lsl #1
        add             x14, x3,  x14, lsl #1
        ld1             {v1.h}[0], [x12]
        ld1             {v1.h}[1], [x13]
        ld1             {v1.h}[2], [x14]
        srshl           v1.4h, v1.4h, v31.4h
        ext             v0.16b, v0.16b, v1.16b, #14
.endif
        st1             {v0.8h}, [x0], #16
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endif
.endm

// Instantiate sum_\type\()_lag1_\edge\()_neon; on the left edge, (re)load
// the above-row window from [x0 - GRAIN_WIDTH*2] first.
.macro sum_lag1_func type, uv_layout, edge, elems=8
function sum_\type\()_lag1_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0,  #1*GRAIN_WIDTH*2
        ld1             {v17.8h}, [x12] // load the previous block right above
.endif
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
endfunc
.endm

sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 7
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 7
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 1
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 1


// Accumulate the lag2 contributions of the two rows above (windows
// v16/v17 and v19/v20) into v4/v5, using coefficients from v30 bytes
// 0-9; then slide both windows.
function sum_lag2_above_neon
        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right
        ld1             {v21.8h}, [x13]

        dup             v26.8b,  v30.b[0]
        ext             v22.16b, v16.16b, v17.16b, #12 // top left, top mid
        dup             v27.8b,  v30.b[1]
        ext             v23.16b, v16.16b, v17.16b, #14
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v30.b[3]
        ext             v0.16b,  v17.16b, v18.16b, #2  // top mid, top right
        sxtl            v27.8h,  v27.8b
        dup             v29.8b,  v30.b[4]
        ext             v1.16b,  v17.16b, v18.16b, #4
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b

        smull           v4.4s,   v22.4h, v26.4h
        smlal           v4.4s,   v23.4h, v27.4h
        smlal           v4.4s,   v0.4h,  v28.4h
        smlal           v4.4s,   v1.4h,  v29.4h
        smull2          v5.4s,   v22.8h, v26.8h
        smlal2          v5.4s,   v23.8h, v27.8h
        smlal2          v5.4s,   v0.8h,  v28.8h
        smlal2          v5.4s,   v1.8h,  v29.8h

        dup             v26.16b, v30.b[5]
        ext             v22.16b, v19.16b, v20.16b, #12 // top left, top mid
        dup             v27.16b, v30.b[6]
        ext             v23.16b, v19.16b, v20.16b, #14
        sxtl            v26.8h,  v26.8b
        dup             v28.16b, v30.b[8]
        ext             v0.16b,  v20.16b, v21.16b, #2  // top mid, top right
        sxtl            v27.8h,  v27.8b
        dup             v29.16b, v30.b[9]
        ext             v1.16b,  v20.16b, v21.16b, #4
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b

        smlal           v4.4s,   v22.4h, v26.4h
        smlal           v4.4s,   v23.4h, v27.4h
        smlal           v4.4s,   v0.4h,  v28.4h
        smlal           v4.4s,   v1.4h,  v29.4h
        smlal2          v5.4s,   v22.8h, v26.8h
        smlal2          v5.4s,   v23.8h, v27.8h
        smlal2          v5.4s,   v0.8h,  v28.8h
        smlal2          v5.4s,   v1.8h,  v29.8h

        dup             v26.16b, v30.b[2]
        dup             v27.16b, v30.b[7]
        sxtl            v26.8h,  v26.8b
        sxtl            v27.8h,  v27.8b

        smlal           v4.4s,   v17.4h, v26.4h
        smlal           v4.4s,   v20.4h, v27.4h
        smlal2          v5.4s,   v17.8h, v26.8h
        smlal2          v5.4s,   v20.8h, v27.8h
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        ret
endfunc

// Instantiate sum_\type\()_lag2_\edge\()_neon; on the left edge, (re)load
// the two above-row windows first.
.macro sum_lag2_func type, uv_layout, edge, elems=8
function sum_\type\()_lag2_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0,  #2*GRAIN_WIDTH*2
        sub             x13, x0,  #1*GRAIN_WIDTH*2
        ld1             {v17.8h}, [x12] // load the previous block right above
        ld1             {v20.8h}, [x13]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 7
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 7
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 1
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 1


// Accumulate the lag3 contributions of the three rows above (windows
// v13/v14, v16/v17 and v19/v20) into v4/v5, using coefficients from
// v29 and v30 bytes 0-4; then slide all three windows.
// Uses v8-v15 as scratch (caller saves them).
function sum_lag3_above_neon
        sub             x11, x0,  #3*GRAIN_WIDTH*2 - 16
        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v15.8h}, [x11] // load top right
        ld1             {v18.8h}, [x12]
        ld1             {v21.8h}, [x13]

        dup             v22.8b,  v29.b[0]
        ext             v8.16b,  v13.16b, v14.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[1]
        ext             v9.16b,  v13.16b, v14.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v29.b[2]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v29.b[3]
        ext             v10.16b, v13.16b, v14.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v29.b[4]
        ext             v11.16b, v14.16b, v15.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v29.b[5]
        ext             v12.16b, v14.16b, v15.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v29.b[6]
        ext             v13.16b, v14.16b, v15.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smull           v4.4s,   v8.4h,  v22.4h
        smlal           v4.4s,   v9.4h,  v23.4h
        smlal           v4.4s,   v10.4h, v24.4h
        smlal           v4.4s,   v11.4h, v26.4h
        smlal           v4.4s,   v12.4h, v27.4h
        smlal           v4.4s,   v13.4h, v28.4h
        smlal           v4.4s,   v14.4h, v25.4h
        smull2          v5.4s,   v8.8h,  v22.8h
        smlal2          v5.4s,   v9.8h,  v23.8h
        smlal2          v5.4s,   v10.8h, v24.8h
        smlal2          v5.4s,   v11.8h, v26.8h
        smlal2          v5.4s,   v12.8h, v27.8h
        smlal2          v5.4s,   v13.8h, v28.8h
        smlal2          v5.4s,   v14.8h, v25.8h

        dup             v22.8b,  v29.b[7]
        ext             v8.16b,  v16.16b, v17.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[8]
        ext             v9.16b,  v16.16b, v17.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v29.b[9]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v29.b[10]
        ext             v10.16b, v16.16b, v17.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v29.b[11]
        ext             v11.16b, v17.16b, v18.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v29.b[12]
        ext             v12.16b, v17.16b, v18.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v29.b[13]
        ext             v13.16b, v17.16b, v18.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smlal           v4.4s,   v8.4h,  v22.4h
        smlal           v4.4s,   v9.4h,  v23.4h
        smlal           v4.4s,   v10.4h, v24.4h
        smlal           v4.4s,   v11.4h, v26.4h
        smlal           v4.4s,   v12.4h, v27.4h
        smlal           v4.4s,   v13.4h, v28.4h
        smlal           v4.4s,   v17.4h, v25.4h
        smlal2          v5.4s,   v8.8h,  v22.8h
        smlal2          v5.4s,   v9.8h,  v23.8h
        smlal2          v5.4s,   v10.8h, v24.8h
        smlal2          v5.4s,   v11.8h, v26.8h
        smlal2          v5.4s,   v12.8h, v27.8h
        smlal2          v5.4s,   v13.8h, v28.8h
        smlal2          v5.4s,   v17.8h, v25.8h

        dup             v22.8b,  v29.b[14]
        ext             v8.16b,  v19.16b, v20.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[15]
        ext             v9.16b,  v19.16b, v20.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v30.b[0]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v30.b[1]
        ext             v10.16b, v19.16b, v20.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v30.b[2]
        ext             v11.16b, v20.16b, v21.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v30.b[3]
        ext             v12.16b, v20.16b, v21.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v30.b[4]
        ext             v13.16b, v20.16b, v21.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smlal           v4.4s,   v8.4h,  v22.4h
        smlal           v4.4s,   v9.4h,  v23.4h
        smlal           v4.4s,   v10.4h, v24.4h
        smlal           v4.4s,   v11.4h, v26.4h
        smlal           v4.4s,   v12.4h, v27.4h
        smlal           v4.4s,   v13.4h, v28.4h
        smlal           v4.4s,   v20.4h, v25.4h
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        smlal2          v5.4s,   v8.8h,  v22.8h
        smlal2          v5.4s,   v9.8h,  v23.8h
        smlal2          v5.4s,   v10.8h, v24.8h
        smlal2          v5.4s,   v11.8h, v26.8h
        smlal2          v5.4s,   v12.8h, v27.8h
        smlal2          v5.4s,   v13.8h, v28.8h
        smlal2          v5.4s,   v20.8h, v25.8h

        mov             v13.16b, v14.16b
        mov             v14.16b, v15.16b

        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        ret
endfunc

// Instantiate sum_\type\()_lag3_\edge\()_neon; on the left edge, (re)load
// the three above-row windows first.
.macro sum_lag3_func type, uv_layout, edge, elems=8
function sum_\type\()_lag3_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x11, x0,  #3*GRAIN_WIDTH*2
        sub             x12, x0,  #2*GRAIN_WIDTH*2
        sub             x13, x0,  #1*GRAIN_WIDTH*2
        ld1             {v14.8h}, [x11] // load the previous block right above
        ld1             {v17.8h}, [x12]
        ld1             {v20.8h}, [x13]
.endif
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
endfunc
.endm

sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 7
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 7
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 1
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 1

// Generate w1 rows of 82 grain entries at [x0]: 80 via get_gaussian_neon
// plus 2 trailing entries via get_grain_2.
function generate_grain_rows_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        mov             w16, #80
2:
        bl              get_gaussian_neon
        srshl           v0.8h, v0.8h, v31.8h
        subs            w16, w16, #8
        st1             {v0.8h}, [x0], #16
        b.gt            2b
        get_grain_2     v0
        subs            w1,  w1,  #1
        st1             {v0.s}[0], [x0], #4
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Generate w1 rows of 44 grain entries at [x0] (subsampled chroma layout,
// GRAIN_WIDTH*2-byte row stride): 40 via get_gaussian_neon plus 4 via
// get_grain_4.
function generate_grain_rows_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        mov             w16, #40
2:
        bl              get_gaussian_neon
        srshl           v0.8h, v0.8h, v31.8h
        subs            w16, w16, #8
        st1             {v0.8h}, [x0], #16
        b.gt            2b
        get_grain_4     v0
        subs            w1,  w1,  #1
        st1             {v0.4h}, [x0]
        add             x0,  x0,  #GRAIN_WIDTH*2-80
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Produce 8 lag0 chroma entries: luma (v4, masked by v1) scaled by the
// uv coefficient v27 and shift v28, plus fresh gaussian noise, clamped to
// [v26, v25], stored at [x0].
function gen_grain_uv_444_lag0_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v4.8h}, [x19], #16
gen_grain_uv_lag0_8_start:
        bl              get_gaussian_neon
        srshl           v0.8h, v0.8h, v31.8h
gen_grain_uv_lag0_8_add:
        and             v4.16b, v4.16b, v1.16b
        smull           v2.4s,  v4.4h,  v27.4h
        smull2          v3.4s,  v4.8h,  v27.8h
        srshl           v2.4s,  v2.4s,  v28.4s
        srshl           v3.4s,  v3.4s,  v28.4s
        sqxtn           v2.4h,  v2.4s
        sqxtn2          v2.8h,  v3.4s
        sqadd           v2.8h,  v2.8h,  v0.8h
        smin            v2.8h,  v2.8h,  v25.8h
        smax            v2.8h,  v2.8h,  v26.8h
        st1             {v2.8h}, [x0], #16
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Same as above, but first 2x2-average the 420 luma rows at x19 into v4.
function gen_grain_uv_420_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        add             x12, x19, #GRAIN_WIDTH*2
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        ld1             {v18.8h, v19.8h}, [x12]
        addp            v16.8h, v16.8h, v17.8h
        addp            v17.8h, v18.8h, v19.8h
        add             v16.8h, v16.8h, v17.8h
        srshr           v4.8h,  v16.8h, #2
        b               gen_grain_uv_lag0_8_start
endfunc

// Same as above, but 2x1-average the 422 luma row at x19 into v4.
function gen_grain_uv_422_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        addp            v16.8h, v16.8h, v17.8h
        srshr           v4.8h,  v16.8h, #1
        b               gen_grain_uv_lag0_8_start
endfunc

// 4-entry tail variants of the lag0 chroma helpers above.
function gen_grain_uv_420_lag0_4_neon
        add             x12, x19, #GRAIN_WIDTH*2
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        ld1             {v18.4h, v19.4h}, [x12]
        add             x19, x19, #32
        addp            v16.4h, v16.4h, v17.4h
        addp            v17.4h, v18.4h, v19.4h
        add             v16.4h, v16.4h, v17.4h
        srshr           v4.4h,  v16.4h, #2
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

function gen_grain_uv_422_lag0_4_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        add             x19, x19, #32
        addp            v16.4h, v16.4h, v17.4h
        srshr           v4.4h,  v16.4h, #1
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

// Full-width (82-entry row) grain generation entry points:
// generate_grain_y_16bpc_neon and generate_grain_uv_444_16bpc_neon.
// Dispatches on data->ar_coeff_lag through gen_grain_\type\()_tbl.
.macro gen_grain_82 type
function generate_grain_\type\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

.ifc \type, uv_444
        mov             w13, w3
        mov             w14, #28
        add             x19, x1,  #3*GRAIN_WIDTH*2
        mov             x1,  x2
        mul             w13, w13, w14
        clz             w15, w4
.else
        clz             w15, w2
.endif
        movrel          x3,  X(gaussian_sequence)
        sub             w15, w15, #24 // -bitdepth_min_8
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             x4,  x1,  #FGD_AR_COEFFS_Y
.else
        add             x4,  x1,  #FGD_AR_COEFFS_UV
.endif
        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
        movrel          x16, gen_grain_\type\()_tbl
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrsw           x17, [x16, w17, uxtw #2]
        dup             v31.8h,   w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             x16, x16, x17
        neg             v31.8h,   v31.8h

.ifc \type, uv_444
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne
.endif

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        neg             w15, w15      // bitdepth_min_8
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7  // 1 << ar_coeff_shift
        lsl             w10, w10, w9  // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1  // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #128
        lsl             w5,  w5,  w15 // 128 << bitdepth_min_8
        neg             w6,  w5       // -(128 << bitdepth_min_8)
        sub             w5,  w5,  #1  // (128 << bitdepth_min_8) - 1

.ifc \type, uv_444
        eor             w2,  w2,  w11
.endif

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
.ifc \type, y
        mov             w1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        dup             v28.4s,  w7
        ld1r            {v27.8b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        dup             v25.8h,  w5
        dup             v26.8h,  w6
        ext             v29.16b, v0.16b, v1.16b, #10
        ext             v30.16b, v1.16b, v0.16b, #2
        neg             v28.4s,  v28.4s
        sxtl            v27.8h,  v27.8b

        mov             w1,  #3
        bl              generate_grain_rows_neon
        mov             w1,  #GRAIN_HEIGHT-3
1:
        // v1 masks out luma entries beyond the row edges.
        mov             v1.16b,  v29.16b
        bl              gen_grain_uv_444_lag0_neon // 8
        movi            v1.16b,  #255
        bl              gen_grain_uv_444_lag0_neon // 16
        bl              gen_grain_uv_444_lag0_neon // 24
        bl              gen_grain_uv_444_lag0_neon // 32
        bl              gen_grain_uv_444_lag0_neon // 40
        bl              gen_grain_uv_444_lag0_neon // 48
        bl              gen_grain_uv_444_lag0_neon // 56
        bl              gen_grain_uv_444_lag0_neon // 64
        bl              gen_grain_uv_444_lag0_neon // 72
        mov             v1.16b,  v30.16b
        bl              gen_grain_uv_444_lag0_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
        add             x19, x19, #4
        st1             {v16.s}[0], [x0], #4
        b.gt            1b
.endif
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.8b}, [x4], #1 // ar_coeffs_y[0]
        ld1r            {v28.8b}, [x4], #1 // ar_coeffs_y[1]
        ld1r            {v29.8b}, [x4]     // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           w4,  [x4, #1]  // ar_coeffs_y[3]
.else
        add             x4,  x4,  #2
.endif

        mov             w1,  #3
.ifc \type, uv_444
        ld1r            {v30.8b}, [x4] // ar_coeffs_uv[4]
        ldursb          w4,  [x4, #-1] // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b
.ifc \type, uv_444
        sxtl            v30.8h,  v30.8b
.endif

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_mid_neon   // 48
        bl              sum_\type\()_lag1_mid_neon   // 56
        bl              sum_\type\()_lag1_mid_neon   // 64
        bl              sum_\type\()_lag1_mid_neon   // 72
        bl              sum_\type\()_lag1_right_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #4
.endif
        st1             {v16.s}[0], [x0], #4
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_mid_neon   // 48
        bl              sum_\type\()_lag2_mid_neon   // 56
        bl              sum_\type\()_lag2_mid_neon   // 64
        bl              sum_\type\()_lag2_mid_neon   // 72
        bl              sum_\type\()_lag2_right_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #4
.endif
        st1             {v16.s}[0], [x0], #4
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
        // lag3 needs v8-v15 and x20-x21; save the callee-saved ones.
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_mid_neon   // 48
        bl              sum_\type\()_lag3_mid_neon   // 56
        bl              sum_\type\()_lag3_mid_neon   // 64
        bl              sum_\type\()_lag3_mid_neon   // 72
        bl              sum_\type\()_lag3_right_neon // 80
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #4
.endif
        st1             {v16.s}[0], [x0], #4
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm

gen_grain_82 y
gen_grain_82 uv_444

// \dst = number of remaining rows for \type (subsampled height for 420).
.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst, #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst, #GRAIN_HEIGHT-3
.endif
.endm

// Advance the luma pointer by one chroma row: two luma rows for 420,
// one for 422 (both minus the 6*32 bytes consumed while filtering).
.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
.else
        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
.endif
.endm

// Subsampled (44-entry row) grain generation entry points:
// generate_grain_uv_420_16bpc_neon and generate_grain_uv_422_16bpc_neon.
.macro gen_grain_44 type
function generate_grain_\type\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!

        mov             w13, w3
        mov             w14, #28
        add             x19, x1,  #(3*GRAIN_WIDTH-3)*2
        mov             x1,  x2
        mul             w13, w13, w14
        clz             w15, w4

        movrel          x3,  X(gaussian_sequence)
        sub             w15, w15, #24 // -bitdepth_min_8
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
        add             x4,  x1,  #FGD_AR_COEFFS_UV
        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
        movrel          x16, gen_grain_\type\()_tbl
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrsw           x17, [x16, w17, uxtw #2]
        dup             v31.8h,   w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             x16, x16, x17
        neg             v31.8h,   v31.8h

        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        neg             w15, w15      // bitdepth_min_8
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7  // 1 << ar_coeff_shift
        lsl             w10, w10, w9  // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1  // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1  // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #128
        lsl             w5,  w5,  w15 // 128 << bitdepth_min_8
        neg             w6,  w5       // -(128 << bitdepth_min_8)
        sub             w5,  w5,  #1  // (128 << bitdepth_min_8) - 1

        eor             w2,  w2,  w11

        br              x16

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
        dup             v28.4s,  w7
        ld1r            {v27.8b}, [x4] // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        dup             v25.8h,  w5
        dup             v26.8h,  w6
        ext             v29.16b, v0.16b, v1.16b, #10
        ext             v30.16b, v1.16b, v0.16b, #14
        neg             v28.4s,  v28.4s
        sxtl            v27.8h,  v27.8b

        mov             w1,  #3
        bl              generate_grain_rows_44_neon
        set_height      w1,  \type
1:
        mov             v1.16b,  v29.16b
        bl              gen_grain_\type\()_lag0_8_neon // 8
        movi            v1.16b,  #255
        bl              gen_grain_\type\()_lag0_8_neon // 16
        bl              gen_grain_\type\()_lag0_8_neon // 24
        bl              gen_grain_\type\()_lag0_8_neon // 32
        bl              gen_grain_\type\()_lag0_8_neon // 40
        mov             v1.16b,  v30.16b
        bl              gen_grain_\type\()_lag0_4_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.8b}, [x4], #1 // ar_coeffs_uv[0]
        ld1r            {v28.8b}, [x4], #1 // ar_coeffs_uv[1]
        ld1r            {v29.8b}, [x4]     // ar_coeffs_uv[2]
        add             x4,  x4,  #2

        mov             w1,  #3
        ld1r            {v30.8b}, [x4] // ar_coeffs_uv[4]
        ldursb          w4,  [x4, #-1] // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b
        sxtl            v30.8h,  v30.8b
        set_height      w1,  \type
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_right_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4] // ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_right_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, [x4]      // ar_coeffs_uv[0-15]
        ldr             q30, [x4, #16] // ar_coeffs_uv[16-24]
        // lag3 needs v8-v15 and x20-x21; save the callee-saved ones.
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_right_neon // 44
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

// Gather 8 bytes each into \dst1/\dst2 (interleaved lanes, starting at
// lane \off) from the table at x3, indexed by the .h lanes of
// \src1/\src2. Clobbers x14-x17.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0]
        umov            w15, \src2[1]
        umov            w16, \src1[2]
        add             x14, x14, x3
        umov            w17, \src2[3]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4]
        add             x16, x16, x3
        ld1             {\dst2}[1+\off], [x15]
        umov            w15, \src2[5]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6]
        add             x14, x14, x3
        ld1             {\dst2}[3+\off], [x17]
        umov            w17, \src2[7]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[5+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[7+\off], [x17]
.endm

// Gather 32 table bytes into \dst1/\dst2 indexed by \src1-\src4.
.macro gather dst1, dst2, src1, src2, src3, src4
        gather_interleaved \dst1, \dst2, \src1, \src3, 0
        gather_interleaved \dst2, \dst1, \src3, \src1, 0
        gather_interleaved \dst1, \dst2, \src2, \src4, 8
        gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm

function gather32_neon
        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
        ret
endfunc

function gather16_neon
        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
        ins             v6.d[1], v7.d[0]
        ret
endfunc

const overlap_coeffs_0, align=4
        .short          27, 17, 0,  0
        .short          17, 27, 32, 32
endconst

const overlap_coeffs_1, align=4
        .short          23, 0,  0,  0
        .short          22, 32, 32, 32
endconst

// Split a random value into x/y grain offsets; \sx/\sy are the horizontal
// and vertical subsampling flags (0 doubles the offset).
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF   // randval & 0xF
        lsr             \offx, \src,  #4     // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy  // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx  // 2 * (randval >> 4)
.endif
.endm

// \dst = \src + \stride * \offy + 2 * \offx (entries are 2 bytes).
.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
.endm

// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-80]!
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        str             d14, [sp, #64]
        eor             w4,  w4,  #15     // 15 - scaling_shift
        ldr             w11, [x6, #8]     // offsets[1][0]
        ldr             w13, [x6, #4]     // offsets[0][1]
        ldr             w15, [x6, #12]    // offsets[1][1]
        ldr             w10, [sp, #96]    // bitdepth_max
        ldr             w6,  [x6]         // offsets[0][0]
        dup             v26.8h,   w10     // bitdepth_max
        clz             w10, w10
        ldr             w8,  [sp, #80]    // clip
        sub             w10, w10, #24     // -bitdepth_min_8
        mov             x9,  #GRAIN_WIDTH*2 // grain_lut stride
        neg             w10, w10          // bitdepth_min_8

        dup             v29.8h,   w4      // 15 - scaling_shift
        dup             v27.8h,   w10     // bitdepth_min_8

        movrel          x16, overlap_coeffs_0

        cbz             w8,  1f
        // clip
        movi            v30.8h,   #16
        movi            v31.8h,   #235
        sshl            v30.8h,   v30.8h, v27.8h
        sshl            v31.8h,   v31.8h, v27.8h
        b               2f
1:
        // no clip
        movi            v30.8h,   #0
        mov             v31.16b,  v26.16b // bitdepth_max
2:

        ushr            v26.8h,   v26.8h, #1 // grain_max
        not             v25.16b,  v26.16b    // grain_min

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs

        add             x5,  x5,  #18        // grain_lut += 9
        add             x5,  x5,  x9, lsl #3 // grain_lut += 8 * grain_stride
        add             x5,  x5,  x9         // grain_lut += grain_stride
calc_offset w11, w12, w11, 0, 0 1301 calc_offset w13, w14, w13, 0, 0 1302 calc_offset w15, w16, w15, 0, 0 1303 calc_offset w6, w10, w6, 0, 0 1304 1305 add_offset x12, w11, x12, x5, x9 1306 add_offset x14, w13, x14, x5, x9 1307 add_offset x16, w15, x16, x5, x9 1308 add_offset x5, w6, x10, x5, x9 1309 1310 ldr w11, [sp, #88] // type 1311 movrel x13, fgy_loop_tbl 1312 1313 add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx 1314 add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1315 1316 tst w11, #1 1317 ldrsw x11, [x13, w11, uxtw #2] 1318 1319 add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by 1320 add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx 1321 1322 add x11, x13, x11 1323 1324 b.eq 1f 1325 // y overlap 1326 dup v8.8h, v27.h[0] 1327 dup v9.8h, v27.h[1] 1328 mov w10, w7 // backup actual h 1329 mov w7, #2 1330 1: 1331 br x11 1332 endfunc 1333 1334 function fgy_loop_neon 1335 .macro fgy ox, oy 1336 L(loop_\ox\oy): 1337 AARCH64_VALID_JUMP_TARGET 1338 1: 1339 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src 1340 .if \ox 1341 ld1 {v20.4h}, [x4], x9 // grain_lut old 1342 .endif 1343 .if \oy 1344 ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top 1345 .endif 1346 .if \ox && \oy 1347 ld1 {v14.4h}, [x8], x9 // grain_lut top old 1348 .endif 1349 mvni v4.8h, #0xf0, lsl #8 // 0x0fff 1350 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut 1351 1352 // Make sure that uninitialized pixels out of range past the right 1353 // edge are in range; their actual values shouldn't matter. 
1354 and v0.16b, v0.16b, v4.16b 1355 and v1.16b, v1.16b, v4.16b 1356 and v2.16b, v2.16b, v4.16b 1357 and v3.16b, v3.16b, v4.16b 1358 bl gather32_neon 1359 1360 .if \ox 1361 smull v20.4s, v20.4h, v27.4h 1362 smlal v20.4s, v16.4h, v28.4h 1363 .endif 1364 1365 .if \oy 1366 .if \ox 1367 smull v14.4s, v14.4h, v27.4h 1368 smlal v14.4s, v21.4h, v28.4h 1369 sqrshrn v20.4h, v20.4s, #5 1370 sqrshrn v14.4h, v14.4s, #5 1371 smin v20.4h, v20.4h, v26.4h 1372 smin v14.4h, v14.4h, v26.4h 1373 smax v20.4h, v20.4h, v25.4h 1374 smax v14.4h, v14.4h, v25.4h 1375 .endif 1376 1377 .if \ox 1378 smull v10.4s, v20.4h, v9.4h 1379 .else 1380 smull v10.4s, v16.4h, v9.4h 1381 .endif 1382 smull2 v11.4s, v16.8h, v9.8h 1383 smull v12.4s, v17.4h, v9.4h 1384 smull2 v13.4s, v17.8h, v9.8h 1385 smull v16.4s, v18.4h, v9.4h 1386 smull2 v17.4s, v18.8h, v9.8h 1387 smull v18.4s, v19.4h, v9.4h 1388 smull2 v19.4s, v19.8h, v9.8h 1389 .if \ox 1390 smlal v10.4s, v14.4h, v8.4h 1391 .else 1392 smlal v10.4s, v21.4h, v8.4h 1393 .endif 1394 smlal2 v11.4s, v21.8h, v8.8h 1395 smlal v12.4s, v22.4h, v8.4h 1396 smlal2 v13.4s, v22.8h, v8.8h 1397 smlal v16.4s, v23.4h, v8.4h 1398 smlal2 v17.4s, v23.8h, v8.8h 1399 smlal v18.4s, v24.4h, v8.4h 1400 smlal2 v19.4s, v24.8h, v8.8h 1401 sqrshrn v10.4h, v10.4s, #5 1402 sqrshrn2 v10.8h, v11.4s, #5 1403 sqrshrn v11.4h, v12.4s, #5 1404 sqrshrn2 v11.8h, v13.4s, #5 1405 sqrshrn v12.4h, v16.4s, #5 1406 sqrshrn2 v12.8h, v17.4s, #5 1407 sqrshrn v13.4h, v18.4s, #5 1408 sqrshrn2 v13.8h, v19.4s, #5 1409 smin v16.8h, v10.8h, v26.8h 1410 smin v17.8h, v11.8h, v26.8h 1411 smin v18.8h, v12.8h, v26.8h 1412 smin v19.8h, v13.8h, v26.8h 1413 smax v16.8h, v16.8h, v25.8h 1414 smax v17.8h, v17.8h, v25.8h 1415 smax v18.8h, v18.8h, v25.8h 1416 smax v19.8h, v19.8h, v25.8h 1417 .endif 1418 1419 uxtl v4.8h, v6.8b // scaling 1420 .if \ox && !\oy 1421 sqrshrn v20.4h, v20.4s, #5 1422 .endif 1423 uxtl2 v5.8h, v6.16b 1424 .if \ox && !\oy 1425 smin v20.4h, v20.4h, v26.4h 1426 .endif 1427 uxtl v6.8h, v7.8b 1428 .if 
\ox && !\oy 1429 smax v20.4h, v20.4h, v25.4h 1430 .endif 1431 uxtl2 v7.8h, v7.16b 1432 .if \ox && !\oy 1433 ins v16.d[0], v20.d[0] 1434 .endif 1435 ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) 1436 ushl v5.8h, v5.8h, v29.8h 1437 ushl v6.8h, v6.8h, v29.8h 1438 ushl v7.8h, v7.8h, v29.8h 1439 1440 sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) 1441 sqrdmulh v21.8h, v17.8h, v5.8h 1442 sqrdmulh v22.8h, v18.8h, v6.8h 1443 sqrdmulh v23.8h, v19.8h, v7.8h 1444 1445 usqadd v0.8h, v20.8h // *src + noise 1446 usqadd v1.8h, v21.8h 1447 usqadd v2.8h, v22.8h 1448 usqadd v3.8h, v23.8h 1449 1450 umax v0.8h, v0.8h, v30.8h 1451 umax v1.8h, v1.8h, v30.8h 1452 umax v2.8h, v2.8h, v30.8h 1453 umax v3.8h, v3.8h, v30.8h 1454 umin v0.8h, v0.8h, v31.8h 1455 umin v1.8h, v1.8h, v31.8h 1456 umin v2.8h, v2.8h, v31.8h 1457 umin v3.8h, v3.8h, v31.8h 1458 1459 subs w7, w7, #1 1460 .if \oy 1461 dup v8.8h, v28.h[0] 1462 dup v9.8h, v28.h[1] 1463 .endif 1464 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst 1465 b.gt 1b 1466 1467 .if \oy 1468 cmp w10, #2 1469 sub w7, w10, #2 // restore actual remaining h 1470 b.gt L(loop_\ox\()0) 1471 .endif 1472 ldr d14, [sp, #64] 1473 ldp d12, d13, [sp, #48] 1474 ldp d10, d11, [sp, #32] 1475 ldp d8, d9, [sp, #16] 1476 ldr x30, [sp], #80 1477 AARCH64_VALIDATE_LINK_REGISTER 1478 ret 1479 .endm 1480 1481 fgy 0, 0 1482 fgy 0, 1 1483 fgy 1, 0 1484 fgy 1, 1 1485 endfunc 1486 1487 jumptable fgy_loop_tbl 1488 .word L(loop_00) - fgy_loop_tbl 1489 .word L(loop_01) - fgy_loop_tbl 1490 .word L(loop_10) - fgy_loop_tbl 1491 .word L(loop_11) - fgy_loop_tbl 1492 endjumptable 1493 1494 // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, 1495 // const pixel *const src, 1496 // const ptrdiff_t stride, 1497 // const uint8_t scaling[SCALING_SIZE], 1498 // const Dav1dFilmGrainData *const data, 1499 // const entry grain_lut[][GRAIN_WIDTH], 1500 // const pixel *const luma_row, 1501 // const ptrdiff_t luma_stride, 1502 // const 
//                                      int offsets[][2],
//                                      const ptrdiff_t h, const ptrdiff_t uv,
//                                      const ptrdiff_t is_id,
//                                      const ptrdiff_t type,
//                                      const int bitdepth_max);
// Chroma entry point, parameterized on the layout's horizontal/vertical
// subsampling (\sx/\sy); dispatches into fguv_loop_sx\sx\()_tbl below.
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-80]!
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]

        ldp             x8,  x9,  [sp, #80]     // offsets, h
        ldp             x10, x11, [sp, #96]     // uv, is_id
        ldr             w16, [sp, #120]         // bitdepth_max

        ldr             w13, [x4, #FGD_SCALING_SHIFT]
        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        dup             v23.8h,  w16            // bitdepth_max
        clz             w16, w16
        eor             w13, w13, #15           // 15 - scaling_shift
        sub             w16, w16, #24           // -bitdepth_min_8

        // !csfl: load the chroma-from-luma blend parameters for plane uv
        add             x10, x4,  x10, lsl #2   // + 4*uv
        add             x14, x10, #FGD_UV_LUMA_MULT
        add             x15, x10, #FGD_UV_MULT
        add             x10, x10, #FGD_UV_OFFSET
        neg             w16, w16                // bitdepth_min_8
        ld1r            {v8.8h},  [x14]         // uv_luma_mult
        ld1r            {v24.8h}, [x10]         // uv_offset
        ld1r            {v9.8h},  [x15]         // uv_mult

        dup             v29.8h,  w13            // 15 - scaling_shift
        dup             v27.8h,  w16            // bitdepth_min_8

        cbz             w12, 1f
        // clip: chroma studio range, further narrowed when is_id
        movi            v30.8h,  #16
        movi            v31.8h,  #240
        sshl            v30.8h,  v30.8h,  v27.8h
        sshl            v31.8h,  v31.8h,  v27.8h
        cbz             w11, 2f
        // is_id
        movi            v31.8h,  #235
        sshl            v31.8h,  v31.8h,  v27.8h
        b               2f
1:
        // no clip
        movi            v30.8h,  #0
        mov             v31.16b, v23.16b        // bitdepth_max
2:

        ushr            v15.8h,  v23.8h,  #1    // grain_max
        sshl            v24.8h,  v24.8h,  v27.8h // uv_offset << bitdepth_min_8
        not             v14.16b, v15.16b        // grain_min

        ldr             w12, [x8, #8]           // offsets[1][0]
        ldr             w14, [x8, #4]           // offsets[0][1]
        ldr             w16, [x8, #12]          // offsets[1][1]
        ldr             w8,  [x8]               // offsets[0][0]

        mov             x10, #GRAIN_WIDTH*2     // grain_lut stride

        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             x5,  x5,  x10, lsl #2   // grain_lut += 4 * grain_stride
        add             x5,  x5,  x10, lsl #1   // grain_lut += 2 * grain_stride
.else
        add             x5,  x5,  x10, lsl #3   // grain_lut += 8 * grain_stride
        add             x5,  x5,  x10           // grain_lut += grain_stride
.endif

        calc_offset     w12, w13, w12, \sx, \sy
        calc_offset     w14, w15, w14, \sx, \sy
        calc_offset     w16, w17, w16, \sx, \sy
        calc_offset     w8,  w11, w8,  \sx, \sy

        add_offset      x13, w12, x13, x5,  x10
        add_offset      x15, w14, x15, x5,  x10
        add_offset      x17, w16, x17, x5,  x10
        add_offset      x5,  w8,  x11, x5,  x10

        add             x4,  x13, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x11, x11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx

        ldr             w13, [sp, #112]         // type

        movrel          x16, overlap_coeffs_\sx
        movrel          x14, fguv_loop_sx\sx\()_tbl

        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
        tst             w13, #1
        ldrsw           x13, [x14, w13, uxtw #2]

        b.eq            1f
        // y overlap: process 2 (or 1 when vertically subsampled) rows first
        sub             w12, w9,  #(2 >> \sy)   // backup remaining h
        mov             w9,  #(2 >> \sy)

1:
        add             x13, x14, x13

.if \sy
        movi            v25.8h,  #23
        movi            v26.8h,  #22
.else
        movi            v25.8h,  #27
        movi            v26.8h,  #17
.endif

.if \sy
        add             x7,  x7,  x7            // luma_stride *= 2
.endif

        br              x13
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0

// 32-wide chroma grain loop without horizontal subsampling (444-style
// layouts). \csfl = chroma scaling from luma; \ox/\oy = x/y overlap.
// x0 = dst, x1 = src, x2 = stride, x5 = grain_lut, x6 = luma row,
// x7 = luma stride, x8 = grain top, x4/x11 = old / top-old grain
// columns, x10 = grain stride, w9 = rows left (w12 = backed-up h when
// \oy); v8/v9 = uv_luma_mult/uv_mult, v14/v15 = grain min/max,
// v23 = bitdepth_max, v24 = uv_offset, v25/v26 = overlap coeffs,
// v29 = 15 - scaling_shift, v30/v31 = output min/max.
function fguv_loop_sx0_neon
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
.if \ox
        ld1             {v4.4h},  [x4],  x10    // grain_lut old
.endif
.if \oy
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v5.4h},  [x11], x10    // grain_lut top old
.endif
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x10 // grain_lut

.if \ox
        // Blend old/new grain across the vertical block edge
        smull           v4.4s,   v4.4h,   v27.4h
        smlal           v4.4s,   v16.4h,  v28.4h
.endif

.if \oy
.if \ox
        smull           v5.4s,   v5.4h,   v27.4h
        smlal           v5.4s,   v0.4h,   v28.4h
        sqrshrn         v4.4h,   v4.4s,   #5
        sqrshrn         v5.4h,   v5.4s,   #5
        smin            v4.4h,   v4.4h,   v15.4h
        smin            v5.4h,   v5.4h,   v15.4h
        smax            v4.4h,   v4.4h,   v14.4h
        smax            v5.4h,   v5.4h,   v14.4h
        ins             v16.d[0], v4.d[0]
        ins             v0.d[0],  v5.d[0]
.endif

        // Blend current/top grain rows across the horizontal block edge
        smull           v6.4s,   v16.4h,  v26.4h
        smull2          v7.4s,   v16.8h,  v26.8h
        smull           v10.4s,  v17.4h,  v26.4h
        smull2          v11.4s,  v17.8h,  v26.8h
        smull           v16.4s,  v18.4h,  v26.4h
        smull2          v17.4s,  v18.8h,  v26.8h
        smull           v18.4s,  v19.4h,  v26.4h
        smull2          v19.4s,  v19.8h,  v26.8h
        smlal           v6.4s,   v0.4h,   v25.4h
        smlal2          v7.4s,   v0.8h,   v25.8h
        smlal           v10.4s,  v1.4h,   v25.4h
        smlal2          v11.4s,  v1.8h,   v25.8h
        smlal           v16.4s,  v2.4h,   v25.4h
        smlal2          v17.4s,  v2.8h,   v25.8h
        smlal           v18.4s,  v3.4h,   v25.4h
        smlal2          v19.4s,  v3.8h,   v25.8h
        sqrshrn         v6.4h,   v6.4s,   #5
        sqrshrn2        v6.8h,   v7.4s,   #5
        sqrshrn         v7.4h,   v10.4s,  #5
        sqrshrn2        v7.8h,   v11.4s,  #5
        sqrshrn         v10.4h,  v16.4s,  #5
        sqrshrn2        v10.8h,  v17.4s,  #5
        sqrshrn         v11.4h,  v18.4s,  #5
        sqrshrn2        v11.8h,  v19.4s,  #5
.endif

.if \ox && !\oy
        sqrshrn         v4.4h,   v4.4s,   #5
        smin            v4.4h,   v4.4h,   v15.4h
.endif
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6],  x7 // luma
.if \oy
        smin            v16.8h,  v6.8h,   v15.8h
        smin            v17.8h,  v7.8h,   v15.8h
        smin            v18.8h,  v10.8h,  v15.8h
        smin            v19.8h,  v11.8h,  v15.8h
        smax            v16.8h,  v16.8h,  v14.8h
        smax            v17.8h,  v17.8h,  v14.8h
        smax            v18.8h,  v18.8h,  v14.8h
        smax            v19.8h,  v19.8h,  v14.8h
.endif

.if \ox && !\oy
        smax            v4.4h,   v4.4h,   v14.4h
.endif
        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1],  x2 // src
.if \ox && !\oy
        ins             v16.d[0], v4.d[0]
.endif

.if !\csfl
        // Scaling index = clip((luma*uv_luma_mult + src*uv_mult >> 6)
        //                      + uv_offset, 0, bitdepth_max)
        smull           v4.4s,   v0.4h,   v8.4h
        smull2          v5.4s,   v0.8h,   v8.8h
        smull           v6.4s,   v1.4h,   v8.4h
        smull2          v7.4s,   v1.8h,   v8.8h
        smull           v0.4s,   v2.4h,   v8.4h
        smull2          v1.4s,   v2.8h,   v8.8h
        smull           v2.4s,   v3.4h,   v8.4h
        smull2          v3.4s,   v3.8h,   v8.8h
        smlal           v4.4s,   v10.4h,  v9.4h
        smlal2          v5.4s,   v10.8h,  v9.8h
        smlal           v6.4s,   v11.4h,  v9.4h
        smlal2          v7.4s,   v11.8h,  v9.8h
        smlal           v0.4s,   v12.4h,  v9.4h
        smlal2          v1.4s,   v12.8h,  v9.8h
        smlal           v2.4s,   v13.4h,  v9.4h
        smlal2          v3.4s,   v13.8h,  v9.8h
        shrn            v4.4h,   v4.4s,   #6
        shrn2           v4.8h,   v5.4s,   #6
        shrn            v5.4h,   v6.4s,   #6
        shrn2           v5.8h,   v7.4s,   #6
        shrn            v6.4h,   v0.4s,   #6
        shrn2           v6.8h,   v1.4s,   #6
        shrn            v7.4h,   v2.4s,   #6
        shrn2           v7.8h,   v3.4s,   #6
        add             v0.8h,   v4.8h,   v24.8h
        add             v1.8h,   v5.8h,   v24.8h
        add             v2.8h,   v6.8h,   v24.8h
        add             v3.8h,   v7.8h,   v24.8h
        movi            v20.8h,  #0
        smin            v0.8h,   v0.8h,   v23.8h
        smin            v1.8h,   v1.8h,   v23.8h
        smin            v2.8h,   v2.8h,   v23.8h
        smin            v3.8h,   v3.8h,   v23.8h
        smax            v0.8h,   v0.8h,   v20.8h
        smax            v1.8h,   v1.8h,   v20.8h
        smax            v2.8h,   v2.8h,   v20.8h
        smax            v3.8h,   v3.8h,   v20.8h
.else
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b,  v0.16b,  v23.16b
        and             v1.16b,  v1.16b,  v23.16b
        and             v2.16b,  v2.16b,  v23.16b
        and             v3.16b,  v3.16b,  v23.16b
.endif

        bl              gather32_neon

        uxtl            v4.8h,   v6.8b          // scaling
        uxtl2           v5.8h,   v6.16b
        uxtl            v6.8h,   v7.8b
        uxtl2           v7.8h,   v7.16b

        ushl            v4.8h,   v4.8h,   v29.8h // scaling << (15 - scaling_shift)
        ushl            v5.8h,   v5.8h,   v29.8h
        ushl            v6.8h,   v6.8h,   v29.8h
        ushl            v7.8h,   v7.8h,   v29.8h

        sqrdmulh        v16.8h,  v16.8h,  v4.8h  // round2((scaling << (15 - scaling_shift)) * grain, 15)
        sqrdmulh        v17.8h,  v17.8h,  v5.8h
        sqrdmulh        v18.8h,  v18.8h,  v6.8h
        sqrdmulh        v19.8h,  v19.8h,  v7.8h

        usqadd          v10.8h,  v16.8h          // *src + noise
        usqadd          v11.8h,  v17.8h
        usqadd          v12.8h,  v18.8h
        usqadd          v13.8h,  v19.8h

        umax            v0.8h,   v10.8h,  v30.8h
        umax            v1.8h,   v11.8h,  v30.8h
        umax            v2.8h,   v12.8h,  v30.8h
        umax            v3.8h,   v13.8h,  v30.8h
        umin            v0.8h,   v0.8h,   v31.8h
        umin            v1.8h,   v1.8h,   v31.8h
        umin            v2.8h,   v2.8h,   v31.8h
        umin            v3.8h,   v3.8h,   v31.8h

        subs            w9,  w9,  #1
.if \oy
        // After the overlapped rows, switch to the second coefficient pair
        dup             v25.8h,  v28.h[0]
        dup             v26.8h,  v28.h[1]
.endif
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        cmp             w12, #0
        mov             w9,  w12                // restore actual remaining h
        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
fguv_loop_sx0 0, 0, 0
fguv_loop_sx0 0, 0, 1
fguv_loop_sx0 0, 1, 0
fguv_loop_sx0 0, 1, 1
fguv_loop_sx0 1, 0, 0
fguv_loop_sx0 1, 0, 1
fguv_loop_sx0 1, 1, 0
fguv_loop_sx0 1, 1, 1

9:
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
endjumptable

// 16-wide chroma grain loop with horizontal subsampling (420/422
// layouts); luma is averaged 2:1 horizontally before use. Register
// roles match fguv_loop_sx0_neon above.
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
.if \ox
        ld1             {v18.4h}, [x4],  x10    // grain_lut old
.endif
.if \oy
        ld1             {v20.8h, v21.8h}, [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v19.4h}, [x11], x10    // grain_lut top old
.endif
        ld1             {v16.8h, v17.8h}, [x5],  x10 // grain_lut

.if \ox
        // Blend old/new grain across the vertical block edge
        smull           v18.4s,  v18.4h,  v27.4h
        smlal           v18.4s,  v16.4h,  v28.4h
.endif

.if \oy
.if \ox
        smull           v19.4s,  v19.4h,  v27.4h
        smlal           v19.4s,  v20.4h,  v28.4h
        sqrshrn         v18.4h,  v18.4s,  #5
        sqrshrn         v19.4h,  v19.4s,  #5
        smin            v18.4h,  v18.4h,  v15.4h
        smin            v19.4h,  v19.4h,  v15.4h
        smax            v18.4h,  v18.4h,  v14.4h
        smax            v19.4h,  v19.4h,  v14.4h
        ins             v16.d[0], v18.d[0]
        ins             v20.d[0], v19.d[0]
.endif

        // Blend current/top grain rows across the horizontal block edge
        smull           v0.4s,   v16.4h,  v26.4h
        smull2          v1.4s,   v16.8h,  v26.8h
        smull           v2.4s,   v17.4h,  v26.4h
        smull2          v3.4s,   v17.8h,  v26.8h
        smlal           v0.4s,   v20.4h,  v25.4h
        smlal2          v1.4s,   v20.8h,  v25.8h
        smlal           v2.4s,   v21.4h,  v25.4h
        smlal2          v3.4s,   v21.8h,  v25.8h
        sqrshrn         v16.4h,  v0.4s,   #5
        sqrshrn2        v16.8h,  v1.4s,   #5
        sqrshrn         v17.4h,  v2.4s,   #5
        sqrshrn2        v17.8h,  v3.4s,   #5
.endif

.if \ox && !\oy
        sqrshrn         v18.4h,  v18.4s,  #5
        smin            v18.4h,  v18.4h,  v15.4h
.endif
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6],  x7 // luma
.if \oy
        smin            v16.8h,  v16.8h,  v15.8h
        smin            v17.8h,  v17.8h,  v15.8h
        smax            v16.8h,  v16.8h,  v14.8h
        smax            v17.8h,  v17.8h,  v14.8h
.endif

.if \ox && !\oy
        smax            v18.4h,  v18.4h,  v14.4h
.endif
        ld1             {v10.8h, v11.8h}, [x1],  x2 // src
.if \ox && !\oy
        ins             v16.d[0], v18.d[0]
.endif
        // Average luma pairs horizontally (rounded) to match the
        // subsampled chroma width.
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v1.8h,   v2.8h,   v3.8h
        urshr           v0.8h,   v0.8h,   #1
        urshr           v1.8h,   v1.8h,   #1
.if !\csfl
        // Scaling index = clip((luma*uv_luma_mult + src*uv_mult >> 6)
        //                      + uv_offset, 0, bitdepth_max)
        smull           v2.4s,   v0.4h,   v8.4h
        smull2          v3.4s,   v0.8h,   v8.8h
        smull           v0.4s,   v1.4h,   v8.4h
        smull2          v1.4s,   v1.8h,   v8.8h
        smlal           v2.4s,   v10.4h,  v9.4h
        smlal2          v3.4s,   v10.8h,  v9.8h
        smlal           v0.4s,   v11.4h,  v9.4h
        smlal2          v1.4s,   v11.8h,  v9.8h
        shrn            v2.4h,   v2.4s,   #6
        shrn2           v2.8h,   v3.4s,   #6
        shrn            v3.4h,   v0.4s,   #6
        shrn2           v3.8h,   v1.4s,   #6
        add             v0.8h,   v2.8h,   v24.8h
        add             v1.8h,   v3.8h,   v24.8h
        movi            v2.8h,   #0
        smin            v0.8h,   v0.8h,   v23.8h
        smin            v1.8h,   v1.8h,   v23.8h
        smax            v0.8h,   v0.8h,   v2.8h
        smax            v1.8h,   v1.8h,   v2.8h
.else
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        and             v0.16b,  v0.16b,  v23.16b
        and             v1.16b,  v1.16b,  v23.16b
.endif

        bl              gather16_neon

        uxtl            v4.8h,   v6.8b          // scaling
        uxtl2           v5.8h,   v6.16b

        ushl            v4.8h,   v4.8h,   v29.8h // scaling << (15 - scaling_shift)
        ushl            v5.8h,   v5.8h,   v29.8h

        sqrdmulh        v16.8h,  v16.8h,  v4.8h  // round2((scaling << (15 - scaling_shift)) * grain, 15)
        sqrdmulh        v17.8h,  v17.8h,  v5.8h

        usqadd          v10.8h,  v16.8h          // *src + noise
        usqadd          v11.8h,  v17.8h

        umax            v0.8h,   v10.8h,  v30.8h
        umax            v1.8h,   v11.8h,  v30.8h
        umin            v0.8h,   v0.8h,   v31.8h
        umin            v1.8h,   v1.8h,   v31.8h

.if \oy
        mov             v16.16b, v25.16b
.endif
        subs            w9,  w9,  #1
.if \oy
        // Swap the two overlap coefficient vectors for the next row
        mov             v25.16b, v26.16b
        mov             v26.16b, v16.16b
.endif
        st1             {v0.8h, v1.8h}, [x0], x2 // dst
        b.gt            1b

.if \oy
        cmp             w12, #0
        mov             w9,  w12                // restore actual remaining h
        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
fguv_loop_sx1 0, 0, 0
fguv_loop_sx1 0, 0, 1
fguv_loop_sx1 0, 1, 0
fguv_loop_sx1 0, 1, 1
fguv_loop_sx1 1, 0, 0
fguv_loop_sx1 1, 0, 1
fguv_loop_sx1 1, 1, 0
fguv_loop_sx1 1, 1, 1

9:
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldr             x30, [sp], #80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
endjumptable