// filmgrain.S (73466B)
/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */ 27 28 #include "src/arm/asm.S" 29 #include "util.S" 30 #include "src/arm/asm-offsets.h" 31 32 #define GRAIN_WIDTH 82 33 #define GRAIN_HEIGHT 73 34 35 #define SUB_GRAIN_WIDTH 44 36 #define SUB_GRAIN_HEIGHT 38 37 38 .macro increment_seed steps, shift=1 39 lsr w11, w2, #3 40 lsr w12, w2, #12 41 lsr w13, w2, #1 42 eor w11, w2, w11 // (r >> 0) ^ (r >> 3) 43 eor w12, w12, w13 // (r >> 12) ^ (r >> 1) 44 eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) 45 .if \shift 46 lsr w2, w2, #\steps 47 .endif 48 and w11, w11, #((1 << \steps) - 1) // bit 49 .if \shift 50 orr w2, w2, w11, lsl #(16 - \steps) // *state 51 .else 52 orr w2, w2, w11, lsl #16 // *state 53 .endif 54 .endm 55 56 .macro read_rand dest, bits, age 57 ubfx \dest, x2, #16 - \bits - \age, #\bits 58 .endm 59 60 .macro read_shift_rand dest, bits 61 ubfx \dest, x2, #17 - \bits, #\bits 62 lsr w2, w2, #1 63 .endm 64 65 // special calling convention: 66 // w2 holds seed 67 // x3 holds dav1d_gaussian_sequence 68 // clobbers x11-x15 69 // returns in v0.8h 70 function get_gaussian_neon 71 increment_seed 4 72 read_rand x14, 11, 3 73 read_rand x15, 11, 2 74 add x14, x3, x14, lsl #1 75 add x15, x3, x15, lsl #1 76 ld1 {v0.h}[0], [x14] 77 read_rand x14, 11, 1 78 ld1 {v0.h}[1], [x15] 79 add x14, x3, x14, lsl #1 80 read_rand x15, 11, 0 81 increment_seed 4 82 add x15, x3, x15, lsl #1 83 ld1 {v0.h}[2], [x14] 84 read_rand x14, 11, 3 85 ld1 {v0.h}[3], [x15] 86 add x14, x3, x14, lsl #1 87 read_rand x15, 11, 2 88 ld1 {v0.h}[4], [x14] 89 add x15, x3, x15, lsl #1 90 read_rand x14, 11, 1 91 ld1 {v0.h}[5], [x15] 92 read_rand x15, 11, 0 93 add x14, x3, x14, lsl #1 94 add x15, x3, x15, lsl #1 95 ld1 {v0.h}[6], [x14] 96 ld1 {v0.h}[7], [x15] 97 ret 98 endfunc 99 100 .macro get_grain_row r0, r1, r2, r3, r4, r5 101 bl get_gaussian_neon 102 srshl \r5\().8h, v0.8h, v31.8h 103 xtn \r0\().8b, \r5\().8h 104 bl get_gaussian_neon 105 srshl \r5\().8h, v0.8h, v31.8h 106 xtn2 \r0\().16b, \r5\().8h 107 bl get_gaussian_neon 108 srshl 
\r5\().8h, v0.8h, v31.8h 109 xtn \r1\().8b, \r5\().8h 110 bl get_gaussian_neon 111 srshl \r5\().8h, v0.8h, v31.8h 112 xtn2 \r1\().16b, \r5\().8h 113 bl get_gaussian_neon 114 srshl \r5\().8h, v0.8h, v31.8h 115 xtn \r2\().8b, \r5\().8h 116 bl get_gaussian_neon 117 srshl \r5\().8h, v0.8h, v31.8h 118 xtn2 \r2\().16b, \r5\().8h 119 bl get_gaussian_neon 120 srshl \r5\().8h, v0.8h, v31.8h 121 xtn \r3\().8b, \r5\().8h 122 bl get_gaussian_neon 123 srshl \r5\().8h, v0.8h, v31.8h 124 xtn2 \r3\().16b, \r5\().8h 125 bl get_gaussian_neon 126 srshl \r5\().8h, v0.8h, v31.8h 127 xtn \r4\().8b, \r5\().8h 128 bl get_gaussian_neon 129 srshl \r5\().8h, v0.8h, v31.8h 130 xtn2 \r4\().16b, \r5\().8h 131 increment_seed 2 132 read_rand x14, 11, 1 133 read_rand x15, 11, 0 134 add x14, x3, x14, lsl #1 135 add x15, x3, x15, lsl #1 136 ld1 {\r5\().h}[0], [x14] 137 ld1 {\r5\().h}[1], [x15] 138 srshl v0.4h, \r5\().4h, v31.4h 139 xtn \r5\().8b, v0.8h 140 .endm 141 142 .macro store_grain_row r0, r1, r2, r3, r4, r5 143 st1 {\r0\().16b,\r1\().16b}, [x0], #32 144 st1 {\r2\().16b,\r3\().16b}, [x0], #32 145 st1 {\r4\().16b}, [x0], #16 146 st1 {\r5\().h}[0], [x0], #2 147 .endm 148 149 .macro get_grain_row_44 r0, r1, r2 150 bl get_gaussian_neon 151 srshl \r2\().8h, v0.8h, v31.8h 152 xtn \r0\().8b, \r2\().8h 153 bl get_gaussian_neon 154 srshl \r2\().8h, v0.8h, v31.8h 155 xtn2 \r0\().16b, \r2\().8h 156 bl get_gaussian_neon 157 srshl \r2\().8h, v0.8h, v31.8h 158 xtn \r1\().8b, \r2\().8h 159 bl get_gaussian_neon 160 srshl \r2\().8h, v0.8h, v31.8h 161 xtn2 \r1\().16b, \r2\().8h 162 bl get_gaussian_neon 163 srshl \r2\().8h, v0.8h, v31.8h 164 xtn \r2\().8b, \r2\().8h 165 166 increment_seed 4 167 read_rand x14, 11, 3 168 read_rand x15, 11, 2 169 add x14, x3, x14, lsl #1 170 add x15, x3, x15, lsl #1 171 ld1 {v0.h}[0], [x14] 172 read_rand x14, 11, 1 173 ld1 {v0.h}[1], [x15] 174 read_rand x15, 11, 0 175 add x14, x3, x14, lsl #1 176 add x15, x3, x15, lsl #1 177 ld1 {v0.h}[2], [x14] 178 ld1 {v0.h}[3], [x15] 179 srshl 
v0.4h, v0.4h, v31.4h 180 xtn2 \r2\().16b, v0.8h 181 .endm 182 183 .macro store_grain_row_44 r0, r1, r2 184 st1 {\r0\().16b,\r1\().16b}, [x0], #32 185 st1 {\r2\().16b}, [x0] 186 add x0, x0, #GRAIN_WIDTH-32 187 .endm 188 189 function get_grain_2_neon 190 increment_seed 2 191 read_rand x14, 11, 1 192 read_rand x15, 11, 0 193 add x14, x3, x14, lsl #1 194 add x15, x3, x15, lsl #1 195 ld1 {v0.h}[0], [x14] 196 ld1 {v0.h}[1], [x15] 197 srshl v0.4h, v0.4h, v31.4h 198 xtn v0.8b, v0.8h 199 ret 200 endfunc 201 202 .macro get_grain_2 dst 203 bl get_grain_2_neon 204 .ifnc \dst, v0 205 mov \dst\().8b, v0.8b 206 .endif 207 .endm 208 209 // w15 holds the number of entries to produce 210 // w14, w16 and w17 hold the previous output entries 211 // v0 holds the vector of produced entries 212 // v1 holds the input vector of sums from above 213 .macro output_lag n 214 function output_lag\n\()_neon 215 1: 216 read_shift_rand x13, 11 217 mov w11, v1.s[0] 218 ldrsh w12, [x3, x13, lsl #1] 219 ext v0.16b, v0.16b, v0.16b, #1 220 .if \n == 1 221 madd w11, w14, w4, w11 // sum (above) + *coeff * prev output 222 .elseif \n == 2 223 madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 224 madd w11, w14, w17, w11 // += *coeff * prev output 2 225 mov w16, w14 226 .else 227 madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 228 madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 229 madd w11, w14, w21, w11 // += *coeff * prev output 3 230 mov w17, w16 231 mov w16, w14 232 .endif 233 add w14, w11, w8 // 1 << (ar_coeff_shift - 1) 234 add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1) 235 asr w14, w14, w7 // >> ar_coeff_shift 236 asr w12, w12, w9 // >> (4 + grain_scale_shift) 237 add w14, w14, w12 238 cmp w14, w5 239 csel w14, w14, w5, le 240 cmp w14, w6 241 csel w14, w14, w6, ge 242 subs w15, w15, #1 243 ext v1.16b, v1.16b, v1.16b, #4 244 ins v0.b[15], w14 245 b.gt 1b 246 ret 247 endfunc 248 .endm 249 250 output_lag 1 251 output_lag 2 252 output_lag 3 253 254 255 
function sum_lag1_above_neon 256 smull v2.8h, v3.8b, v28.8b 257 smull2 v3.8h, v3.16b, v28.16b 258 smull v4.8h, v0.8b, v27.8b 259 smull2 v5.8h, v0.16b, v27.16b 260 smull v6.8h, v1.8b, v29.8b 261 smull2 v7.8h, v1.16b, v29.16b 262 saddl v0.4s, v2.4h, v4.4h 263 saddl2 v1.4s, v2.8h, v4.8h 264 saddl v2.4s, v3.4h, v5.4h 265 saddl2 v3.4s, v3.8h, v5.8h 266 saddw v4.4s, v0.4s, v6.4h 267 saddw2 v5.4s, v1.4s, v6.8h 268 saddw v6.4s, v2.4s, v7.4h 269 saddw2 v7.4s, v3.4s, v7.8h 270 ret 271 endfunc 272 273 .macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff 274 bl sum_\lag\()_above_neon 275 .ifc \type, uv_420 276 add x12, x19, #GRAIN_WIDTH 277 ld1 {v22.16b, v23.16b}, [x19], #32 278 ld1 {v24.16b, v25.16b}, [x12] 279 saddlp v22.8h, v22.16b 280 saddlp v23.8h, v23.16b 281 saddlp v24.8h, v24.16b 282 saddlp v25.8h, v25.16b 283 add v22.8h, v22.8h, v24.8h 284 add v23.8h, v23.8h, v25.8h 285 rshrn v0.8b, v22.8h, #2 286 rshrn2 v0.16b, v23.8h, #2 287 .endif 288 .ifc \type, uv_422 289 ld1 {v22.16b, v23.16b}, [x19], #32 290 saddlp v22.8h, v22.16b 291 saddlp v23.8h, v23.16b 292 rshrn v0.8b, v22.8h, #1 293 rshrn2 v0.16b, v23.8h, #1 294 .endif 295 .ifc \type, uv_444 296 ld1 {v0.16b}, [x19], #16 297 .endif 298 .if \uv_layout 299 .ifnb \uv_coeff 300 dup v1.16b, \uv_coeff 301 smull v2.8h, v0.8b, v1.8b 302 smull2 v3.8h, v0.16b, v1.16b 303 .else 304 smull v2.8h, v0.8b, v30.8b 305 smull2 v3.8h, v0.16b, v30.16b 306 .endif 307 saddw v4.4s, v4.4s, v2.4h 308 saddw2 v5.4s, v5.4s, v2.8h 309 saddw v6.4s, v6.4s, v3.4h 310 saddw2 v7.4s, v7.4s, v3.8h 311 .endif 312 .if \uv_layout && \elems == 16 313 b sum_\lag\()_y_\edge\()_start 314 .elseif \uv_layout == 444 && \elems == 15 315 b sum_\lag\()_y_\edge\()_start 316 .elseif \uv_layout == 422 && \elems == 9 317 b sum_\lag\()_uv_420_\edge\()_start 318 .else 319 sum_\lag\()_\type\()_\edge\()_start: 320 .ifc \edge, left 321 increment_seed 4 322 read_rand x12, 11, 3 323 read_rand x13, 11, 2 324 read_rand x14, 11, 1 325 add x12, x3, x12, lsl #1 326 
add x13, x3, x13, lsl #1 327 add x14, x3, x14, lsl #1 328 ld1 {v0.h}[5], [x12] 329 ld1 {v0.h}[6], [x13] 330 ld1 {v0.h}[7], [x14] 331 lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 332 srshl v0.8h, v0.8h, v31.8h 333 xtn2 v0.16b, v0.8h 334 ext v4.16b, v4.16b, v4.16b, #12 335 .ifc \lag, lag3 336 smov w17, v0.b[13] 337 .endif 338 .ifnc \lag, lag1 339 smov w16, v0.b[14] 340 .endif 341 smov w14, v0.b[15] 342 343 mov v1.16b, v4.16b 344 mov w15, #1 345 bl output_\lag\()_neon 346 .else 347 increment_seed 4, shift=0 348 mov v1.16b, v4.16b 349 mov w15, #4 350 bl output_\lag\()_neon 351 .endif 352 353 increment_seed 4, shift=0 354 mov v1.16b, v5.16b 355 mov w15, #4 356 bl output_\lag\()_neon 357 358 increment_seed 4, shift=0 359 mov v1.16b, v6.16b 360 .if \elems == 9 361 mov w15, #1 362 bl output_\lag\()_neon 363 lsr w2, w2, #3 364 365 read_rand x12, 11, 2 366 read_rand x13, 11, 1 367 read_rand x14, 11, 0 368 add x12, x3, x12, lsl #1 369 add x13, x3, x13, lsl #1 370 add x14, x3, x14, lsl #1 371 ld1 {v1.h}[0], [x12] 372 ld1 {v1.h}[1], [x13] 373 ld1 {v1.h}[2], [x14] 374 srshl v1.4h, v1.4h, v31.4h 375 xtn v1.8b, v1.8h 376 ext v0.16b, v0.16b, v1.16b, #7 377 .else 378 mov w15, #4 379 bl output_\lag\()_neon 380 381 increment_seed 4, shift=0 382 mov v1.16b, v7.16b 383 384 .ifc \edge, right 385 mov w15, #3 386 bl output_\lag\()_neon 387 read_shift_rand x15, 11 388 add x15, x3, x15, lsl #1 389 ld1 {v1.h}[0], [x15] 390 srshl v1.4h, v1.4h, v31.4h 391 ext v0.16b, v0.16b, v1.16b, #1 392 .else 393 mov w15, #4 394 bl output_\lag\()_neon 395 .endif 396 .endif 397 .if \store 398 st1 {v0.16b}, [x0], #16 399 .endif 400 ldr x30, [sp], #16 401 AARCH64_VALIDATE_LINK_REGISTER 402 ret 403 .endif 404 .endm 405 406 .macro sum_lag1_func type, uv_layout, edge, elems=16 407 function sum_\type\()_lag1_\edge\()_neon 408 AARCH64_SIGN_LINK_REGISTER 409 str x30, [sp, #-16]! 
410 sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 411 endfunc 412 .endm 413 414 sum_lag1_func y, 0, left 415 sum_lag1_func y, 0, mid 416 sum_lag1_func y, 0, right, 15 417 sum_lag1_func uv_444, 444, left 418 sum_lag1_func uv_444, 444, mid 419 sum_lag1_func uv_444, 444, right, 15 420 sum_lag1_func uv_422, 422, left 421 sum_lag1_func uv_422, 422, mid 422 sum_lag1_func uv_422, 422, right, 9 423 sum_lag1_func uv_420, 420, left 424 sum_lag1_func uv_420, 420, mid 425 sum_lag1_func uv_420, 420, right, 9 426 427 .macro sum_lag1 type, dst, left, mid, right, edge=mid 428 mov v3.16b, \mid\().16b 429 ext v0.16b, \left\().16b, \mid\().16b, #15 430 ext v1.16b, \mid\().16b, \right\().16b, #1 431 bl sum_\type\()_lag1_\edge\()_neon 432 mov \dst\().16b, v0.16b 433 .endm 434 435 .macro sum_y_lag1 dst, left, mid, right, edge=mid 436 sum_lag1 y, \dst, \left, \mid, \right, \edge 437 .endm 438 439 .macro sum_uv_444_lag1 dst, left, mid, right, edge=mid 440 sum_lag1 uv_444, \dst, \left, \mid, \right, \edge 441 .endm 442 443 .macro sum_uv_422_lag1 dst, left, mid, right, edge=mid 444 sum_lag1 uv_422, \dst, \left, \mid, \right, \edge 445 .endm 446 447 .macro sum_uv_420_lag1 dst, left, mid, right, edge=mid 448 sum_lag1 uv_420, \dst, \left, \mid, \right, \edge 449 .endm 450 451 452 function sum_lag2_above_neon 453 sub x12, x0, #2*GRAIN_WIDTH - 16 454 sub x13, x0, #1*GRAIN_WIDTH - 16 455 ld1 {v18.16b}, [x12] // load top right 456 ld1 {v21.16b}, [x13] 457 458 ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid 459 dup v26.16b, v30.b[0] 460 ext v23.16b, v16.16b, v17.16b, #15 461 dup v27.16b, v30.b[1] 462 ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right 463 dup v28.16b, v30.b[3] 464 ext v1.16b, v17.16b, v18.16b, #2 465 dup v29.16b, v30.b[4] 466 467 smull v2.8h, v22.8b, v26.8b 468 smull2 v3.8h, v22.16b, v26.16b 469 smull v4.8h, v23.8b, v27.8b 470 smull2 v5.8h, v23.16b, v27.16b 471 smull v6.8h, v0.8b, v28.8b 472 smull2 v7.8h, v0.16b, v28.16b 473 smull v0.8h, v1.8b, v29.8b 
474 smull2 v1.8h, v1.16b, v29.16b 475 saddl v22.4s, v2.4h, v4.4h 476 saddl2 v23.4s, v2.8h, v4.8h 477 saddl v26.4s, v3.4h, v5.4h 478 saddl2 v27.4s, v3.8h, v5.8h 479 saddl v2.4s, v0.4h, v6.4h 480 saddl2 v3.4s, v0.8h, v6.8h 481 saddl v6.4s, v1.4h, v7.4h 482 saddl2 v7.4s, v1.8h, v7.8h 483 add v4.4s, v22.4s, v2.4s 484 add v5.4s, v23.4s, v3.4s 485 add v6.4s, v26.4s, v6.4s 486 add v7.4s, v27.4s, v7.4s 487 488 ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid 489 dup v26.16b, v30.b[5] 490 ext v23.16b, v19.16b, v20.16b, #15 491 dup v27.16b, v30.b[6] 492 ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right 493 dup v28.16b, v30.b[8] 494 ext v1.16b, v20.16b, v21.16b, #2 495 dup v29.16b, v30.b[9] 496 497 smull v2.8h, v22.8b, v26.8b 498 smull2 v3.8h, v22.16b, v26.16b 499 smull v22.8h, v23.8b, v27.8b 500 smull2 v23.8h, v23.16b, v27.16b 501 smull v26.8h, v0.8b, v28.8b 502 smull2 v27.8h, v0.16b, v28.16b 503 smull v28.8h, v1.8b, v29.8b 504 smull2 v29.8h, v1.16b, v29.16b 505 saddl v0.4s, v2.4h, v22.4h 506 saddl2 v1.4s, v2.8h, v22.8h 507 saddl v2.4s, v3.4h, v23.4h 508 saddl2 v3.4s, v3.8h, v23.8h 509 saddl v22.4s, v26.4h, v28.4h 510 saddl2 v23.4s, v26.8h, v28.8h 511 saddl v26.4s, v27.4h, v29.4h 512 saddl2 v27.4s, v27.8h, v29.8h 513 add v0.4s, v0.4s, v22.4s 514 add v1.4s, v1.4s, v23.4s 515 add v2.4s, v2.4s, v26.4s 516 add v3.4s, v3.4s, v27.4s 517 dup v26.16b, v30.b[2] 518 dup v27.16b, v30.b[7] 519 smull v22.8h, v17.8b, v26.8b 520 smull2 v23.8h, v17.16b, v26.16b 521 smull v24.8h, v20.8b, v27.8b 522 smull2 v25.8h, v20.16b, v27.16b 523 add v4.4s, v4.4s, v0.4s 524 add v5.4s, v5.4s, v1.4s 525 add v6.4s, v6.4s, v2.4s 526 add v7.4s, v7.4s, v3.4s 527 528 mov v16.16b, v17.16b 529 mov v17.16b, v18.16b 530 531 saddl v0.4s, v22.4h, v24.4h 532 saddl2 v1.4s, v22.8h, v24.8h 533 saddl v2.4s, v23.4h, v25.4h 534 saddl2 v3.4s, v23.8h, v25.8h 535 mov v19.16b, v20.16b 536 mov v20.16b, v21.16b 537 add v4.4s, v4.4s, v0.4s 538 add v5.4s, v5.4s, v1.4s 539 add v6.4s, v6.4s, v2.4s 540 add v7.4s, v7.4s, 
v3.4s 541 ret 542 endfunc 543 544 .macro sum_lag2_func type, uv_layout, edge, elems=16 545 function sum_\type\()_lag2_\edge\()_neon 546 AARCH64_SIGN_LINK_REGISTER 547 str x30, [sp, #-16]! 548 .ifc \edge, left 549 sub x12, x0, #2*GRAIN_WIDTH 550 sub x13, x0, #1*GRAIN_WIDTH 551 ld1 {v17.16b}, [x12] // load the previous block right above 552 ld1 {v20.16b}, [x13] 553 .endif 554 sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] 555 endfunc 556 .endm 557 558 sum_lag2_func y, 0, left 559 sum_lag2_func y, 0, mid 560 sum_lag2_func y, 0, right, 15 561 sum_lag2_func uv_444, 444, left 562 sum_lag2_func uv_444, 444, mid 563 sum_lag2_func uv_444, 444, right, 15 564 sum_lag2_func uv_422, 422, left 565 sum_lag2_func uv_422, 422, mid 566 sum_lag2_func uv_422, 422, right, 9 567 sum_lag2_func uv_420, 420, left 568 sum_lag2_func uv_420, 420, mid 569 sum_lag2_func uv_420, 420, right, 9 570 571 572 function sum_lag3_above_neon 573 sub x11, x0, #3*GRAIN_WIDTH - 16 574 sub x12, x0, #2*GRAIN_WIDTH - 16 575 sub x13, x0, #1*GRAIN_WIDTH - 16 576 ld1 {v15.16b}, [x11] // load top right 577 ld1 {v18.16b}, [x12] 578 ld1 {v21.16b}, [x13] 579 580 ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid 581 dup v22.16b, v29.b[0] 582 ext v9.16b, v13.16b, v14.16b, #14 583 dup v23.16b, v29.b[1] 584 ext v10.16b, v13.16b, v14.16b, #15 585 dup v24.16b, v29.b[2] 586 dup v25.16b, v29.b[3] 587 ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right 588 dup v26.16b, v29.b[4] 589 ext v12.16b, v14.16b, v15.16b, #2 590 dup v27.16b, v29.b[5] 591 ext v13.16b, v14.16b, v15.16b, #3 592 dup v28.16b, v29.b[6] 593 594 smull v0.8h, v8.8b, v22.8b 595 smull2 v1.8h, v8.16b, v22.16b 596 smull v2.8h, v9.8b, v23.8b 597 smull2 v3.8h, v9.16b, v23.16b 598 smull v8.8h, v10.8b, v24.8b 599 smull2 v9.8h, v10.16b, v24.16b 600 smull v10.8h, v11.8b, v26.8b 601 smull2 v11.8h, v11.16b, v26.16b 602 saddl v22.4s, v0.4h, v2.4h 603 saddl2 v23.4s, v0.8h, v2.8h 604 saddl v24.4s, v1.4h, v3.4h 605 saddl2 v26.4s, 
v1.8h, v3.8h 606 saddl v0.4s, v8.4h, v10.4h 607 saddl2 v1.4s, v8.8h, v10.8h 608 saddl v2.4s, v9.4h, v11.4h 609 saddl2 v3.4s, v9.8h, v11.8h 610 smull v8.8h, v12.8b, v27.8b 611 smull2 v9.8h, v12.16b, v27.16b 612 smull v10.8h, v13.8b, v28.8b 613 smull2 v11.8h, v13.16b, v28.16b 614 smull v12.8h, v14.8b, v25.8b 615 smull2 v13.8h, v14.16b, v25.16b 616 add v4.4s, v22.4s, v0.4s 617 add v5.4s, v23.4s, v1.4s 618 add v6.4s, v24.4s, v2.4s 619 add v7.4s, v26.4s, v3.4s 620 saddl v0.4s, v8.4h, v10.4h 621 saddl2 v1.4s, v8.8h, v10.8h 622 saddl v2.4s, v9.4h, v11.4h 623 saddl2 v3.4s, v9.8h, v11.8h 624 add v4.4s, v4.4s, v0.4s 625 add v5.4s, v5.4s, v1.4s 626 add v6.4s, v6.4s, v2.4s 627 add v7.4s, v7.4s, v3.4s 628 saddw v4.4s, v4.4s, v12.4h 629 saddw2 v5.4s, v5.4s, v12.8h 630 saddw v6.4s, v6.4s, v13.4h 631 saddw2 v7.4s, v7.4s, v13.8h 632 633 ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid 634 dup v22.16b, v29.b[7] 635 ext v9.16b, v16.16b, v17.16b, #14 636 dup v23.16b, v29.b[8] 637 ext v10.16b, v16.16b, v17.16b, #15 638 dup v24.16b, v29.b[9] 639 dup v25.16b, v29.b[10] 640 ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right 641 dup v26.16b, v29.b[11] 642 ext v12.16b, v17.16b, v18.16b, #2 643 dup v27.16b, v29.b[12] 644 ext v13.16b, v17.16b, v18.16b, #3 645 dup v28.16b, v29.b[13] 646 647 smull v0.8h, v8.8b, v22.8b 648 smull2 v1.8h, v8.16b, v22.16b 649 smull v2.8h, v9.8b, v23.8b 650 smull2 v3.8h, v9.16b, v23.16b 651 smull v8.8h, v10.8b, v24.8b 652 smull2 v9.8h, v10.16b, v24.16b 653 smull v10.8h, v11.8b, v26.8b 654 smull2 v11.8h, v11.16b, v26.16b 655 saddl v22.4s, v0.4h, v2.4h 656 saddl2 v23.4s, v0.8h, v2.8h 657 saddl v24.4s, v1.4h, v3.4h 658 saddl2 v26.4s, v1.8h, v3.8h 659 saddl v0.4s, v8.4h, v10.4h 660 saddl2 v1.4s, v8.8h, v10.8h 661 saddl v2.4s, v9.4h, v11.4h 662 saddl2 v3.4s, v9.8h, v11.8h 663 smull v8.8h, v12.8b, v27.8b 664 smull2 v9.8h, v12.16b, v27.16b 665 smull v10.8h, v13.8b, v28.8b 666 smull2 v11.8h, v13.16b, v28.16b 667 smull v12.8h, v17.8b, v25.8b 668 smull2 v13.8h, 
v17.16b, v25.16b 669 add v22.4s, v22.4s, v0.4s 670 add v23.4s, v23.4s, v1.4s 671 add v24.4s, v24.4s, v2.4s 672 add v26.4s, v26.4s, v3.4s 673 saddl v0.4s, v8.4h, v10.4h 674 saddl2 v1.4s, v8.8h, v10.8h 675 saddl v2.4s, v9.4h, v11.4h 676 saddl2 v3.4s, v9.8h, v11.8h 677 add v4.4s, v4.4s, v22.4s 678 add v5.4s, v5.4s, v23.4s 679 add v6.4s, v6.4s, v24.4s 680 add v7.4s, v7.4s, v26.4s 681 add v4.4s, v4.4s, v0.4s 682 add v5.4s, v5.4s, v1.4s 683 add v6.4s, v6.4s, v2.4s 684 add v7.4s, v7.4s, v3.4s 685 saddw v4.4s, v4.4s, v12.4h 686 saddw2 v5.4s, v5.4s, v12.8h 687 saddw v6.4s, v6.4s, v13.4h 688 saddw2 v7.4s, v7.4s, v13.8h 689 690 ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid 691 dup v22.16b, v29.b[14] 692 ext v9.16b, v19.16b, v20.16b, #14 693 dup v23.16b, v29.b[15] 694 ext v10.16b, v19.16b, v20.16b, #15 695 dup v24.16b, v30.b[0] 696 dup v25.16b, v30.b[1] 697 ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right 698 dup v26.16b, v30.b[2] 699 ext v12.16b, v20.16b, v21.16b, #2 700 dup v27.16b, v30.b[3] 701 ext v13.16b, v20.16b, v21.16b, #3 702 dup v28.16b, v30.b[4] 703 704 smull v0.8h, v8.8b, v22.8b 705 smull2 v1.8h, v8.16b, v22.16b 706 smull v2.8h, v9.8b, v23.8b 707 smull2 v3.8h, v9.16b, v23.16b 708 smull v8.8h, v10.8b, v24.8b 709 smull2 v9.8h, v10.16b, v24.16b 710 smull v10.8h, v11.8b, v26.8b 711 smull2 v11.8h, v11.16b, v26.16b 712 saddl v22.4s, v0.4h, v2.4h 713 saddl2 v23.4s, v0.8h, v2.8h 714 saddl v24.4s, v1.4h, v3.4h 715 saddl2 v26.4s, v1.8h, v3.8h 716 saddl v0.4s, v8.4h, v10.4h 717 saddl2 v1.4s, v8.8h, v10.8h 718 saddl v2.4s, v9.4h, v11.4h 719 saddl2 v3.4s, v9.8h, v11.8h 720 smull v8.8h, v12.8b, v27.8b 721 smull2 v9.8h, v12.16b, v27.16b 722 smull v10.8h, v13.8b, v28.8b 723 smull2 v11.8h, v13.16b, v28.16b 724 smull v12.8h, v20.8b, v25.8b 725 smull2 v19.8h, v20.16b, v25.16b 726 add v22.4s, v22.4s, v0.4s 727 add v23.4s, v23.4s, v1.4s 728 add v24.4s, v24.4s, v2.4s 729 add v26.4s, v26.4s, v3.4s 730 saddl v0.4s, v8.4h, v10.4h 731 saddl2 v1.4s, v8.8h, v10.8h 732 saddl 
v2.4s, v9.4h, v11.4h 733 saddl2 v3.4s, v9.8h, v11.8h 734 add v4.4s, v4.4s, v22.4s 735 add v5.4s, v5.4s, v23.4s 736 add v6.4s, v6.4s, v24.4s 737 add v7.4s, v7.4s, v26.4s 738 mov v13.16b, v14.16b 739 mov v14.16b, v15.16b 740 add v4.4s, v4.4s, v0.4s 741 add v5.4s, v5.4s, v1.4s 742 add v6.4s, v6.4s, v2.4s 743 add v7.4s, v7.4s, v3.4s 744 mov v16.16b, v17.16b 745 mov v17.16b, v18.16b 746 saddw v4.4s, v4.4s, v12.4h 747 saddw2 v5.4s, v5.4s, v12.8h 748 saddw v6.4s, v6.4s, v19.4h 749 saddw2 v7.4s, v7.4s, v19.8h 750 751 mov v19.16b, v20.16b 752 mov v20.16b, v21.16b 753 ret 754 endfunc 755 756 .macro sum_lag3_func type, uv_layout, edge, elems=16 757 function sum_\type\()_lag3_\edge\()_neon 758 AARCH64_SIGN_LINK_REGISTER 759 str x30, [sp, #-16]! 760 .ifc \edge, left 761 sub x11, x0, #3*GRAIN_WIDTH 762 sub x12, x0, #2*GRAIN_WIDTH 763 sub x13, x0, #1*GRAIN_WIDTH 764 ld1 {v14.16b}, [x11] // load the previous block right above 765 ld1 {v17.16b}, [x12] 766 ld1 {v20.16b}, [x13] 767 .endif 768 sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] 769 endfunc 770 .endm 771 772 sum_lag3_func y, 0, left 773 sum_lag3_func y, 0, mid 774 sum_lag3_func y, 0, right, 15 775 sum_lag3_func uv_444, 444, left 776 sum_lag3_func uv_444, 444, mid 777 sum_lag3_func uv_444, 444, right, 15 778 sum_lag3_func uv_422, 422, left 779 sum_lag3_func uv_422, 422, mid 780 sum_lag3_func uv_422, 422, right, 9 781 sum_lag3_func uv_420, 420, left 782 sum_lag3_func uv_420, 420, mid 783 sum_lag3_func uv_420, 420, right, 9 784 785 function generate_grain_rows_neon 786 AARCH64_SIGN_LINK_REGISTER 787 str x30, [sp, #-16]! 788 1: 789 get_grain_row v16, v17, v18, v19, v20, v21 790 subs w1, w1, #1 791 store_grain_row v16, v17, v18, v19, v20, v21 792 b.gt 1b 793 ldr x30, [sp], #16 794 AARCH64_VALIDATE_LINK_REGISTER 795 ret 796 endfunc 797 798 function generate_grain_rows_44_neon 799 AARCH64_SIGN_LINK_REGISTER 800 str x30, [sp, #-16]! 
801 1: 802 get_grain_row_44 v16, v17, v18 803 subs w1, w1, #1 804 store_grain_row_44 v16, v17, v18 805 b.gt 1b 806 ldr x30, [sp], #16 807 AARCH64_VALIDATE_LINK_REGISTER 808 ret 809 endfunc 810 811 function get_grain_row_neon 812 AARCH64_SIGN_LINK_REGISTER 813 str x30, [sp, #-16]! 814 get_grain_row v16, v17, v18, v19, v20, v21 815 ldr x30, [sp], #16 816 AARCH64_VALIDATE_LINK_REGISTER 817 ret 818 endfunc 819 820 function get_grain_row_44_neon 821 AARCH64_SIGN_LINK_REGISTER 822 str x30, [sp, #-16]! 823 get_grain_row_44 v16, v17, v18 824 ldr x30, [sp], #16 825 AARCH64_VALIDATE_LINK_REGISTER 826 ret 827 endfunc 828 829 function add_uv_444_coeff_lag0_neon 830 add_coeff_lag0_start: 831 smull v2.8h, v0.8b, v27.8b 832 smull2 v3.8h, v0.16b, v27.16b 833 srshl v2.8h, v2.8h, v28.8h 834 srshl v3.8h, v3.8h, v28.8h 835 saddw v2.8h, v2.8h, v1.8b 836 saddw2 v3.8h, v3.8h, v1.16b 837 sqxtn v2.8b, v2.8h 838 sqxtn2 v2.16b, v3.8h 839 ret 840 endfunc 841 842 function add_uv_420_coeff_lag0_neon 843 ld1 {v4.16b, v5.16b}, [x19], #32 844 ld1 {v6.16b, v7.16b}, [x12], #32 845 saddlp v4.8h, v4.16b 846 saddlp v5.8h, v5.16b 847 saddlp v6.8h, v6.16b 848 saddlp v7.8h, v7.16b 849 add v4.8h, v4.8h, v6.8h 850 add v5.8h, v5.8h, v7.8h 851 rshrn v4.8b, v4.8h, #2 852 rshrn2 v4.16b, v5.8h, #2 853 and v0.16b, v4.16b, v0.16b 854 b add_coeff_lag0_start 855 endfunc 856 857 function add_uv_422_coeff_lag0_neon 858 ld1 {v4.16b, v5.16b}, [x19], #32 859 saddlp v4.8h, v4.16b 860 saddlp v5.8h, v5.16b 861 rshrn v4.8b, v4.8h, #1 862 rshrn2 v4.16b, v5.8h, #1 863 and v0.16b, v4.16b, v0.16b 864 b add_coeff_lag0_start 865 endfunc 866 867 .macro gen_grain_82 type 868 function generate_grain_\type\()_8bpc_neon, export=1 869 AARCH64_SIGN_LINK_REGISTER 870 stp x30, x19, [sp, #-96]! 
871 872 .ifc \type, uv_444 873 mov w13, w3 874 mov w14, #28 875 add x19, x1, #3*GRAIN_WIDTH 876 mov x1, x2 877 mul w13, w13, w14 878 .endif 879 movrel x3, X(gaussian_sequence) 880 ldr w2, [x1, #FGD_SEED] 881 ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] 882 .ifc \type, y 883 add x4, x1, #FGD_AR_COEFFS_Y 884 .else 885 add x4, x1, #FGD_AR_COEFFS_UV 886 .endif 887 movrel x16, gen_grain_\type\()_tbl 888 ldr w17, [x1, #FGD_AR_COEFF_LAG] 889 add w9, w9, #4 890 ldrsw x17, [x16, w17, uxtw #2] 891 dup v31.8h, w9 // 4 + data->grain_scale_shift 892 add x16, x16, x17 893 neg v31.8h, v31.8h 894 895 .ifc \type, uv_444 896 cmp w13, #0 897 mov w11, #0x49d8 898 mov w14, #0xb524 899 add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] 900 csel w11, w11, w14, ne 901 .endif 902 903 ldr w7, [x1, #FGD_AR_COEFF_SHIFT] 904 mov w8, #1 905 mov w10, #1 906 lsl w8, w8, w7 // 1 << ar_coeff_shift 907 lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) 908 lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) 909 lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) 910 mov w5, #127 911 mov w6, #-128 912 913 .ifc \type, uv_444 914 eor w2, w2, w11 915 .endif 916 917 br x16 918 919 L(generate_grain_\type\()_lag0): 920 AARCH64_VALID_JUMP_TARGET 921 .ifc \type, y 922 mov w1, #GRAIN_HEIGHT 923 bl generate_grain_rows_neon 924 .else 925 dup v28.8h, w7 926 ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] 927 movi v0.16b, #0 928 movi v1.16b, #255 929 ext v29.16b, v0.16b, v1.16b, #13 930 ext v30.16b, v1.16b, v0.16b, #1 931 neg v28.8h, v28.8h 932 933 mov w1, #3 934 bl generate_grain_rows_neon 935 mov w1, #GRAIN_HEIGHT-3 936 1: 937 ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 938 bl get_grain_row_neon 939 and v0.16b, v22.16b, v29.16b 940 mov v1.16b, v16.16b 941 bl add_uv_444_coeff_lag0_neon 942 mov v0.16b, v23.16b 943 mov v1.16b, v17.16b 944 mov v16.16b, v2.16b 945 bl add_uv_444_coeff_lag0_neon 946 ld1 {v26.16b}, [x19], #16 947 mov v0.16b, v24.16b 948 mov v1.16b, v18.16b 949 mov v17.16b, v2.16b 950 bl 
add_uv_444_coeff_lag0_neon 951 add x19, x19, #2 952 mov v0.16b, v25.16b 953 mov v1.16b, v19.16b 954 mov v18.16b, v2.16b 955 bl add_uv_444_coeff_lag0_neon 956 and v0.16b, v26.16b, v30.16b 957 mov v1.16b, v20.16b 958 mov v19.16b, v2.16b 959 bl add_uv_444_coeff_lag0_neon 960 mov v20.16b, v2.16b 961 subs w1, w1, #1 962 store_grain_row v16, v17, v18, v19, v20, v21 963 b.gt 1b 964 .endif 965 ldp x30, x19, [sp], #96 966 AARCH64_VALIDATE_LINK_REGISTER 967 ret 968 969 L(generate_grain_\type\()_lag1): 970 AARCH64_VALID_JUMP_TARGET 971 ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] 972 ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] 973 ld1r {v29.16b}, [x4] // ar_coeffs_y[2] 974 .ifc \type, y 975 ldrsb w4, [x4, #1] // ar_coeffs_y[3] 976 .else 977 add x4, x4, #2 978 .endif 979 980 mov w1, #3 981 .ifc \type, uv_444 982 ld1r {v30.16b}, [x4] // ar_coeffs_uv[4] 983 ldursb w4, [x4, #-1] // ar_coeffs_uv[3] 984 .endif 985 bl generate_grain_rows_neon 986 987 mov w1, #GRAIN_HEIGHT - 3 988 1: 989 sum_\type\()_lag1 v22, v16, v16, v17, left 990 sum_\type\()_lag1 v23, v16, v17, v18 991 sum_\type\()_lag1 v24, v17, v18, v19 992 sum_\type\()_lag1 v25, v18, v19, v20 993 sum_\type\()_lag1 v20, v19, v20, v21, right 994 get_grain_2 v21 995 subs w1, w1, #1 996 .ifc \type, uv_444 997 add x19, x19, #2 998 .endif 999 store_grain_row v22, v23, v24, v25, v20, v21 1000 mov v16.16b, v22.16b 1001 mov v17.16b, v23.16b 1002 mov v18.16b, v24.16b 1003 mov v19.16b, v25.16b 1004 b.gt 1b 1005 1006 ldp x30, x19, [sp], #96 1007 AARCH64_VALIDATE_LINK_REGISTER 1008 ret 1009 1010 L(generate_grain_\type\()_lag2): 1011 AARCH64_VALID_JUMP_TARGET 1012 ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] 1013 1014 smov w4, v30.b[10] 1015 smov w17, v30.b[11] 1016 1017 mov w1, #3 1018 bl generate_grain_rows_neon 1019 1020 mov w1, #GRAIN_HEIGHT - 3 1021 1: 1022 bl sum_\type\()_lag2_left_neon 1023 bl sum_\type\()_lag2_mid_neon 1024 bl sum_\type\()_lag2_mid_neon 1025 bl sum_\type\()_lag2_mid_neon 1026 bl 
sum_\type\()_lag2_right_neon 1027 get_grain_2 v16 1028 subs w1, w1, #1 1029 .ifc \type, uv_444 1030 add x19, x19, #2 1031 .endif 1032 st1 {v16.h}[0], [x0], #2 1033 b.gt 1b 1034 1035 ldp x30, x19, [sp], #96 1036 AARCH64_VALIDATE_LINK_REGISTER 1037 ret 1038 1039 L(generate_grain_\type\()_lag3): 1040 AARCH64_VALID_JUMP_TARGET 1041 ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] 1042 stp d8, d9, [sp, #16] 1043 stp d10, d11, [sp, #32] 1044 stp d12, d13, [sp, #48] 1045 stp d14, d15, [sp, #64] 1046 stp x20, x21, [sp, #80] 1047 1048 smov w4, v30.b[5] 1049 smov w20, v30.b[6] 1050 smov w21, v30.b[7] 1051 1052 mov w1, #3 1053 bl generate_grain_rows_neon 1054 1055 mov w1, #GRAIN_HEIGHT - 3 1056 1: 1057 bl sum_\type\()_lag3_left_neon 1058 bl sum_\type\()_lag3_mid_neon 1059 bl sum_\type\()_lag3_mid_neon 1060 bl sum_\type\()_lag3_mid_neon 1061 bl sum_\type\()_lag3_right_neon 1062 get_grain_2 v16 1063 subs w1, w1, #1 1064 .ifc \type, uv_444 1065 add x19, x19, #2 1066 .endif 1067 st1 {v16.h}[0], [x0], #2 1068 b.gt 1b 1069 1070 ldp x20, x21, [sp, #80] 1071 ldp d14, d15, [sp, #64] 1072 ldp d12, d13, [sp, #48] 1073 ldp d10, d11, [sp, #32] 1074 ldp d8, d9, [sp, #16] 1075 ldp x30, x19, [sp], #96 1076 AARCH64_VALIDATE_LINK_REGISTER 1077 ret 1078 endfunc 1079 1080 jumptable gen_grain_\type\()_tbl 1081 .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl 1082 .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl 1083 .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl 1084 .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl 1085 endjumptable 1086 .endm 1087 1088 gen_grain_82 y 1089 gen_grain_82 uv_444 1090 1091 .macro set_height dst, type 1092 .ifc \type, uv_420 1093 mov \dst, #SUB_GRAIN_HEIGHT-3 1094 .else 1095 mov \dst, #GRAIN_HEIGHT-3 1096 .endif 1097 .endm 1098 1099 .macro increment_y_ptr reg, type 1100 .ifc \type, uv_420 1101 add \reg, \reg, #2*GRAIN_WIDTH-(3*32) 1102 .else 1103 sub \reg, \reg, #3*32-GRAIN_WIDTH 1104 
.endif
.endm

// Generate a 44-wide chroma grain image (for 4:2:0 / 4:2:2 layouts).
// C equivalent: generate_grain_uv_{420,422}():
//   x0 = buf (output grain, stride GRAIN_WIDTH)
//   x1 = buf_y (luma grain to derive from)      — presumably; TODO confirm against caller
//   x2 = Dav1dFilmGrainData *data
//   w3 = uv plane index (selects ar_coeffs_uv[uv] and the seed xor constant)
// Dispatches through gen_grain_\type\()_tbl on data->ar_coeff_lag.
.macro gen_grain_44 type
function generate_grain_\type\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!   // 96-byte frame; lag3 also saves d8-d15/x20-x21 in it

        mov             w13, w3
        mov             w14, #28
        add             x19, x1, #3*GRAIN_WIDTH-3 // x19 = pointer into luma grain (3 rows + 3 cols in)
        mov             x1,  x2                  // x1 = data
        mul             w13, w13, w14            // w13 = uv * 28 (byte offset between ar_coeffs_uv sets)

        movrel          x3,  X(gaussian_sequence)
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
        add             x4,  x1,  #FGD_AR_COEFFS_UV
        movrel          x16, gen_grain_\type\()_tbl
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrsw           x17, [x16, w17, uxtw #2] // fetch jump-table offset for ar_coeff_lag
        dup             v31.8h, w9               // 4 + data->grain_scale_shift
        add             x16, x16, x17            // resolve relative entry to absolute address
        neg             v31.8h, v31.8h           // negated: used as right shift via srshl

        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw      // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne        // seed xor constant differs for uv==0 vs uv!=0

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7             // 1 << ar_coeff_shift
        lsl             w10, w10, w9             // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1             // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1             // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #127                // clamp limits for grain values
        mov             w6,  #-128

        eor             w2,  w2,  w11            // per-plane seed

        br              x16                      // dispatch on ar_coeff_lag (0..3)

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h, w7
        ld1r            {v27.16b}, [x4]          // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        ext             v29.16b, v0.16b, v1.16b, #13 // edge masks: leading 3 lanes off
        ext             v30.16b, v1.16b, v0.16b, #7  // trailing lanes off
        neg             v28.8h,  v28.8h

        mov             w1,  #3
        bl              generate_grain_rows_44_neon
        set_height      w1,  \type
1:
        bl              get_grain_row_44_neon
.ifc \type, uv_420
        add             x12, x19, #GRAIN_WIDTH   // second luma row for 420 vertical averaging
.endif
        // Three 16-lane chunks per 44-wide row; v0 is the lane mask for each chunk.
        mov             v0.16b,  v29.16b
        mov             v1.16b,  v16.16b
        bl              add_\type\()_coeff_lag0_neon
        movi            v0.16b,  #255
        mov             v1.16b,  v17.16b
        mov             v16.16b, v2.16b
        bl              add_\type\()_coeff_lag0_neon
        mov             v0.16b,  v30.16b
        mov             v1.16b,  v18.16b
        mov             v17.16b, v2.16b
        bl              add_\type\()_coeff_lag0_neon
        mov             v18.16b, v2.16b
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        store_grain_row_44 v16, v17, v18
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.16b}, [x4], #1      // ar_coeffs_uv[0]
        ld1r            {v28.16b}, [x4], #1      // ar_coeffs_uv[1]
        ld1r            {v29.16b}, [x4]          // ar_coeffs_uv[2]
        add             x4,  x4,  #2

        mov             w1,  #3
        ld1r            {v30.16b}, [x4]          // ar_coeffs_uv[4]
        ldursb          w4,  [x4, #-1]           // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        // Produce one row from the previous row (v16-v18), with left/right edge variants.
        sum_\type\()_lag1 v20, v16, v16, v17, left
        sum_\type\()_lag1 v21, v16, v17, v18
        sum_\type\()_lag1 v18, v17, v18, v18, right
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        store_grain_row_44 v20, v21, v18
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4]          // ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]           // scalar coeffs used by the sum helpers
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH-48 // helpers advanced x0 by 48; step to next row
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, [x4]                // ar_coeffs_uv[0-15]
        ldr             q30, [x4, #16]           // ar_coeffs_uv[16-24]
        stp             d8,  d9,  [sp, #16]      // lag3 helpers use v8-v15 (callee-saved low halves)
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]            // scalar coeffs used by the sum helpers
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH-48 // helpers advanced x0 by 48; step to next row
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Position-independent dispatch table, indexed by ar_coeff_lag (0..3).
jumptable gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

// Gather one byte per lane from the scaling LUT at x3, using the byte lanes of
// \src1/\src2 as indices. Lane moves and loads are interleaved to hide latency.
// Clobbers w14-w17 (x14-x17 as addresses).
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0+\off]
        umov            w15, \src2[8+\off]
        umov            w16, \src1[2+\off]
        add             x14, x14, x3
        umov            w17, \src2[10+\off]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off], [x14]
        umov            w14, \src1[4+\off]
        add             x16, x16, x3
        ld1             {\dst2}[8+\off], [x15]
        umov            w15, \src2[12+\off]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off], [x16]
        umov            w16, \src1[6+\off]
        add             x14, x14, x3
        ld1             {\dst2}[10+\off], [x17]
        umov            w17, \src2[14+\off]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off], [x14]
        add             x16, x16, x3
        ld1             {\dst2}[12+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off], [x16]
        ld1             {\dst2}[14+\off], [x17]
.endm

// Full 32-lane gather: four interleaved passes cover all even/odd lanes of
// both source registers.
.macro gather dst1, dst2, src1, src2
        gather_interleaved \dst1, \dst2, \src1, \src2, 0
        gather_interleaved \dst2, \dst1, \src2, \src1, 0
        gather_interleaved \dst1, \dst2, \src1, \src2, 1
        gather_interleaved \dst2, \dst1, \src2, \src1, 1
.endm

// In: v0/v1 = 32 source pixels, x3 = scaling LUT. Out: v4/v5 = scaling values.
// Clobbers x14-x17.
function gather32_neon
        gather          v4.b, v5.b, v0.b, v1.b
        ret
endfunc

// In: v0 = 16 source pixels, x3 = scaling LUT. Out: v4 = scaling values
// (halves gathered separately, then merged). Clobbers x14-x17.
function gather16_neon
        gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
        gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
        ins             v4.d[1], v5.d[1]
        ret
endfunc

// Overlap blend coefficients: first 8 bytes = "old" weights, second 8 = "new".
// _0 for unsubsampled (two overlap columns/rows), _1 for subsampled (one).
const overlap_coeffs_0, align=4
        .byte           27, 17, 0,  0,  0,  0,  0,  0
        .byte           17, 27, 32, 32, 32, 32, 32, 32
endconst

const overlap_coeffs_1, align=4
        .byte           23, 0,  0,  0,  0,  0,  0,  0
        .byte           22, 32, 32, 32, 32, 32, 32, 32
endconst

// Split a packed random offset byte into x/y grain offsets, doubled along any
// unsubsampled axis.
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF      // randval & 0xF
        lsr             \offx, \src,  #4        // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy     // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx     // 2 * (randval >> 4)
.endif
.endm

.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw    // grain_lut += offx
.endm

// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
//                                const ptrdiff_t stride,
//                                const uint8_t scaling[SCALING_SIZE],
//                                const int scaling_shift,
//                                const entry grain_lut[][GRAIN_WIDTH],
//                                const int offsets[][2],
//                                const int h, const ptrdiff_t clip,
//                                const ptrdiff_t type);
// Setup for the luma loop: resolves the four grain_lut block pointers
// (x5 = current, x4 = left "old", x6 = top, x8 = top-left "old"), then tail-calls
// into fgy_loop_neon through fgy_loop_tbl on (type & overlap bits).
function fgy_32x32_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ldr             w11, [x6, #8]            // offsets[1][0]
        ldr             w13, [x6, #4]            // offsets[0][1]
        ldr             w15, [x6, #12]           // offsets[1][1]
        ldr             w6,  [x6]                // offsets[0][0]
        ldr             w8,  [sp, #16]           // clip
        mov             x9,  #GRAIN_WIDTH        // grain_lut stride

        neg             w4,  w4
        dup             v29.8h, w4               // -scaling_shift (right shift via srshl)

        movrel          x16, overlap_coeffs_0

        cbz             w8,  1f
        // clip: restricted range [16, 235]
        movi            v30.16b, #16
        movi            v31.16b, #235
        b               2f
1:
        // no clip: full range [0, 255]
        movi            v30.16b, #0
        movi            v31.16b, #255
2:

        ld1             {v27.8b, v28.8b}, [x16]  // overlap_coeffs

        add             x5,  x5,  #9             // grain_lut += 9
        add             x5,  x5,  x9,  lsl #3    // grain_lut += 8 * grain_stride
        add             x5,  x5,  x9             // grain_lut += grain_stride

        calc_offset     w11, w12, w11, 0, 0
        calc_offset     w13, w14, w13, 0, 0
        calc_offset     w15, w16, w15, 0, 0
        calc_offset     w6,  w10, w6,  0, 0

        add_offset      x12, w11, x12, x5, x9
        add_offset      x14, w13, x14, x5, x9
        add_offset      x16, w15, x16, x5, x9
        add_offset      x5,  w6,  x10, x5, x9    // x5 = grain_lut for the current block

        ldr             w11, [sp, #24]           // type
        movrel          x13, fgy_loop_tbl

        add             x4,  x12, #32            // grain_lut += FG_BLOCK_SIZE * bx
        add             x6,  x14, x9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        tst             w11, #1                  // bit 0 of type = overlap flag
        ldrsw           x11, [x13, w11, uxtw #2]

        add             x8,  x16, x9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x8,  x8,  #32            // grain_lut += FG_BLOCK_SIZE * bx

        add             x11, x13, x11            // resolve relative jump-table entry

        b.eq            1f
        // y overlap: first 2 rows use the (27,17) blend weights
        dup             v6.16b,  v27.b[0]
        dup             v7.16b,  v27.b[1]
        mov             w10, w7                  // backup actual h
        mov             w7,  #2
1:
        br              x11
endfunc

// Luma per-row loop, one variant per (x-overlap, y-overlap) combination.
// Registers on entry (set up by fgy_32x32_8bpc_neon above):
//   x0 dst, x1 src, x2 stride, x3 scaling LUT, x5 grain, x4 grain left-old,
//   x6 grain top, x8 grain top-left-old, x9 grain stride, w7 row count,
//   v6/v7 y-overlap weights, v27/v28 x-overlap weights, v29 -scaling_shift,
//   v30/v31 clamp min/max.
function fgy_loop_neon
.macro fgy ox, oy
L(loop_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.16b,  v1.16b},  [x1],  x2 // src
.if \ox
        ld1             {v20.8b},           [x4],  x9 // grain_lut old
.endif
.if \oy
        ld1             {v22.16b, v23.16b}, [x6],  x9 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v21.8b},           [x8],  x9 // grain_lut top old
.endif
        ld1             {v18.16b, v19.16b}, [x5],  x9 // grain_lut

        bl              gather32_neon            // v4/v5 = scaling[src]

.if \ox
        // Horizontal overlap: blend old/new grain columns.
        smull           v20.8h,  v20.8b,  v27.8b
        smlal           v20.8h,  v18.8b,  v28.8b
.endif

.if \oy
.if \ox
        smull           v21.8h,  v21.8b,  v27.8b
        smlal           v21.8h,  v22.8b,  v28.8b
        sqrshrn         v20.8b,  v20.8h,  #5
        sqrshrn         v21.8b,  v21.8h,  #5
.endif

        // Vertical overlap: blend top/current grain rows with v6/v7 weights.
.if \ox
        smull           v16.8h,  v20.8b,  v7.8b
.else
        smull           v16.8h,  v18.8b,  v7.8b
.endif
        smull2          v17.8h,  v18.16b, v7.16b
        smull           v18.8h,  v19.8b,  v7.8b
        smull2          v19.8h,  v19.16b, v7.16b
.if \ox
        smlal           v16.8h,  v21.8b,  v6.8b
.else
        smlal           v16.8h,  v22.8b,  v6.8b
.endif
        smlal2          v17.8h,  v22.16b, v6.16b
        smlal           v18.8h,  v23.8b,  v6.8b
        smlal2          v19.8h,  v23.16b, v6.16b
        sqrshrn         v22.8b,  v16.8h,  #5
        sqrshrn2        v22.16b, v17.8h,  #5
        sqrshrn         v23.8b,  v18.8h,  #5
        sqrshrn2        v23.16b, v19.8h,  #5
.endif

        // sxtl of grain
.if \oy
        sxtl            v16.8h,  v22.8b
        sxtl2           v17.8h,  v22.16b
        sxtl            v18.8h,  v23.8b
        sxtl2           v19.8h,  v23.16b
.elseif \ox
        sqrshrn         v20.8b,  v20.8h,  #5
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        sxtl            v16.8h,  v20.8b
.else
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
.endif

        uxtl            v2.8h,   v4.8b           // scaling
        uxtl2           v3.8h,   v4.16b
        uxtl            v4.8h,   v5.8b
        uxtl2           v5.8h,   v5.16b

        mul             v16.8h,  v16.8h,  v2.8h  // scaling * grain
        mul             v17.8h,  v17.8h,  v3.8h
        mul             v18.8h,  v18.8h,  v4.8h
        mul             v19.8h,  v19.8h,  v5.8h

        srshl           v16.8h,  v16.8h,  v29.8h // round2(scaling * grain, scaling_shift)
        srshl           v17.8h,  v17.8h,  v29.8h
        srshl           v18.8h,  v18.8h,  v29.8h
        srshl           v19.8h,  v19.8h,  v29.8h

        uaddw           v16.8h,  v16.8h,  v0.8b  // *src + noise
        uaddw2          v17.8h,  v17.8h,  v0.16b
        uaddw           v18.8h,  v18.8h,  v1.8b
        uaddw2          v19.8h,  v19.8h,  v1.16b

        sqxtun          v0.8b,   v16.8h
        sqxtun2         v0.16b,  v17.8h
        sqxtun          v1.8b,   v18.8h
        sqxtun2         v1.16b,  v19.8h

        umax            v0.16b,  v0.16b,  v30.16b
        umax            v1.16b,  v1.16b,  v30.16b
        umin            v0.16b,  v0.16b,  v31.16b
        umin            v1.16b,  v1.16b,  v31.16b

        subs            w7,  w7,  #1
.if \oy
        dup             v6.16b,  v28.b[0]        // second overlap row uses (17,27) weights
        dup             v7.16b,  v28.b[1]
.endif
        st1             {v0.16b, v1.16b}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // After the 2 overlap rows, continue with the non-overlap variant.
        cmp             w10, #2
        sub             w7,  w10, #2             // restore actual remaining h
        b.gt            L(loop_\ox\()0)
.endif
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1
endfunc

jumptable fgy_loop_tbl
        .word L(loop_00) - fgy_loop_tbl
        .word L(loop_01) - fgy_loop_tbl
        .word L(loop_10) - fgy_loop_tbl
        .word L(loop_11) - fgy_loop_tbl
endjumptable

// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
//                                     const pixel *const src,
//                                     const ptrdiff_t stride,
//                                     const uint8_t scaling[SCALING_SIZE],
//                                     const Dav1dFilmGrainData *const data,
//                                     const entry grain_lut[][GRAIN_WIDTH],
//                                     const pixel *const luma_row,
//                                     const ptrdiff_t luma_stride,
//                                     const int offsets[][2],
//                                     const ptrdiff_t h, const ptrdiff_t uv,
//                                     const ptrdiff_t is_id,
//                                     const ptrdiff_t type);
// Chroma setup, parameterized on subsampling (sx, sy). Mirrors the luma setup
// above but additionally loads uv_luma_mult/uv_mult/uv_offset (for the !csfl
// path) and halves the block/offset geometry along subsampled axes. Tail-calls
// into fguv_loop_sx\sx\()_neon through its jump table.
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-32]!
        str             d8,  [sp, #16]           // v8 low half is callee-saved; used for mults
        ldp             x8,  x9,  [sp, #32]      // offsets, h
        ldp             x10, x11, [sp, #48]      // uv, is_id

        ldr             w13, [x4, #FGD_SCALING_SHIFT]
        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        neg             w13, w13                 // -scaling_shift

        // !csfl
        add             x10, x4,  x10, lsl #2    // + 4*uv
        add             x14, x10, #FGD_UV_LUMA_MULT
        add             x15, x10, #FGD_UV_MULT
        add             x10, x10, #FGD_UV_OFFSET
        ld1             {v8.h}[0], [x14]         // uv_luma_mult
        ld1r            {v24.8h},  [x10]         // uv_offset
        ld1             {v8.h}[1], [x15]         // uv_mult

        dup             v29.8h, w13              // -scaling_shift

        cbz             w12, 1f
        // clip: chroma restricted range [16, 240]
        movi            v30.16b, #16
        movi            v31.16b, #240
        cbz             w11, 2f
        // is_id: Y-style upper bound
        movi            v31.16b, #235
        b               2f
1:
        // no clip: full range [0, 255]
        movi            v30.16b, #0
        movi            v31.16b, #255
2:

        ldr             w12, [x8, #8]            // offsets[1][0]
        ldr             w14, [x8, #4]            // offsets[0][1]
        ldr             w16, [x8, #12]           // offsets[1][1]
        ldr             w8,  [x8]                // offsets[0][0]

        mov             x10, #GRAIN_WIDTH        // grain_lut stride

        add             x5,  x5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
.if \sy
        add             x5,  x5,  x10, lsl #2    // grain_lut += 4 * grain_stride
        add             x5,  x5,  x10, lsl #1    // grain_lut += 2 * grain_stride
.else
        add             x5,  x5,  x10, lsl #3    // grain_lut += 8 * grain_stride
        add             x5,  x5,  x10            // grain_lut += grain_stride
.endif

        calc_offset     w12, w13, w12, \sx, \sy
        calc_offset     w14, w15, w14, \sx, \sy
        calc_offset     w16, w17, w16, \sx, \sy
        calc_offset     w8,  w11, w8,  \sx, \sy

        add_offset      x13, w12, x13, x5, x10
        add_offset      x15, w14, x15, x5, x10
        add_offset      x17, w16, x17, x5, x10
        add_offset      x5,  w8,  x11, x5, x10   // x5 = grain_lut for the current block

        add             x4,  x13, #(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx
        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x11, x11, #(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx

        ldr             w13, [sp, #64]           // type

        movrel          x16, overlap_coeffs_\sx
        movrel          x14, fguv_loop_sx\sx\()_tbl

        ld1             {v27.8b, v28.8b}, [x16]  // overlap_coeffs
        tst             w13, #1                  // bit 0 of type = overlap flag
        ldrsw           x13, [x14, w13, uxtw #2]

        b.eq            1f
        // y overlap
        sub             w12, w9,  #(2 >> \sy)    // backup remaining h
        mov             w9,  #(2 >> \sy)         // 2 overlap rows (1 when vertically subsampled)

1:
        add             x13, x14, x13            // resolve relative jump-table entry

.if \sy
        movi            v25.16b, #23             // y-overlap weights (subsampled)
        movi            v26.16b, #22
.else
        movi            v25.16b, #27             // y-overlap weights (unsubsampled)
        movi            v26.16b, #17
.endif

.if \sy
        add             x7,  x7,  x7             // luma_stride *= 2
.endif

        br              x13
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0

// Chroma per-row loop for sx=0 (4:4:4): 32 pixels per row, luma read 1:1.
// Registers: x0 dst, x1 src, x2 stride, x3 scaling LUT, x5 grain, x4 left-old,
// x8 top, x11 top-left-old, x6 luma_row, x7 luma_stride, x10 grain stride,
// w9 row count, w12 saved remaining h, v8.h[0/1] uv_luma_mult/uv_mult,
// v24 uv_offset, v25/v26 y-overlap weights, v29 -scaling_shift, v30/v31 clamp.
function fguv_loop_sx0_neon
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.16b, v1.16b}, [x6], x7  // luma
        ld1             {v6.16b, v7.16b}, [x1], x2  // src
.if \ox
        ld1             {v20.8b},           [x4],  x10 // grain_lut old
.endif
.if \oy
        ld1             {v22.16b, v23.16b}, [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v21.8b},           [x11], x10 // grain_lut top old
.endif
        ld1             {v18.16b, v19.16b}, [x5],  x10 // grain_lut

.if !\csfl
        // val = clip((luma * uv_luma_mult + src * uv_mult >> 6) + uv_offset)
        uxtl            v2.8h,   v0.8b
        uxtl2           v3.8h,   v0.16b
        uxtl            v4.8h,   v1.8b
        uxtl2           v5.8h,   v1.16b
        uxtl            v0.8h,   v6.8b
        uxtl2           v1.8h,   v6.16b
        uxtl            v16.8h,  v7.8b
        uxtl2           v17.8h,  v7.16b
        mul             v2.8h,   v2.8h,   v8.h[0]
        mul             v3.8h,   v3.8h,   v8.h[0]
        mul             v4.8h,   v4.8h,   v8.h[0]
        mul             v5.8h,   v5.8h,   v8.h[0]
        mul             v0.8h,   v0.8h,   v8.h[1]
        mul             v1.8h,   v1.8h,   v8.h[1]
        mul             v16.8h,  v16.8h,  v8.h[1]
        mul             v17.8h,  v17.8h,  v8.h[1]
        sqadd           v2.8h,   v2.8h,   v0.8h
        sqadd           v3.8h,   v3.8h,   v1.8h
        sqadd           v4.8h,   v4.8h,   v16.8h
        sqadd           v5.8h,   v5.8h,   v17.8h
        sshr            v2.8h,   v2.8h,   #6
        sshr            v3.8h,   v3.8h,   #6
        sshr            v4.8h,   v4.8h,   #6
        sshr            v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v24.8h
        add             v3.8h,   v3.8h,   v24.8h
        add             v4.8h,   v4.8h,   v24.8h
        add             v5.8h,   v5.8h,   v24.8h
        sqxtun          v0.8b,   v2.8h
        sqxtun2         v0.16b,  v3.8h
        sqxtun          v1.8b,   v4.8h
        sqxtun2         v1.16b,  v5.8h
.endif

        bl              gather32_neon            // v4/v5 = scaling[val]

.if \ox
        // Horizontal overlap: blend old/new grain columns.
        smull           v20.8h,  v20.8b,  v27.8b
        smlal           v20.8h,  v18.8b,  v28.8b
.endif

.if \oy
.if \ox
        smull           v21.8h,  v21.8b,  v27.8b
        smlal           v21.8h,  v22.8b,  v28.8b
        sqrshrn         v20.8b,  v20.8h,  #5
        sqrshrn         v21.8b,  v21.8h,  #5
.endif

        // Vertical overlap: blend top/current grain rows with v25/v26 weights.
.if \ox
        smull           v16.8h,  v20.8b,  v26.8b
.else
        smull           v16.8h,  v18.8b,  v26.8b
.endif
        smull2          v17.8h,  v18.16b, v26.16b
        smull           v18.8h,  v19.8b,  v26.8b
        smull2          v19.8h,  v19.16b, v26.16b
.if \ox
        smlal           v16.8h,  v21.8b,  v25.8b
.else
        smlal           v16.8h,  v22.8b,  v25.8b
.endif
        smlal2          v17.8h,  v22.16b, v25.16b
        smlal           v18.8h,  v23.8b,  v25.8b
        smlal2          v19.8h,  v23.16b, v25.16b
        sqrshrn         v22.8b,  v16.8h,  #5
        sqrshrn2        v22.16b, v17.8h,  #5
        sqrshrn         v23.8b,  v18.8h,  #5
        sqrshrn2        v23.16b, v19.8h,  #5
.endif

        // sxtl of grain
.if \oy
        sxtl            v16.8h,  v22.8b
        sxtl2           v17.8h,  v22.16b
        sxtl            v18.8h,  v23.8b
        sxtl2           v19.8h,  v23.16b
.elseif \ox
        sqrshrn         v20.8b,  v20.8h,  #5
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        sxtl            v16.8h,  v20.8b
.else
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
.endif

        uxtl            v2.8h,   v4.8b           // scaling
        uxtl2           v3.8h,   v4.16b
        uxtl            v4.8h,   v5.8b
        uxtl2           v5.8h,   v5.16b

        mul             v16.8h,  v16.8h,  v2.8h  // scaling * grain
        mul             v17.8h,  v17.8h,  v3.8h
        mul             v18.8h,  v18.8h,  v4.8h
        mul             v19.8h,  v19.8h,  v5.8h

        srshl           v16.8h,  v16.8h,  v29.8h // round2(scaling * grain, scaling_shift)
        srshl           v17.8h,  v17.8h,  v29.8h
        srshl           v18.8h,  v18.8h,  v29.8h
        srshl           v19.8h,  v19.8h,  v29.8h

        uaddw           v16.8h,  v16.8h,  v6.8b  // *src + noise
        uaddw2          v17.8h,  v17.8h,  v6.16b
        uaddw           v18.8h,  v18.8h,  v7.8b
        uaddw2          v19.8h,  v19.8h,  v7.16b

        sqxtun          v0.8b,   v16.8h
        sqxtun2         v0.16b,  v17.8h
        sqxtun          v1.8b,   v18.8h
        sqxtun2         v1.16b,  v19.8h

        umax            v0.16b,  v0.16b,  v30.16b
        umax            v1.16b,  v1.16b,  v30.16b
        umin            v0.16b,  v0.16b,  v31.16b
        umin            v1.16b,  v1.16b,  v31.16b

        subs            w9,  w9,  #1
.if \oy
        dup             v25.16b, v28.b[0]        // second overlap row uses (17,27) weights
        dup             v26.16b, v28.b[1]
.endif
        st1             {v0.16b, v1.16b}, [x0], x2 // dst
        b.gt            1b

.if \oy
        // After the overlap rows, continue with the non-y-overlap variant.
        cmp             w12, #0
        mov             w9,  w12                 // restore actual remaining h
        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

9:
        // Shared epilogue for all sx0 variants.
        ldr             d8,  [sp, #16]
        ldr             x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
endjumptable

// Chroma per-row loop for sx=1 (4:2:0 / 4:2:2): 16 pixels per row; luma is
// horizontally averaged in pairs (uaddlp) before use. Register roles as for
// fguv_loop_sx0_neon above.
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.16b, v1.16b}, [x6], x7 // luma
        ld1             {v6.16b},         [x1], x2 // src
.if \ox
        ld1             {v20.8b},  [x4],  x10    // grain_lut old
.endif
.if \oy
        ld1             {v22.16b}, [x8],  x10    // grain_lut top
.endif
.if \ox && \oy
        ld1             {v21.8b},  [x11], x10    // grain_lut top old
.endif
        ld1             {v18.16b}, [x5],  x10    // grain_lut

        uaddlp          v2.8h,   v0.16b          // pairwise-sum luma (horizontal 2:1)
        uaddlp          v3.8h,   v1.16b
.if \csfl
        rshrn           v0.8b,   v2.8h,   #1     // rounded average
        rshrn2          v0.16b,  v3.8h,   #1
.else
        // val = clip((luma * uv_luma_mult + src * uv_mult >> 6) + uv_offset)
        urshr           v2.8h,   v2.8h,   #1
        urshr           v3.8h,   v3.8h,   #1
        uxtl            v0.8h,   v6.8b
        uxtl2           v1.8h,   v6.16b
        mul             v2.8h,   v2.8h,   v8.h[0]
        mul             v3.8h,   v3.8h,   v8.h[0]
        mul             v0.8h,   v0.8h,   v8.h[1]
        mul             v1.8h,   v1.8h,   v8.h[1]
        sqadd           v2.8h,   v2.8h,   v0.8h
        sqadd           v3.8h,   v3.8h,   v1.8h
        sshr            v2.8h,   v2.8h,   #6
        sshr            v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v24.8h
        add             v3.8h,   v3.8h,   v24.8h
        sqxtun          v0.8b,   v2.8h
        sqxtun2         v0.16b,  v3.8h
.endif

        bl              gather16_neon            // v4 = scaling[val]

.if \ox
        // Horizontal overlap: blend old/new grain columns.
        smull           v20.8h,  v20.8b,  v27.8b
        smlal           v20.8h,  v18.8b,  v28.8b
.endif

.if \oy
.if \ox
        smull           v21.8h,  v21.8b,  v27.8b
        smlal           v21.8h,  v22.8b,  v28.8b
        sqrshrn         v20.8b,  v20.8h,  #5
        sqrshrn         v21.8b,  v21.8h,  #5
.endif

        // Vertical overlap: blend top/current grain rows with v25/v26 weights.
.if \ox
        smull           v16.8h,  v20.8b,  v26.8b
.else
        smull           v16.8h,  v18.8b,  v26.8b
.endif
        smull2          v17.8h,  v18.16b, v26.16b
.if \ox
        smlal           v16.8h,  v21.8b,  v25.8b
.else
        smlal           v16.8h,  v22.8b,  v25.8b
.endif
        smlal2          v17.8h,  v22.16b, v25.16b
        sqrshrn         v22.8b,  v16.8h,  #5
        sqrshrn2        v22.16b, v17.8h,  #5
.endif

        // sxtl of grain
.if \oy
        sxtl            v16.8h,  v22.8b
        sxtl2           v17.8h,  v22.16b
.elseif \ox
        sqrshrn         v20.8b,  v20.8h,  #5
        sxtl2           v17.8h,  v18.16b
        sxtl            v16.8h,  v20.8b
.else
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
.endif

        uxtl            v2.8h,   v4.8b           // scaling
        uxtl2           v3.8h,   v4.16b

        mul             v16.8h,  v16.8h,  v2.8h  // scaling * grain
        mul             v17.8h,  v17.8h,  v3.8h

        srshl           v16.8h,  v16.8h,  v29.8h // round2(scaling * grain, scaling_shift)
        srshl           v17.8h,  v17.8h,  v29.8h

        uaddw           v16.8h,  v16.8h,  v6.8b  // *src + noise
        uaddw2          v17.8h,  v17.8h,  v6.16b

        sqxtun          v0.8b,   v16.8h
        sqxtun2         v0.16b,  v17.8h

        umax            v0.16b,  v0.16b,  v30.16b
        umin            v0.16b,  v0.16b,  v31.16b

.if \oy
        mov             v16.16b, v25.16b         // swap v25/v26 for the next row
.endif
        subs            w9,  w9,  #1
.if \oy
        mov             v25.16b, v26.16b
        mov             v26.16b, v16.16b
.endif
        st1             {v0.16b}, [x0], x2       // dst
        b.gt            1b

.if \oy
        // After the overlap rows, continue with the non-y-overlap variant.
        cmp             w12, #0
        mov             w9,  w12                 // restore actual remaining h
        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

9:
        // Shared epilogue for all sx1 variants.
        ldr             d8,  [sp, #16]
        ldr             x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
endjumptable