/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// 32 bytes of 0x00 followed by 32 bytes of 0xff. Loading 32 bytes starting
// at a negative offset from right_ext_mask yields a byte mask whose first
// N lanes are clear and the rest set; it is used with vbit below to splat
// the right-edge padding pixel over the out-of-bounds lanes.
const right_ext_mask_buf
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                      const pixel *src, const int16_t fh[8],
//                                      const int w,
//                                      const enum LrEdgeFlags edges);
// Horizontal 7-tap Wiener filter pass for one row; writes intermediate
// int16_t values for the vertical pass. Processes 8 pixels per iteration.
function wiener_filter_h_8bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]         // r4 = w, r5 = edges
        vld1.16         {q0}, [r3, :128]            // q0 = filter coefficients fh[8]
        movw            r12, #(1 << 14) - (1 << 2)  // rounding bias for the q15 term
        vdup.16         q14, r12
        vmov.s16        q15, #2048

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL: the 3 left-edge pixels are in the main src buffer,
        // just start reading 3 bytes earlier.
        sub             r2,  r2,  #3
        vld1.8          {q2}, [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q2}, [r2]!
        vld1.32         {d3[1]}, [r1]               // load the 4 left[] pixels into the top of q1
        // Move r2 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #3
        vext.8          q2,  q1,  q2,  #13          // prepend the last 3 left pixels to src
        b               2f

1:
        vld1.8          {q2}, [r2]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q2 to have 3x the first byte at the front.
        vdup.8          q1,  d4[0]
        // Move r2 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #3
        vext.8          q2,  q1,  q2,  #13

2:
        // Widen the 16 loaded bytes to int16 in q1-q2.
        vmovl.u8        q1,  d4
        vmovl.u8        q2,  d5

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
        // q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             r12, r4,  #14
        // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrb            r12, [r2, r12]
        sub             r3,  r3,  r4,  lsl #1       // mask offset scaled by 2 (16-bit lanes)
        vdup.16         q13, r12                    // q13 = padding pixel splatted
        vld1.8          {q10, q11}, [r3]            // q10-q11 = right-extension byte mask

        vbit            q1,  q13, q10               // replace invalid lanes with the pad pixel
        vbit            q2,  q13, q11

4:      // Loop horizontally
        // Build the 7 taps via byte-shifts of the q1:q2 window; the filter is
        // applied as symmetric pairs sharing a coefficient (taps 0+6, 1+5, 2+4).
        vext.8          q10, q1,  q2,  #4
        vext.8          q11, q1,  q2,  #8
        vext.8          q9,  q1,  q2,  #2
        vext.8          q12, q1,  q2,  #10
        vext.8          q13, q1,  q2,  #12
        vext.8          q8,  q1,  q2,  #6           // q8 = center tap
        vadd.i16        q10, q10, q11               // taps 2+4
        vadd.i16        q9,  q9,  q12               // taps 1+5
        vadd.i16        q13, q13, q1                // taps 0+6
        vshl.s16        q1,  q8,  #7                // center pixel << 7, for the implicit
                                                    // 128 added to the center coefficient
        vmul.s16        q3,  q8,  d0[3]
        vmla.s16        q3,  q10, d1[0]
        vmla.s16        q3,  q9,  d1[1]
        vmla.s16        q3,  q13, d1[2]

        vsub.s16        q1,  q1,  q14               // apply rounding bias
        vqadd.s16       q3,  q3,  q1                // saturating add of center term
        vshr.s16        q3,  q3,  #3
        vadd.s16        q3,  q3,  q15               // re-center into unsigned intermediate range
        subs            r4,  r4,  #8
        vst1.16         {q3}, [r0, :128]!

        ble             9f
        vmov            q1,  q2                     // slide the window left by 8 pixels
        vld1.8          {d4}, [r2]!
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmovl.u8        q2,  d4
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r5,pc}
endfunc

// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, int16_t **ptrs,
//                                      const int16_t fv[8], const int w);
// Vertical 7-tap Wiener filter pass over 7 intermediate rows (6 pointers;
// the 7th row is implicitly the same as the 6th), producing 8-bit output.
// Processes 16 pixels per iteration, then rotates the row-pointer window.
function wiener_filter_v_8bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q6}

        vld1.16         {q0}, [r2, :128]            // q0 = filter coefficients fv[8]

        // r4-r9 = ptrs[0..5], the 6 input row pointers (row 6 == row 5).
        ldrd            r4,  r5,  [r1]
        ldrd            r6,  r7,  [r1, #8]
        ldrd            r8,  r9,  [r1, #16]

1:
        vld1.16         {q1,  q2},  [r4, :128]!     // row 0
        vld1.16         {q8,  q9},  [r9, :128]!     // row 5 (== row 6)

        vld1.16         {q5,  q6},  [r5, :128]!     // row 1

        vld1.16         {q10, q11}, [r6, :128]!     // row 2
        vld1.16         {q12, q13}, [r8, :128]!     // row 4

        vld1.16         {q14, q15}, [r7, :128]!     // row 3 (center)

        subs            r3,  r3,  #16

        // Symmetric taps: pair up rows sharing a coefficient.
        vadd.i16        q1,  q1,  q8                // rows 0+6
        vadd.i16        q2,  q2,  q9

        vadd.i16        q5,  q5,  q8                // rows 1+5
        vadd.i16        q6,  q6,  q9

        vadd.i16        q10, q10, q12               // rows 2+4
        vadd.i16        q11, q11, q13

        vmull.s16       q3,  d28, d0[3]             // center row * fv[3]
        vmlal.s16       q3,  d2,  d0[0]
        vmlal.s16       q3,  d10, d0[1]
        vmlal.s16       q3,  d20, d0[2]

        vmull.s16       q4,  d29, d0[3]
        vmlal.s16       q4,  d3,  d0[0]
        vmlal.s16       q4,  d11, d0[1]
        vmlal.s16       q4,  d21, d0[2]

        vmull.s16       q8,  d30, d0[3]
        vmlal.s16       q8,  d4,  d0[0]
        vmlal.s16       q8,  d12, d0[1]
        vmlal.s16       q8,  d22, d0[2]

        vmull.s16       q9,  d31, d0[3]
        vmlal.s16       q9,  d5,  d0[0]
        vmlal.s16       q9,  d13, d0[1]
        vmlal.s16       q9,  d23, d0[2]

        // Narrow 32 -> 16 with rounding shift, then 16 -> 8 with saturation.
        vqrshrun.s32    d6,  q3,  #11
        vqrshrun.s32    d7,  q4,  #11
        vqrshrun.s32    d16, q8,  #11
        vqrshrun.s32    d17, q9,  #11
        vqmovun.s16     d6,  q3
        vqmovun.s16     d7,  q8
        vst1.8          {q3}, [r0, :128]!
        bgt             1b

        // Shift the pointers, but only update the first 5; the 6th pointer is
        // kept as it was before (and the 7th is implicitly identical to the
        // 6th).
        ldrd            r4,  r5,  [r1, #4]
        ldrd            r6,  r7,  [r1, #12]
        ldr             r8,  [r1, #20]
        strd            r4,  r5,  [r1]
        strd            r6,  r7,  [r1, #8]
        str             r8,  [r1, #16]

        vpop            {q4-q6}
        pop             {r4-r9,pc}
endfunc

// void dav1d_wiener_filter_hv_8bpc_neon(pixel *dst, const pixel (*left)[4],
//                                       const pixel *src,
//                                       const int16_t filter[2][8],
//                                       const int w,
//                                       const enum LrEdgeFlags edges,
//                                       int16_t **ptrs);
// Fused pass: horizontally filters one new row (stored to ptrs[6]) while
// vertically filtering the 7-row window in ptrs[], writing 8-bit output.
// q0 = horizontal coefficients, q1 = vertical coefficients.
function wiener_filter_hv_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]        // r4 = w, r5 = edges
        ldr             lr,  [sp, #108]             // lr = ptrs
        vld1.16         {q0, q1}, [r3, :128]        // q0 = filter[0] (h), q1 = filter[1] (v)
        movw            r12, #(1 << 14) - (1 << 2)
        vdup.16         q14, r12
        vmov.s16        q15, #2048

        // r6-r12 = ptrs[0..6]; ptrs[6] receives the new horizontally
        // filtered row.
        ldrd            r6,  r7,  [lr]
        ldrd            r8,  r9,  [lr, #8]
        ldrd            r10, r11, [lr, #16]
        ldr             r12, [lr, #24]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL: left-edge pixels are in the main src buffer.
        sub             r2,  r2,  #3
        vld1.8          {q2}, [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q2}, [r2]!
        vld1.32         {d3[1]}, [r1]
        // Move r2 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #3
        vext.8          q2,  q1,  q2,  #13
        b               2f

1:
        vld1.8          {q2}, [r2]!
        // !LR_HAVE_LEFT, fill q3 with the leftmost byte
        // and shift q2 to have 3x the first byte at the front.
        vdup.8          q3,  d4[0]
        // Move r2 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #3
        vext.8          q2,  q3,  q2,  #13

2:
        // Widen to int16; note the row lives in q2-q3 here (not q1-q2 as in
        // wiener_filter_h), since q1 holds the vertical coefficients.
        vmovl.u8        q3,  d5
        vmovl.u8        q2,  d4

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             lr,  r4,  #14
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrb            lr,  [r2, lr]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q13, lr
        vld1.8          {q10, q11}, [r3]

        vbit            q2,  q13, q10
        vbit            q3,  q13, q11

4:      // Loop horizontally
        // Horizontal filter on the q2:q3 window (same scheme as
        // wiener_filter_h), with the vertical-pass row loads interleaved
        // to hide load latency.
        vext.8          q10, q2,  q3,  #4
        vext.8          q11, q2,  q3,  #8
        vext.8          q9,  q2,  q3,  #2
        vext.8          q12, q2,  q3,  #10
        vext.8          q13, q2,  q3,  #12
        vext.8          q8,  q2,  q3,  #6           // center tap
        vadd.i16        q10, q10, q11               // taps 2+4
        vadd.i16        q9,  q9,  q12               // taps 1+5
        vadd.i16        q13, q13, q2                // taps 0+6
        vld1.16         {q6},  [r7, :128]!          // row 1
        vshl.s16        q2,  q8,  #7
        vld1.16         {q11}, [r11, :128]!         // row 5
        vsub.s16        q2,  q2,  q14
        vld1.16         {q7},  [r8, :128]!          // row 2
        vmul.s16        q4,  q8,  d0[3]
        vmla.s16        q4,  q10, d1[0]
        vmla.s16        q4,  q9,  d1[1]
        vmla.s16        q4,  q13, d1[2]

        vld1.16         {q10}, [r10, :128]!         // row 4
        vqadd.s16       q4,  q4,  q2

        vld1.16         {q9},  [r9, :128]!          // row 3 (center)
        vshr.s16        q4,  q4,  #3
        vld1.16         {q5},  [r6, :128]!          // row 0
        vadd.s16        q4,  q4,  q15               // q4 = new horizontally filtered row

        // Vertical symmetric pairs; the freshly computed row q4 acts as
        // row 6, paired with row 0.
        vadd.s16        q6,  q6,  q11               // rows 1+5
        vadd.s16        q7,  q7,  q10               // rows 2+4
        vadd.s16        q5,  q5,  q4                // rows 0+6

        vmull.s16       q8,  d18, d2[3]             // center row * filter[1][3]
        vmlal.s16       q8,  d12, d2[1]
        vmlal.s16       q8,  d14, d2[2]
        vmlal.s16       q8,  d10, d2[0]

        vmull.s16       q9,  d19, d2[3]
        vmlal.s16       q9,  d13, d2[1]
        vmlal.s16       q9,  d15, d2[2]
        vmlal.s16       q9,  d11, d2[0]

        vqrshrun.s32    d16, q8,  #11
        vqrshrun.s32    d17, q9,  #11
        vst1.16         {q4}, [r12, :128]!          // store new intermediate row to ptrs[6]
        vqmovun.s16     d16, q8
        subs            r4,  r4,  #8
        vst1.8          {d16}, [r0, :64]!           // store 8 output pixels

        ble             9f
        vmov            q2,  q3
        vld1.8          {d6}, [r2]!
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmovl.u8        q3,  d6
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        // Reload ptrs from arguments on the stack
        ldr             lr,  [sp, #108]
        // Rotate the window of pointers. Shift the 6 pointers downwards one step.
        ldrd            r6,  r7,  [lr, #4]
        ldrd            r8,  r9,  [lr, #12]
        ldrd            r10, r11, [lr, #20]

        strd            r6,  r7,  [lr]
        strd            r8,  r9,  [lr, #8]
        strd            r10, r11, [lr, #16]
        // The topmost pointer, ptrs[6], which isn't used as input, is set to
        // ptrs[0], which will be used as output for the next _hv call.
        // At the start of the filtering, the caller may set ptrs[6] to the
        // right next buffer to fill in, instead.
        str             r6,  [lr, #24]

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
// Horizontal 3-pixel box sums for SGR: per output position, sum of 3
// neighboring pixels (into sum) and of their squares (into sumsq).
function sgr_box3_row_h_8bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]         // r4 = w, r5 = edges
        add             r4,  r4,  #2 // w += 2

        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r2,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL
        sub             r3,  r3,  #2
        vld1.8          {q0}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0}, [r3]!
        vld1.32         {d3[]},  [r2]
        // Move r3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #2
        vext.8          q0,  q1,  q0,  #14          // prepend last 2 left pixels
        b               2f

1:
        vld1.8          {q0}, [r3]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 2x the first byte at the front.
        vdup.8          q1,  d0[0]
        // Move r3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #2
        vext.8          q0,  q1,  q0,  #14

2:
        // Precompute the squares of the 16 loaded pixels into q1-q2 (u16).
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 2 + 1)
        ldrb            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.8          q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0.b[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r4
        vld1.8          {q13}, [lr]

        vbit            q0,  q14, q13

        // Update the precalculated squares
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

4:      // Loop horizontally
        // sum = x[0] + x[1] + x[2]; sumsq likewise on the squares.
        vext.8          d16, d0,  d1,  #1
        vext.8          d17, d0,  d1,  #2
        vaddl.u8        q3,  d0,  d16
        vext.8          q9,  q1,  q2,  #2
        vaddw.u8        q3,  q3,  d17

        vext.8          q10, q1,  q2,  #4

        vaddl.u16       q12, d2,  d18
        vaddl.u16       q13, d3,  d19
        vaddw.u16       q12, q12, d20
        vaddw.u16       q13, q13, d21

        subs            r4,  r4,  #8
        vst1.16         {q3},       [r1, :128]!
        vst1.32         {q12, q13}, [r0, :128]!

        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r3]!
        vmov            q1,  q2                     // slide squares window
        vext.8          q0,  q0,  q3,  #8           // slide pixel window
        vmull.u8        q2,  d6,  d6                // square the 8 new pixels

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r5,pc}
endfunc

// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
// Horizontal 5-pixel box sums for SGR (values and squares).
function sgr_box5_row_h_8bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]         // r4 = w, r5 = edges
        add             r4,  r4,  #2 // w += 2

        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r2,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL
        sub             r3,  r3,  #3
        vld1.8          {q0}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0}, [r3]!
        vld1.32         {d3[]},  [r2]
        // Move r3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #3
        vext.8          q0,  q1,  q0,  #13          // prepend last 3 left pixels
        b               2f

1:
        vld1.8          {q0}, [r3]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 3x the first byte at the front.
        vdup.8          q1,  d0[0]
        // Move r3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #3
        vext.8          q0,  q1,  q0,  #13

2:
        // Precompute the squares of the 16 loaded pixels into q1-q2 (u16).
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 3 + 1)
        ldrb            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.8          q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -1
        sub             lr,  lr,  r4
        vld1.8          {q13}, [lr]

        vbit            q0,  q14, q13

        // Update the precalculated squares
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

4:      // Loop horizontally
        // sum = x[0] + x[1] + x[2] + x[3] + x[4]; sumsq likewise.
        vext.8          d16, d0,  d1,  #1
        vext.8          d17, d0,  d1,  #2
        vext.8          d18, d0,  d1,  #3
        vext.8          d19, d0,  d1,  #4
        vaddl.u8        q3,  d0,  d16
        vaddl.u8        q12, d17, d18
        vaddw.u8        q3,  q3,  d19
        vadd.u16        q3,  q3,  q12

        vext.8          q8,  q1,  q2,  #2
        vext.8          q9,  q1,  q2,  #4
        vext.8          q10, q1,  q2,  #6
        vext.8          q11, q1,  q2,  #8
        vaddl.u16       q12, d2,  d16
        vaddl.u16       q13, d3,  d17
        vaddl.u16       q8,  d18, d20
        vaddl.u16       q9,  d19, d21
        vaddw.u16       q12, q12, d22
        vaddw.u16       q13, q13, d23
        vadd.i32        q12, q12, q8
        vadd.i32        q13, q13, q9

        subs            r4,  r4,  #8
        vst1.16         {q3},       [r1, :128]!
        vst1.32         {q12, q13}, [r0, :128]!

        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r3]!
        vmov            q1,  q2
        vext.8          q0,  q0,  q3,  #8
        vmull.u8        q2,  d6,  d6
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r5,pc}
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
// Fused 3- and 5-pixel horizontal box sums (both SGR filter sizes in one
// pass over the row); the 3-sums are byproducts of building the 5-sums.
function sgr_box35_row_h_8bpc_neon, export=1
        push            {r4-r7,lr}
        ldrd            r4,  r5,  [sp, #20]         // r4 = left, r5 = src
        ldrd            r6,  r7,  [sp, #28]         // r6 = w, r7 = edges
        add             r6,  r6,  #2 // w += 2

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r4,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL
        sub             r5,  r5,  #3
        vld1.8          {q0}, [r5]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0}, [r5]!
        vld1.32         {d3[]},  [r4]
        // Move r5 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r5,  r5,  #3
        vext.8          q0,  q1,  q0,  #13
        b               2f

1:
        vld1.8          {q0}, [r5]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 3x the first byte at the front.
        vdup.8          q1,  d0[0]
        // Move r5 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r5,  r5,  #3
        vext.8          q0,  q1,  q0,  #13

2:
        // Precompute the squares of the 16 loaded pixels into q1-q2 (u16).
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r6,  #(2 + 16 - 3 + 1)
        ldrb            lr,  [r5, lr]
        // Fill q14 with the right padding pixel
        vdup.8          q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r6,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -1
        sub             lr,  lr,  r6
        vld1.8          {q13}, [lr]

        vbit            q0,  q14, q13

        // Update the precalculated squares
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

4:      // Loop horizontally
        // 3-sum = x[1] + x[2] + x[3]; 5-sum = 3-sum + x[0] + x[4].
        vext.8          d16, d0,  d1,  #1
        vext.8          d17, d0,  d1,  #2
        vext.8          d18, d0,  d1,  #3
        vext.8          d19, d0,  d1,  #4
        vaddl.u8        q3,  d16, d17
        vaddl.u8        q12, d0,  d19               // outer pair for the 5-sum
        vaddw.u8        q3,  q3,  d18               // q3 = 3-pixel sum

        vext.8          q8,  q1,  q2,  #2
        vext.8          q9,  q1,  q2,  #4
        vext.8          q10, q1,  q2,  #6
        vext.8          q11, q1,  q2,  #8

        vst1.16         {q3}, [r1, :128]!           // store sum3
        vadd.u16        q3,  q3,  q12               // extend to the 5-pixel sum

        vaddl.u16       q12, d16, d18
        vaddl.u16       q13, d17, d19
        vaddl.u16       q8,  d2,  d22               // outer pair of squares
        vaddl.u16       q9,  d3,  d23
        vaddw.u16       q12, q12, d20               // q12-q13 = 3-pixel sumsq
        vaddw.u16       q13, q13, d21

        vst1.32         {q12, q13}, [r0, :128]!     // store sumsq3
        vadd.i32        q12, q12, q8                // extend to the 5-pixel sumsq
        vadd.i32        q13, q13, q9

        subs            r6,  r6,  #8
        vst1.16         {q3},       [r3, :128]!     // store sum5
        vst1.32         {q12, q13}, [r2, :128]!     // store sumsq5

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r5]!
        vmov            q1,  q2
        vext.8          q0,  q0,  q3,  #8
        vmull.u8        q2,  d6,  d6
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r7,pc}
endfunc

sgr_funcs 8