looprestoration16.S
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                       const pixel *src, const int16_t fh[8],
//                                       const int w,
//                                       enum LrEdgeFlags edges,
//                                       const int bitdepth_max);
function wiener_filter_h_16bpc_neon, export=1
        push            {r4-r6,lr}
        ldrd            r4,  r5,  [sp, #16]
        ldr             r6,  [sp, #24]   // bitdepth_max
        vld1.16         {q0},  [r3, :128]
        clz             r6,  r6
        vmov.i32        q14, #1
        sub             r12, r6,  #38    // -(bitdepth + 6)
        sub             r6,  r6,  #25    // -round_bits_h
        neg             r12, r12         // bitdepth + 6
        vdup.32         q1,  r12
        vdup.32         q13, r6          // -round_bits_h
        vmov.i16        q15, #8192
        vshl.u32        q14, q14, q1     // 1 << (bitdepth + 6)
        vmvn.i16        q12, #0x8000     // 0x7fff = (1 << 15) - 1

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #6
        vld1.16         {q2, q3},  [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {q2, q3},  [r2]!
        vld1.16         {d3},  [r1]!
        // Move r2 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        b               2f
1:
        vld1.16         {q2, q3},  [r2]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q1,  d4[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
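        // E.g. with input pixels p0,p1,...: q1 = {p0 x8}, and the vext
        // pair below leaves q2 = {p0,p0,p0,p0,p1,p2,p3,p4} and
        // q3 = {p5,...,p12}, i.e. three padding copies of p0 prepended
        // to the row.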
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             r12, r4,  #14
        lsl             r12, r12, #1
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrh            r12, [r2, r12]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q11, r12
        vld1.8          {q9, q10},  [r3]

        vbit            q2,  q11, q9
        vbit            q3,  q11, q10

4:      // Loop horizontally
        vext.8          q9,  q2,  q3,  #4
        vext.8          q10, q2,  q3,  #8
        vext.8          q8,  q2,  q3,  #2
        vext.8          q11, q2,  q3,  #10
        vadd.i16        q10, q10, q9
        vadd.i16        q11, q11, q8
        vext.8          q8,  q2,  q3,  #12
        vext.8          q9,  q2,  q3,  #6
        vadd.i16        q2,  q2,  q8
        vmull.s16       q8,  d18, d0[3]
        vmlal.s16       q8,  d20, d1[0]
        vmlal.s16       q8,  d22, d1[1]
        vmlal.s16       q8,  d4,  d1[2]
        vmull.s16       q9,  d19, d0[3]
        vmlal.s16       q9,  d21, d1[0]
        vmlal.s16       q9,  d23, d1[1]
        vmlal.s16       q9,  d5,  d1[2]

        vadd.i32        q8,  q8,  q14
        vadd.i32        q9,  q9,  q14
        vrshl.s32       q8,  q8,  q13
        vrshl.s32       q9,  q9,  q13
        vqmovun.s32     d16, q8
        vqmovun.s32     d17, q9
        vmin.u16        q8,  q8,  q12
        vsub.i16        q8,  q8,  q15
        subs            r4,  r4,  #8
        vst1.16         {q8},  [r0, :128]!

        ble             9f
        vmov            q2,  q3
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.16         {q3},  [r2]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r6,pc}
endfunc

// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, int16_t **ptrs,
//                                       const int16_t fv[8], const int w,
//                                       const int bitdepth_max);
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q7}

        ldr             lr,  [sp, #92]   // bitdepth_max
        vld1.16         {q0},  [r2, :128]
        vdup.16         q2,  lr
        clz             lr,  lr
        sub             lr,  lr,  #11    // round_bits_v

        vdup.32         q1,  lr

        ldrd            r4,  r5,  [r1]
        ldrd            r6,  r7,  [r1, #8]
        ldrd            r8,  r9,  [r1, #16]

        vneg.s32        q1,  q1          // -round_bits_v

1:
        vld1.16         {q4,  q5},  [r4, :128]!
        vld1.16         {q6,  q7},  [r5, :128]!
        vld1.16         {q8,  q9},  [r6, :128]!
        vld1.16         {q10, q11}, [r7, :128]!
        vld1.16         {q12, q13}, [r8, :128]!
        vld1.16         {q14, q15}, [r9, :128]!
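        // Below: the 7-tap vertical filter. The taps d1[1] and d1[2] both
        // multiply the last loaded row (d28-d31): ptrs[5] and ptrs[6]
        // always point at identical data, so the 6th and 7th rows
        // coincide (see the pointer shift at the end of the function).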

        subs            r3,  r3,  #16

        vmull.s16       q3,  d8,  d0[0]
        vmlal.s16       q3,  d12, d0[1]
        vmlal.s16       q3,  d16, d0[2]
        vmlal.s16       q3,  d20, d0[3]
        vmlal.s16       q3,  d24, d1[0]
        vmlal.s16       q3,  d28, d1[1]
        vmlal.s16       q3,  d28, d1[2]
        vmull.s16       q4,  d9,  d0[0]
        vmlal.s16       q4,  d13, d0[1]
        vmlal.s16       q4,  d17, d0[2]
        vmlal.s16       q4,  d21, d0[3]
        vmlal.s16       q4,  d25, d1[0]
        vmlal.s16       q4,  d29, d1[1]
        vmlal.s16       q4,  d29, d1[2]

        vmull.s16       q6,  d10, d0[0]
        vmlal.s16       q6,  d14, d0[1]
        vmlal.s16       q6,  d18, d0[2]
        vmlal.s16       q6,  d22, d0[3]
        vmlal.s16       q6,  d26, d1[0]
        vmlal.s16       q6,  d30, d1[1]
        vmlal.s16       q6,  d30, d1[2]
        vmull.s16       q5,  d11, d0[0]
        vmlal.s16       q5,  d15, d0[1]
        vmlal.s16       q5,  d19, d0[2]
        vmlal.s16       q5,  d23, d0[3]
        vmlal.s16       q5,  d27, d1[0]
        vmlal.s16       q5,  d31, d1[1]
        vmlal.s16       q5,  d31, d1[2]

        vrshl.s32       q3,  q3,  q1 // round_bits_v
        vrshl.s32       q4,  q4,  q1
        vrshl.s32       q6,  q6,  q1
        vrshl.s32       q5,  q5,  q1
        vqmovun.s32     d6,  q3
        vqmovun.s32     d7,  q4
        vqmovun.s32     d8,  q6
        vqmovun.s32     d9,  q5
        vmin.u16        q3,  q3,  q2 // bitdepth_max
        vmin.u16        q4,  q4,  q2
        vst1.16         {q3, q4},  [r0, :128]!
        bgt             1b

        // Shift the pointers, but only update the first 5; the 6th pointer is
        // kept as it was before (and the 7th is implicitly identical to the
        // 6th).
        ldrd            r4,  r5,  [r1, #4]
        ldrd            r6,  r7,  [r1, #12]
        ldr             r8,  [r1, #20]
        strd            r4,  r5,  [r1]
        strd            r6,  r7,  [r1, #8]
        str             r8,  [r1, #16]

        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc

// void dav1d_wiener_filter_hv_16bpc_neon(pixel *dst, const pixel (*left)[4],
//                                        const pixel *src,
//                                        const int16_t filter[2][8],
//                                        const int w,
//                                        const enum LrEdgeFlags edges,
//                                        int16_t **ptrs,
//                                        const int bitdepth_max);
function wiener_filter_hv_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldrd            r6,  r7,  [sp, #108]
        vld1.16         {q0, q1},  [r3, :128]
        vdup.16         q11, r7          // bitdepth_max
        clz             r7,  r7
        vmov.i32        q14, #1
        sub             r12, r7,  #38    // -(bitdepth + 6)
        sub             lr,  r7,  #11    // round_bits_v
        sub             r7,  r7,  #25    // -round_bits_h
        neg             r12, r12         // bitdepth + 6
        vdup.32         q2,  r12
        vdup.32         q13, r7          // -round_bits_h
        vdup.32         q10, lr          // round_bits_v
        mov             lr,  r6
        vmov.i16        q15, #8192
        vshl.u32        q14, q14, q2     // 1 << (bitdepth + 6)
        vneg.s32        q10, q10         // -round_bits_v

        ldrd            r6,  r7,  [lr]
        ldrd            r8,  r9,  [lr, #8]
        ldrd            r10, r11, [lr, #16]
        ldr             r12, [lr, #24]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #6
        vld1.16         {q2, q3},  [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {q2, q3},  [r2]!
        vld1.16         {d9},  [r1]!
        // Move r2 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q4,  q2,  #10
        b               2f
1:
        vld1.16         {q2, q3},  [r2]!
        // !LR_HAVE_LEFT, fill q4 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q4,  d4[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
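        // (q4 serves as the padding scratch register here; unlike in
        // wiener_filter_h, q1 can't be used, since q0/q1 hold the
        // horizontal and vertical filter coefficients.)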
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q4,  q2,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             lr,  r4,  #14
        lsl             lr,  lr,  #1
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrh            lr,  [r2, lr]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q4,  lr
        vld1.8          {q8, q9},  [r3]

        vbit            q2,  q4,  q8
        vbit            q3,  q4,  q9

4:      // Loop horizontally
        vext.8          q5,  q2,  q3,  #4
        vext.8          q6,  q2,  q3,  #8
        vext.8          q4,  q2,  q3,  #2
        vext.8          q7,  q2,  q3,  #10
        vadd.i16        q6,  q6,  q5
        vadd.i16        q7,  q7,  q4
        vext.8          q4,  q2,  q3,  #12
        vext.8          q5,  q2,  q3,  #6
        vadd.i16        q2,  q2,  q4
        vld1.16         {q4},  [r6, :128]!
        vmull.s16       q8,  d10, d0[3]
        vmlal.s16       q8,  d12, d1[0]
        vmlal.s16       q8,  d14, d1[1]
        vmlal.s16       q8,  d4,  d1[2]
        vmull.s16       q9,  d11, d0[3]
        vmlal.s16       q9,  d13, d1[0]
        vmlal.s16       q9,  d15, d1[1]
        vmlal.s16       q9,  d5,  d1[2]
        vld1.16         {q5},  [r7, :128]!

        vmvn.i16        q12, #0x8000 // 0x7fff = (1 << 15) - 1

        vadd.i32        q8,  q8,  q14
        vadd.i32        q9,  q9,  q14
        vld1.16         {q6},  [r8, :128]!
        vrshl.s32       q8,  q8,  q13
        vrshl.s32       q9,  q9,  q13
        vqmovun.s32     d16, q8
        vqmovun.s32     d17, q9
        vld1.16         {q7},  [r9, :128]!
        vmin.u16        q8,  q8,  q12
        vld1.16         {q9},  [r10, :128]!
        vsub.i16        q8,  q8,  q15

        vld1.16         {q2},  [r11, :128]!

        vmull.s16       q12, d8,  d2[0]
        vmlal.s16       q12, d10, d2[1]
        vmlal.s16       q12, d12, d2[2]
        vmlal.s16       q12, d14, d2[3]
        vmlal.s16       q12, d18, d3[0]
        vmlal.s16       q12, d4,  d3[1]
        vmlal.s16       q12, d16, d3[2]
        vmull.s16       q4,  d9,  d2[0]
        vmlal.s16       q4,  d11, d2[1]
        vmlal.s16       q4,  d13, d2[2]
        vmlal.s16       q4,  d15, d2[3]
        vmlal.s16       q4,  d19, d3[0]
        vmlal.s16       q4,  d5,  d3[1]
        vmlal.s16       q4,  d17, d3[2]

        vrshl.s32       q12, q12, q10 // round_bits_v
        vrshl.s32       q4,  q4,  q10
        vqmovun.s32     d24, q12
        vqmovun.s32     d25, q4
        vst1.16         {q8},  [r12, :128]!
        vmin.u16        q12, q12, q11 // bitdepth_max
        subs            r4,  r4,  #8
        vst1.16         {q12},  [r0, :128]!

        ble             9f
        vmov            q2,  q3
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.16         {q3},  [r2]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        // Reload ptrs from arguments on the stack
        ldr             lr,  [sp, #108]
        // Rotate the window of pointers. Shift the 6 pointers downwards one step.
        ldrd            r6,  r7,  [lr, #4]
        ldrd            r8,  r9,  [lr, #12]
        ldrd            r10, r11, [lr, #20]

        strd            r6,  r7,  [lr]
        strd            r8,  r9,  [lr, #8]
        strd            r10, r11, [lr, #16]
        // The topmost pointer, ptrs[6], which isn't used as input, is set to
        // ptrs[0], which will be used as output for the next _hv call.
        // At the start of the filtering, the caller may set ptrs[6] to the
        // right next buffer to fill in, instead.
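        // E.g. a window of row buffers {A,B,C,D,E,F,G} becomes
        // {B,C,D,E,F,G,B}: ptrs[6] then aliases the new ptrs[0], whose
        // contents the next call reads (as row 0) before overwriting them.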
        str             r6,  [lr, #24]

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box3_row_h_16bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]
        add             r4,  r4,  #2 // w += 2

        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r2,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL
        sub             r3,  r3,  #4
        vld1.8          {q0, q1},  [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1},  [r3]!
        vld1.16         {d5},  [r2]
        // Move r3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        b               2f

1:
        vld1.8          {q0, q1},  [r3]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 2x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 2 + 1)
        lsl             lr,  lr,  #1
        ldrh            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
        // again; it's not strictly needed in that case (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0.h[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r4,  lsl #1
        vld1.8          {q12, q13},  [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13

4:      // Loop horizontally
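        // For each output x: sum[x] = px[x] + px[x+1] + px[x+2] and
        // sumsq[x] = px[x]^2 + px[x+1]^2 + px[x+2]^2, where q8/q9 below
        // hold the input row shifted by 1 and 2 pixels.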
        vext.8          q8,  q0,  q1,  #2
        vext.8          q9,  q0,  q1,  #4

        vadd.i16        q2,  q0,  q8
        vmull.u16       q12, d0,  d0
        vmlal.u16       q12, d16, d16
        vmlal.u16       q12, d18, d18
        vadd.i16        q2,  q2,  q9
        vmull.u16       q13, d1,  d1
        vmlal.u16       q13, d17, d17
        vmlal.u16       q13, d19, d19
        subs            r4,  r4,  #8
        vst1.16         {q2},  [r1, :128]!
        vst1.32         {q12, q13},  [r0, :128]!

        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1},  [r3]!

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r5,pc}
endfunc

// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box5_row_h_16bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]
        add             r4,  r4,  #2 // w += 2

        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r2,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL
        sub             r3,  r3,  #6
        vld1.8          {q0, q1},  [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1},  [r3]!
        vld1.16         {d5},  [r2]
        // Move r3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        b               2f

1:
        vld1.8          {q0, q1},  [r3]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1
        ldrh            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r4,  lsl #1
        vld1.8          {q12, q13},  [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13

4:      // Loop horizontally
        vext.8          q8,  q0,  q1,  #2
        vext.8          q9,  q0,  q1,  #4

        vadd.i16        q2,  q0,  q8
        vmull.u16       q12, d0,  d0
        vmlal.u16       q12, d16, d16
        vmlal.u16       q12, d18, d18
        vadd.i16        q2,  q2,  q9
        vmull.u16       q13, d1,  d1
        vmlal.u16       q13, d17, d17
        vmlal.u16       q13, d19, d19

        vext.8          q8,  q0,  q1,  #6
        vext.8          q9,  q0,  q1,  #8

        vadd.i16        q2,  q2,  q8
        vmlal.u16       q12, d16, d16
        vmlal.u16       q12, d18, d18
        vadd.i16        q2,  q2,  q9
        vmlal.u16       q13, d17, d17
        vmlal.u16       q13, d19, d19

        subs            r4,  r4,  #8
        vst1.16         {q2},  [r1, :128]!
        vst1.32         {q12, q13},  [r0, :128]!

        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1},  [r3]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r5,pc}
endfunc

// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                       int32_t *sumsq5, int16_t *sum5,
//                                       const pixel (*left)[4],
//                                       const pixel *src, const int w,
//                                       const enum LrEdgeFlags edges);
function sgr_box35_row_h_16bpc_neon, export=1
        push            {r4-r7,lr}
        ldrd            r4,  r5,  [sp, #20]
        ldrd            r6,  r7,  [sp, #28]
        add             r6,  r6,  #2 // w += 2

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r4,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL
        sub             r5,  r5,  #6
        vld1.8          {q0, q1},  [r5]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1},  [r5]!
        vld1.16         {d5},  [r4]
        // Move r5 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r5,  r5,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        b               2f

1:
        vld1.8          {q0, q1},  [r5]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r5 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r5,  r5,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r6,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1
        ldrh            lr,  [r5, lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r6,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r6,  lsl #1
        vld1.8          {q12, q13},  [lr]

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13

4:      // Loop horizontally
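        // Compute the 3-pixel box sums (over the px[1]..px[3] window)
        // first and store them as the box3 output, then add px[0] and
        // px[4] to the same accumulators to produce the box5 sums.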
        vext.8          q8,  q0,  q1,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q10, q0,  q1,  #6
        vext.8          q11, q0,  q1,  #8

        vadd.i16        q2,  q8,  q9
        vadd.i16        q3,  q0,  q11
        vadd.i16        q2,  q2,  q10

        vmull.u16       q12, d16, d16
        vmlal.u16       q12, d18, d18
        vmlal.u16       q12, d20, d20
        vmull.u16       q13, d17, d17
        vmlal.u16       q13, d19, d19
        vmlal.u16       q13, d21, d21

        vadd.i16        q3,  q3,  q2
        vst1.16         {q2},  [r1, :128]!
        vst1.32         {q12, q13},  [r0, :128]!

        vmlal.u16       q12, d0,  d0
        vmlal.u16       q12, d22, d22
        vmlal.u16       q13, d1,  d1
        vmlal.u16       q13, d23, d23

        subs            r6,  r6,  #8
        vst1.16         {q3},  [r3, :128]!
        vst1.32         {q12, q13},  [r2, :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1},  [r5]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r7,pc}
endfunc

sgr_funcs 16