vp9mc_neon.S (25221B)
1 /* 2 * Copyright (c) 2016 Google Inc. 3 * 4 * This file is part of FFmpeg. 5 * 6 * FFmpeg is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * FFmpeg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with FFmpeg; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 #include "libavutil/arm/asm.S" 22 23 @ All public functions in this file have the following signature: 24 @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride, 25 @ const uint8_t *ref, ptrdiff_t ref_stride, 26 @ int h, int mx, int my); 27 28 function ff_vp9_copy64_neon, export=1 29 ldr r12, [sp] 30 sub r1, r1, #32 31 sub r3, r3, #32 32 1: 33 vld1.8 {q0, q1}, [r2]! 34 vst1.8 {q0, q1}, [r0, :128]! 35 vld1.8 {q2, q3}, [r2], r3 36 subs r12, r12, #1 37 vst1.8 {q2, q3}, [r0, :128], r1 38 bne 1b 39 bx lr 40 endfunc 41 42 function ff_vp9_avg64_neon, export=1 43 push {lr} 44 ldr r12, [sp, #4] 45 sub r1, r1, #32 46 sub r3, r3, #32 47 mov lr, r0 48 1: 49 vld1.8 {q8, q9}, [r2]! 50 vld1.8 {q0, q1}, [r0, :128]! 51 vld1.8 {q10, q11}, [r2], r3 52 vrhadd.u8 q0, q0, q8 53 vld1.8 {q2, q3}, [r0, :128], r1 54 vrhadd.u8 q1, q1, q9 55 vrhadd.u8 q2, q2, q10 56 vst1.8 {q0, q1}, [lr, :128]! 57 vrhadd.u8 q3, q3, q11 58 vst1.8 {q2, q3}, [lr, :128], r1 59 subs r12, r12, #1 60 bne 1b 61 pop {pc} 62 endfunc 63 64 function ff_vp9_copy32_neon, export=1 65 ldr r12, [sp] 66 1: 67 vld1.8 {q0, q1}, [r2], r3 68 subs r12, r12, #1 69 vst1.8 {q0, q1}, [r0, :128], r1 70 bne 1b 71 bx lr 72 endfunc 73 74 function ff_vp9_avg32_neon, export=1 75 ldr r12, [sp] 76 1: 77 vld1.8 {q2, q3}, [r2], r3 78 vld1.8 {q0, q1}, [r0, :128] 79 vrhadd.u8 q0, q0, q2 80 vrhadd.u8 q1, q1, q3 81 subs r12, r12, #1 82 vst1.8 {q0, q1}, [r0, :128], r1 83 bne 1b 84 bx lr 85 endfunc 86 87 function ff_vp9_copy16_neon, export=1 88 push {r4,lr} 89 ldr r12, [sp, #8] 90 add r4, r0, r1 91 add lr, r2, r3 92 add r1, r1, r1 93 add r3, r3, r3 94 1: 95 vld1.8 {q0}, [r2], r3 96 vld1.8 {q1}, [lr], r3 97 subs r12, r12, #2 98 vst1.8 {q0}, [r0, :128], r1 99 vst1.8 {q1}, [r4, :128], r1 100 bne 1b 101 pop {r4,pc} 102 endfunc 103 104 function ff_vp9_avg16_neon, export=1 105 push {lr} 106 ldr r12, [sp, #4] 107 mov lr, r0 108 1: 109 vld1.8 {q2}, [r2], r3 110 vld1.8 {q0}, [r0, :128], r1 111 vld1.8 {q3}, [r2], r3 112 vrhadd.u8 q0, q0, q2 113 vld1.8 {q1}, [r0, :128], r1 114 vrhadd.u8 q1, q1, q3 115 subs r12, r12, #2 116 vst1.8 {q0}, [lr, :128], r1 117 vst1.8 {q1}, [lr, :128], r1 118 bne 1b 119 pop {pc} 120 endfunc 121 122 function ff_vp9_copy8_neon, export=1 123 ldr r12, [sp] 124 1: 125 vld1.8 {d0}, [r2], r3 126 vld1.8 {d1}, [r2], r3 127 subs r12, r12, #2 128 vst1.8 {d0}, [r0, :64], r1 129 vst1.8 {d1}, [r0, :64], r1 130 bne 1b 131 bx lr 132 endfunc 133 134 function ff_vp9_avg8_neon, export=1 135 ldr r12, [sp] 136 1: 137 vld1.8 {d2}, [r2], r3 138 vld1.8 {d0}, [r0, :64], r1 139 vld1.8 {d3}, [r2], r3 140 vrhadd.u8 d0, d0, d2 141 vld1.8 {d1}, [r0, :64] 142 sub r0, r0, r1 143 vrhadd.u8 d1, d1, d3 144 subs r12, r12, #2 145 vst1.8 {d0}, [r0, :64], r1 146 vst1.8 {d1}, [r0, :64], r1 147 bne 1b 148 bx lr 149 endfunc 150 151 function ff_vp9_copy4_neon, export=1 152 ldr r12, [sp] 153 1: 154 vld1.32 {d0[]}, [r2], r3 155 vld1.32 {d1[]}, [r2], r3 156 vst1.32 {d0[0]}, [r0, :32], r1 157 vld1.32 {d2[]}, [r2], r3 158 vst1.32 {d1[0]}, [r0, :32], r1 159 vld1.32 {d3[]}, [r2], r3 160 subs r12, r12, #4 161 vst1.32 {d2[0]}, [r0, :32], r1 162 vst1.32 {d3[0]}, [r0, :32], r1 163 bne 1b 164 bx lr 165 endfunc 166 167 function ff_vp9_avg4_neon, export=1 168 push {lr} 169 ldr r12, [sp, #4] 170 mov lr, r0 171 1: 172 vld1.32 {d4[]}, [r2], r3 173 vld1.32 {d0[]}, [r0, :32], r1 174 vld1.32 {d5[]}, [r2], r3 175 vrhadd.u8 d0, d0, d4 176 vld1.32 {d1[]}, [r0, :32], r1 177 vld1.32 {d6[]}, [r2], r3 178 vrhadd.u8 d1, d1, d5 179 vld1.32 {d2[]}, [r0, :32], r1 180 vld1.32 {d7[]}, [r2], r3 181 vrhadd.u8 d2, d2, d6 182 vld1.32 {d3[]}, [r0, :32], r1 183 subs r12, r12, #4 184 vst1.32 {d0[0]}, [lr, :32], r1 185 vrhadd.u8 d3, d3, d7 186 vst1.32 {d1[0]}, [lr, :32], r1 187 vst1.32 {d2[0]}, [lr, :32], r1 188 vst1.32 {d3[0]}, [lr, :32], r1 189 bne 1b 190 pop {pc} 191 endfunc 192 193 @ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index 194 .macro vmul_lane dst, src, idx 195 .if \idx < 4 196 vmul.s16 \dst, \src, d0[\idx] 197 .else 198 vmul.s16 \dst, \src, d1[\idx - 4] 199 .endif 200 .endm 201 .macro vmla_lane dst, src, idx 202 .if \idx < 4 203 vmla.s16 \dst, \src, d0[\idx] 204 .else 205 vmla.s16 \dst, \src, d1[\idx - 4] 206 .endif 207 .endm 208 209 @ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 210 @ for size >= 16), and multiply-accumulate into dst1 and dst3 (or 211 @ dst1-dst2 and dst3-dst4 for size >= 16) 212 .macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size 213 vext.8 q14, \src1, \src2, #(2*\offset) 214 vext.8 q15, \src4, \src5, #(2*\offset) 215 .if \size >= 16 216 vmla_lane \dst1, q14, \offset 217 vext.8 q5, \src2, \src3, #(2*\offset) 218 vmla_lane \dst3, q15, \offset 219 vext.8 q6, \src5, \src6, #(2*\offset) 220 vmla_lane \dst2, q5, \offset 221 vmla_lane \dst4, q6, \offset 222 .elseif \size == 8 223 vmla_lane \dst1, q14, \offset 224 vmla_lane \dst3, q15, \offset 225 .else 226 vmla_lane \dst1d, d28, \offset 227 vmla_lane \dst3d, d30, \offset 228 .endif 229 .endm 230 @ The same as above, but don't accumulate straight into the 231 @ destination, but use a temp register and accumulate with saturation. 232 .macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size 233 vext.8 q14, \src1, \src2, #(2*\offset) 234 vext.8 q15, \src4, \src5, #(2*\offset) 235 .if \size >= 16 236 vmul_lane q14, q14, \offset 237 vext.8 q5, \src2, \src3, #(2*\offset) 238 vmul_lane q15, q15, \offset 239 vext.8 q6, \src5, \src6, #(2*\offset) 240 vmul_lane q5, q5, \offset 241 vmul_lane q6, q6, \offset 242 .elseif \size == 8 243 vmul_lane q14, q14, \offset 244 vmul_lane q15, q15, \offset 245 .else 246 vmul_lane d28, d28, \offset 247 vmul_lane d30, d30, \offset 248 .endif 249 .if \size == 4 250 vqadd.s16 \dst1d, \dst1d, d28 251 vqadd.s16 \dst3d, \dst3d, d30 252 .else 253 vqadd.s16 \dst1, \dst1, q14 254 vqadd.s16 \dst3, \dst3, q15 255 .if \size >= 16 256 vqadd.s16 \dst2, \dst2, q5 257 vqadd.s16 \dst4, \dst4, q6 258 .endif 259 .endif 260 .endm 261 262 263 @ Instantiate a horizontal filter function for the given size. 264 @ This can work on 4, 8 or 16 pixels in parallel; for larger 265 @ widths it will do 16 pixels at a time and loop horizontally. 266 @ The actual width is passed in r5, the height in r4 and 267 @ the filter coefficients in r12. idx2 is the index of the largest 268 @ filter coefficient (3 or 4) and idx1 is the other one of them. 269 .macro do_8tap_h type, size, idx1, idx2 270 function \type\()_8tap_\size\()h_\idx1\idx2 271 sub r2, r2, #3 272 add r6, r0, r1 273 add r7, r2, r3 274 add r1, r1, r1 275 add r3, r3, r3 276 @ Only size >= 16 loops horizontally and needs 277 @ reduced dst stride 278 .if \size >= 16 279 sub r1, r1, r5 280 .endif 281 @ size >= 16 loads two qwords and increments r2, 282 @ size 4 loads 1 d word, increments r2 and loads 1 32-bit lane 283 @ for size 8 it's enough with one qword and no postincrement 284 .if \size >= 16 285 sub r3, r3, r5 286 sub r3, r3, #8 287 .elseif \size == 4 288 sub r3, r3, #8 289 .endif 290 @ Load the filter vector 291 vld1.16 {q0}, [r12,:128] 292 1: 293 .if \size >= 16 294 mov r12, r5 295 .endif 296 @ Load src 297 .if \size >= 16 298 vld1.8 {d18, d19, d20}, [r2]! 299 vld1.8 {d24, d25, d26}, [r7]! 300 .elseif \size == 8 301 vld1.8 {q9}, [r2] 302 vld1.8 {q12}, [r7] 303 .else @ size == 4 304 vld1.8 {d18}, [r2]! 305 vld1.8 {d24}, [r7]! 306 vld1.32 {d19[0]}, [r2] 307 vld1.32 {d25[0]}, [r7] 308 .endif 309 vmovl.u8 q8, d18 310 vmovl.u8 q9, d19 311 vmovl.u8 q11, d24 312 vmovl.u8 q12, d25 313 .if \size >= 16 314 vmovl.u8 q10, d20 315 vmovl.u8 q13, d26 316 .endif 317 2: 318 319 @ Accumulate, adding idx2 last with a separate 320 @ saturating add. The positive filter coefficients 321 @ for all indices except idx2 must add up to less 322 @ than 127 for this not to overflow. 323 vmul.s16 q1, q8, d0[0] 324 vmul.s16 q3, q11, d0[0] 325 .if \size >= 16 326 vmul.s16 q2, q9, d0[0] 327 vmul.s16 q4, q12, d0[0] 328 .endif 329 extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 1, \size 330 extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, \size 331 extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx1, \size 332 extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 5, \size 333 extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 6, \size 334 extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 7, \size 335 extmulqadd q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx2, \size 336 337 @ Round, shift and saturate 338 vqrshrun.s16 d2, q1, #7 339 vqrshrun.s16 d6, q3, #7 340 .if \size >= 16 341 vqrshrun.s16 d3, q2, #7 342 vqrshrun.s16 d7, q4, #7 343 .endif 344 @ Average 345 .ifc \type,avg 346 .if \size >= 16 347 vld1.8 {q14}, [r0,:128] 348 vld1.8 {q15}, [r6,:128] 349 vrhadd.u8 q1, q1, q14 350 vrhadd.u8 q3, q3, q15 351 .elseif \size == 8 352 vld1.8 {d28}, [r0,:64] 353 vld1.8 {d30}, [r6,:64] 354 vrhadd.u8 d2, d2, d28 355 vrhadd.u8 d6, d6, d30 356 .else 357 @ We only need d28[0], but [] is faster on some cores 358 vld1.32 {d28[]}, [r0,:32] 359 vld1.32 {d30[]}, [r6,:32] 360 vrhadd.u8 d2, d2, d28 361 vrhadd.u8 d6, d6, d30 362 .endif 363 .endif 364 @ Store and loop horizontally (for size >= 16) 365 .if \size >= 16 366 subs r12, r12, #16 367 vst1.8 {q1}, [r0,:128]! 368 vst1.8 {q3}, [r6,:128]! 369 beq 3f 370 vmov q8, q10 371 vmov q11, q13 372 vld1.8 {q10}, [r2]! 373 vld1.8 {q13}, [r7]! 374 vmovl.u8 q9, d20 375 vmovl.u8 q10, d21 376 vmovl.u8 q12, d26 377 vmovl.u8 q13, d27 378 b 2b 379 .elseif \size == 8 380 vst1.8 {d2}, [r0,:64] 381 vst1.8 {d6}, [r6,:64] 382 .else @ \size == 4 383 vst1.32 {d2[0]}, [r0,:32] 384 vst1.32 {d6[0]}, [r6,:32] 385 .endif 386 3: 387 @ Loop vertically 388 add r0, r0, r1 389 add r6, r6, r1 390 add r2, r2, r3 391 add r7, r7, r3 392 subs r4, r4, #2 393 bne 1b 394 .if \size >= 16 395 vpop {q4-q6} 396 .endif 397 pop {r4-r7} 398 bx lr 399 endfunc 400 .endm 401 402 .macro do_8tap_h_size size 403 do_8tap_h put, \size, 3, 4 404 do_8tap_h avg, \size, 3, 4 405 do_8tap_h put, \size, 4, 3 406 do_8tap_h avg, \size, 4, 3 407 .endm 408 409 do_8tap_h_size 4 410 do_8tap_h_size 8 411 do_8tap_h_size 16 412 413 .macro do_8tap_h_func type, filter, offset, size 414 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1 415 push {r4-r7} 416 .if \size >= 16 417 vpush {q4-q6} 418 ldr r4, [sp, #64] 419 ldr r5, [sp, #68] 420 .else 421 ldr r4, [sp, #16] 422 ldr r5, [sp, #20] 423 .endif 424 movrelx r12, X(ff_vp9_subpel_filters), r6 425 add r12, r12, 256*\offset 426 cmp r5, #8 427 add r12, r12, r5, lsl #4 428 mov r5, #\size 429 .if \size >= 16 430 bge \type\()_8tap_16h_34 431 b \type\()_8tap_16h_43 432 .else 433 bge \type\()_8tap_\size\()h_34 434 b \type\()_8tap_\size\()h_43 435 .endif 436 endfunc 437 .endm 438 439 .macro do_8tap_h_filters size 440 do_8tap_h_func put, regular, 1, \size 441 do_8tap_h_func avg, regular, 1, \size 442 do_8tap_h_func put, sharp, 2, \size 443 do_8tap_h_func avg, sharp, 2, \size 444 do_8tap_h_func put, smooth, 0, \size 445 do_8tap_h_func avg, smooth, 0, \size 446 .endm 447 448 do_8tap_h_filters 64 449 do_8tap_h_filters 32 450 do_8tap_h_filters 16 451 do_8tap_h_filters 8 452 do_8tap_h_filters 4 453 454 .ltorg 455 456 @ Vertical filters 457 458 @ Round, shift and saturate and store qreg1-2 over 4 lines 459 .macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type 460 vqrshrun.s16 \dreg1, \qreg1, #7 461 vqrshrun.s16 \dreg2, \qreg2, #7 462 .ifc \type,avg 463 vld1.32 {\tmp1[]}, [r0,:32], r1 464 vld1.32 {\tmp2[]}, [r0,:32], r1 465 vld1.32 {\tmp1[1]}, [r0,:32], r1 466 vld1.32 {\tmp2[1]}, [r0,:32], r1 467 vrhadd.u8 \dreg1, \dreg1, \tmp1 468 vrhadd.u8 \dreg2, \dreg2, \tmp2 469 sub r0, r0, r1, lsl #2 470 .endif 471 vst1.32 {\dreg1[0]}, [r0,:32], r1 472 vst1.32 {\dreg2[0]}, [r0,:32], r1 473 vst1.32 {\dreg1[1]}, [r0,:32], r1 474 vst1.32 {\dreg2[1]}, [r0,:32], r1 475 .endm 476 477 @ Round, shift and saturate and store qreg1-4 478 .macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type 479 vqrshrun.s16 \dreg1, \qreg1, #7 480 vqrshrun.s16 \dreg2, \qreg2, #7 481 vqrshrun.s16 \dreg3, \qreg3, #7 482 vqrshrun.s16 \dreg4, \qreg4, #7 483 .ifc \type,avg 484 vld1.8 {\tmp1}, [r0,:64], r1 485 vld1.8 {\tmp2}, [r0,:64], r1 486 vld1.8 {\tmp3}, [r0,:64], r1 487 vld1.8 {\tmp4}, [r0,:64], r1 488 vrhadd.u8 \dreg1, \dreg1, \tmp1 489 vrhadd.u8 \dreg2, \dreg2, \tmp2 490 vrhadd.u8 \dreg3, \dreg3, \tmp3 491 vrhadd.u8 \dreg4, \dreg4, \tmp4 492 sub r0, r0, r1, lsl #2 493 .endif 494 vst1.8 {\dreg1}, [r0,:64], r1 495 vst1.8 {\dreg2}, [r0,:64], r1 496 vst1.8 {\dreg3}, [r0,:64], r1 497 vst1.8 {\dreg4}, [r0,:64], r1 498 .endm 499 500 @ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2 501 @ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately 502 @ at the end with saturation. Indices 0 and 7 always have negative or zero 503 @ coefficients, so they can be accumulated into tmp1-tmp2 together with the 504 @ largest coefficient. 505 .macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2 506 vmul.s16 \dst1, \src2, d0[1] 507 vmul.s16 \dst2, \src3, d0[1] 508 vmul.s16 \tmp1, \src1, d0[0] 509 vmul.s16 \tmp2, \src2, d0[0] 510 vmla.s16 \dst1, \src3, d0[2] 511 vmla.s16 \dst2, \src4, d0[2] 512 .if \idx1 == 3 513 vmla.s16 \dst1, \src4, d0[3] 514 vmla.s16 \dst2, \src5, d0[3] 515 .else 516 vmla.s16 \dst1, \src5, d1[0] 517 vmla.s16 \dst2, \src6, d1[0] 518 .endif 519 vmla.s16 \dst1, \src6, d1[1] 520 vmla.s16 \dst2, \src7, d1[1] 521 vmla.s16 \tmp1, \src8, d1[3] 522 vmla.s16 \tmp2, \src9, d1[3] 523 vmla.s16 \dst1, \src7, d1[2] 524 vmla.s16 \dst2, \src8, d1[2] 525 .if \idx2 == 3 526 vmla.s16 \tmp1, \src4, d0[3] 527 vmla.s16 \tmp2, \src5, d0[3] 528 .else 529 vmla.s16 \tmp1, \src5, d1[0] 530 vmla.s16 \tmp2, \src6, d1[0] 531 .endif 532 vqadd.s16 \dst1, \dst1, \tmp1 533 vqadd.s16 \dst2, \dst2, \tmp2 534 .endm 535 536 @ Load pixels and extend them to 16 bit 537 .macro loadl dst1, dst2, dst3, dst4 538 vld1.8 {d2}, [r2], r3 539 vld1.8 {d3}, [r2], r3 540 vld1.8 {d4}, [r2], r3 541 .ifnb \dst4 542 vld1.8 {d5}, [r2], r3 543 .endif 544 vmovl.u8 \dst1, d2 545 vmovl.u8 \dst2, d3 546 vmovl.u8 \dst3, d4 547 .ifnb \dst4 548 vmovl.u8 \dst4, d5 549 .endif 550 .endm 551 552 @ Instantiate a vertical filter function for filtering 8 pixels at a time. 553 @ The height is passed in r4, the width in r5 and the filter coefficients 554 @ in r12. idx2 is the index of the largest filter coefficient (3 or 4) 555 @ and idx1 is the other one of them. 556 .macro do_8tap_8v type, idx1, idx2 557 function \type\()_8tap_8v_\idx1\idx2 558 sub r2, r2, r3, lsl #1 559 sub r2, r2, r3 560 vld1.16 {q0}, [r12, :128] 561 1: 562 mov r12, r4 563 564 loadl q5, q6, q7 565 loadl q8, q9, q10, q11 566 2: 567 loadl q12, q13, q14, q15 568 convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q5 569 convolve q3, q4, q7, q8, q9, q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5, q6 570 do_store q1, d2, q2, d4, q3, d6, q4, d8, d3, d5, d7, d9, \type 571 572 subs r12, r12, #4 573 beq 8f 574 575 loadl q4, q5, q6, q7 576 convolve q1, q2, q9, q10, q11, q12, q13, q14, q15, q4, q5, \idx1, \idx2, q8, q9 577 convolve q3, q8, q11, q12, q13, q14, q15, q4, q5, q6, q7, \idx1, \idx2, q9, q10 578 do_store q1, d2, q2, d4, q3, d6, q8, d16, d3, d5, d7, d17, \type 579 580 subs r12, r12, #4 581 beq 8f 582 583 loadl q8, q9, q10, q11 584 convolve q1, q2, q13, q14, q15, q4, q5, q6, q7, q8, q9, \idx1, \idx2, q12, q13 585 convolve q3, q12, q15, q4, q5, q6, q7, q8, q9, q10, q11, \idx1, \idx2, q13, q14 586 do_store q1, d2, q2, d4, q3, d6, q12, d24, d3, d5, d7, d25, \type 587 588 subs r12, r12, #4 589 bne 2b 590 591 8: 592 subs r5, r5, #8 593 beq 9f 594 @ r0 -= h * dst_stride 595 mls r0, r1, r4, r0 596 @ r2 -= h * src_stride 597 mls r2, r3, r4, r2 598 @ r2 -= 8 * src_stride 599 sub r2, r2, r3, lsl #3 600 @ r2 += 1 * src_stride 601 add r2, r2, r3 602 add r2, r2, #8 603 add r0, r0, #8 604 b 1b 605 9: 606 vpop {q4-q7} 607 pop {r4-r5} 608 bx lr 609 endfunc 610 .endm 611 612 do_8tap_8v put, 3, 4 613 do_8tap_8v put, 4, 3 614 do_8tap_8v avg, 3, 4 615 do_8tap_8v avg, 4, 3 616 617 @ Instantiate a vertical filter function for filtering a 4 pixels wide 618 @ slice. The first half of the registers contain one row, while the second 619 @ half of a register contains the second-next row (also stored in the first 620 @ half of the register two steps ahead). The convolution does two outputs 621 @ at a time; the output of q5-q12 into one, and q4-q13 into another one. 622 @ The first half of first output is the first output row, the first half 623 @ of the other output is the second output row. The second halves of the 624 @ registers are rows 3 and 4. 625 @ This only is designed to work for 4 or 8 output lines. 626 .macro do_8tap_4v type, idx1, idx2 627 function \type\()_8tap_4v_\idx1\idx2 628 sub r2, r2, r3, lsl #1 629 sub r2, r2, r3 630 vld1.16 {q0}, [r12, :128] 631 632 vld1.32 {d2[]}, [r2], r3 633 vld1.32 {d3[]}, [r2], r3 634 vld1.32 {d4[]}, [r2], r3 635 vld1.32 {d5[]}, [r2], r3 636 vld1.32 {d6[]}, [r2], r3 637 vld1.32 {d7[]}, [r2], r3 638 vext.8 d2, d2, d4, #4 639 vld1.32 {d8[]}, [r2], r3 640 vext.8 d3, d3, d5, #4 641 vld1.32 {d9[]}, [r2], r3 642 vmovl.u8 q5, d2 643 vext.8 d4, d4, d6, #4 644 vld1.32 {d28[]}, [r2], r3 645 vmovl.u8 q6, d3 646 vext.8 d5, d5, d7, #4 647 vld1.32 {d29[]}, [r2], r3 648 vmovl.u8 q7, d4 649 vext.8 d6, d6, d8, #4 650 vld1.32 {d30[]}, [r2], r3 651 vmovl.u8 q8, d5 652 vext.8 d7, d7, d9, #4 653 vmovl.u8 q9, d6 654 vext.8 d8, d8, d28, #4 655 vmovl.u8 q10, d7 656 vext.8 d9, d9, d29, #4 657 vmovl.u8 q11, d8 658 vext.8 d28, d28, d30, #4 659 vmovl.u8 q12, d9 660 vmovl.u8 q13, d28 661 662 convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q3 663 do_store4 q1, d2, q2, d4, d3, d5, \type 664 subs r4, r4, #4 665 beq 9f 666 667 vld1.32 {d2[]}, [r2], r3 668 vld1.32 {d3[]}, [r2], r3 669 vext.8 d29, d29, d2, #4 670 vext.8 d30, d30, d3, #4 671 vld1.32 {d2[1]}, [r2], r3 672 vmovl.u8 q14, d29 673 vld1.32 {d3[1]}, [r2], r3 674 vmovl.u8 q15, d30 675 vmovl.u8 q5, d2 676 vmovl.u8 q6, d3 677 678 convolve q1, q2, q9, q10, q11, q12, q13, q14, q15, q5, q6, \idx1, \idx2, q4, q3 679 do_store4 q1, d2, q2, d4, d3, d5, \type 680 681 9: 682 vpop {q4-q7} 683 pop {r4-r5} 684 bx lr 685 endfunc 686 .endm 687 688 do_8tap_4v put, 3, 4 689 do_8tap_4v put, 4, 3 690 do_8tap_4v avg, 3, 4 691 do_8tap_4v avg, 4, 3 692 693 .macro do_8tap_v_func type, filter, offset, size 694 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1 695 push {r4-r5} 696 vpush {q4-q7} 697 ldr r4, [sp, #72] 698 movrelx r12, X(ff_vp9_subpel_filters), r5 699 ldr r5, [sp, #80] 700 add r12, r12, 256*\offset 701 add r12, r12, r5, lsl #4 702 cmp r5, #8 703 mov r5, #\size 704 .if \size >= 8 705 bge \type\()_8tap_8v_34 706 b \type\()_8tap_8v_43 707 .else 708 bge \type\()_8tap_4v_34 709 b \type\()_8tap_4v_43 710 .endif 711 endfunc 712 .endm 713 714 .macro do_8tap_v_filters size 715 do_8tap_v_func put, regular, 1, \size 716 do_8tap_v_func avg, regular, 1, \size 717 do_8tap_v_func put, sharp, 2, \size 718 do_8tap_v_func avg, sharp, 2, \size 719 do_8tap_v_func put, smooth, 0, \size 720 do_8tap_v_func avg, smooth, 0, \size 721 .endm 722 723 do_8tap_v_filters 64 724 do_8tap_v_filters 32 725 do_8tap_v_filters 16 726 do_8tap_v_filters 8 727 do_8tap_v_filters 4