h264pred_neon.S (26288B)
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * Register usage shared by every routine in this file (as seen in the code):
 *   x0 = address of the top-left sample of the block that gets written
 *   x1 = distance in bytes between two consecutive rows
 * Neighbouring samples are read from the row at x0 - x1 ("top") and from
 * the column immediately left of x0 ("left").  Only x2-x5, w3/w4, v0-v7
 * and v16/v17 are used as scratch; no stack is touched.
 */

/*
 * ldcol.8 rd, rs, rt, n=8, hi=0
 * Gather one byte per row into consecutive byte lanes of \rd.
 *   rs : address register, post-incremented by \rt after every load
 *   rt : per-load increment (the row stride)
 *   n  : 8  -> lanes 0-7, 16 -> lanes 0-15
 *        any smaller value loads four lanes only, chosen by \hi
 *   hi : 0 -> lanes 0-3, 1 -> lanes 4-7 (only looked at when n < 8)
 * Lanes that are not loaded keep their previous contents.
 */
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
  .irp lane, 0, 1, 2, 3
        ld1             {\rd\().b}[\lane],  [\rs], \rt
  .endr
.endif
.if \n >= 8 || \hi == 1
  .irp lane, 4, 5, 6, 7
        ld1             {\rd\().b}[\lane],  [\rs], \rt
  .endr
.endif
.if \n == 16
  .irp lane, 8, 9, 10, 11, 12, 13, 14, 15
        ld1             {\rd\().b}[\lane],  [\rs], \rt
  .endr
.endif
.endm

/* ------------------------------------------------------------------------
 * 16x16 prediction, 8-bit samples
 * ---------------------------------------------------------------------- */

// Fill the 16x16 block with the constant 128.
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b, #128
        b               .L_pred16x16_dc_end
endfunc

// Fill the 16x16 block with the rounded mean of the 16 samples above it.
function ff_pred16x16_top_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> row above the block
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b                     // sum of the 16 top samples
        rshrn           v0.8b,  v0.8h,  #4              // (sum + 8) >> 4
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

// Fill the 16x16 block with the rounded mean of the 16 samples left of it.
function ff_pred16x16_left_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 -> column left of the block
        ldcol.8         v0,  x2,  x1,  16
        uaddlv          h0,  v0.16b                     // sum of the 16 left samples
        rshrn           v0.8b,  v0.8h,  #4              // (sum + 8) >> 4
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

// Fill the 16x16 block with the rounded mean of the 16 top and 16 left
// samples.  The tail at .L_pred16x16_dc_end is shared with the three
// functions above: it stores v0.16b to 16 consecutive rows.
function ff_pred16x16_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> left column
        sub             x2,  x0,  x1                    // x2 -> top row
        ld1             {v0.16b}, [x2]
        ldcol.8         v1,  x3,  x1,  16
        uaddlv          h0,  v0.16b                     // top sum
        uaddlv          h1,  v1.16b                     // left sum
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5              // (sum + 16) >> 5
        dup             v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov             w3,  #8                         // 8 iterations, two rows each
6:      subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x0], x1
        b.ne            6b
        ret
endfunc

// Horizontal prediction: each of the 16 rows is filled with the sample
// directly left of it.
function ff_pred16x16_hor_neon, export=1
        mov             w3,  #16                        // row counter
        sub             x2,  x0,  #1                    // x2 -> left neighbour of row 0
1:      ld1r            {v0.16b}, [x2], x1              // broadcast the left sample
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

// Vertical prediction: the row above the block is copied into all 16 rows.
function ff_pred16x16_vert_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        lsl             x1,  x1,  #1                    // step two rows at a time
        ld1             {v0.16b}, [x2], x1              // afterwards x2 -> row 1
        mov             w3,  #8
1:      subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1              // even rows
        st1             {v0.16b}, [x2], x1              // odd rows
        b.ne            1b
        ret
endfunc

// Plane prediction for a 16x16 block.
// Gradients b (horizontal, from the top row) and c (vertical, from the left
// column) are derived from weighted differences of opposite neighbours; each
// output sample is (a + b * x + c * y) >> 5 saturated to 8 bits.
function ff_pred16x16_plane_neon, export=1
        sub             x3,  x0,  x1                    // x3 -> top row
        movrel          x4,  p16weight
        add             x2,  x3,  #8                    // x2 -> top[8]
        sub             x3,  x3,  #1                    // x3 -> top-left corner sample
        ld1             {v0.8b},  [x3]                  // top[-1..6]
        ld1             {v2.8b},  [x2], x1              // top[8..15]
        ldcol.8         v1,  x3,  x1                    // left[-1..6]
        add             x3,  x3,  x1                    // skip left[7]
        ldcol.8         v3,  x3,  x1                    // left[8..15]
        rev64           v0.8b,  v0.8b                   // top[6..-1]
        rev64           v1.8b,  v1.8b                   // left[6..-1]
        uaddl           v7.8h,  v2.8b,  v3.8b           // lane 7 = top[15] + left[15]
        usubl           v2.8h,  v2.8b,  v0.8b           // top[8+i]  - top[6-i]
        usubl           v3.8h,  v3.8b,  v1.8b           // left[8+i] - left[6-i]
        ld1             {v0.8h},  [x4]                  // weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h           // lane 0 = H sum, lane 1 = V sum
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h           // 5 * sum
        rshrn           v4.4h,  v2.4s,  #6              // lane 0 = b, lane 1 = c
        trn2            v5.4h,  v4.4h,  v4.4h           // lane 0 = c
        add             v2.4h,  v4.4h,  v5.4h           // b + c
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // lane 0 = top[15] + left[15]
        sub             v3.4h,  v3.4h,  v2.4h           // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h           // lane 0 = a
        shl             v3.4h,  v4.4h,  #4              // lane 0 = 16 * b
        ext             v0.16b, v0.16b, v0.16b, #14
        sub             v6.4h,  v5.4h,  v3.4h           // lane 0 = c - 16 * b
        mov             v0.h[0],  wzr                   // v0 = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v4.h[0]         // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v4.h[0]
        dup             v3.8h,  v6.h[0]
        shl             v2.8h,  v2.8h,  #3              // 8 * b: step to the right half
        add             v1.8h,  v1.8h,  v0.8h           // a + b * x, first row
        add             v3.8h,  v3.8h,  v2.8h           // c - 8 * b: step to next row
        mov             w3,  #16
1:
        sqshrun         v0.8b,  v1.8h,  #5              // columns 0-7
        add             v1.8h,  v1.8h,  v2.8h
        sqshrun2        v0.16b, v1.8h,  #5              // columns 8-15
        add             v1.8h,  v1.8h,  v3.8h
        subs            w3,  w3,  #1
        st1             {v0.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

// Weight tables used by the plane predictors.
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst
const   p8weight, align=4
        .short          1,2,3,4,1,2,3,4
endconst

/* ------------------------------------------------------------------------
 * 8x8 prediction, 8-bit samples
 * ---------------------------------------------------------------------- */

// Horizontal prediction: each of the 8 rows is filled with the sample
// directly left of it.
function ff_pred8x8_hor_neon, export=1
        mov             w3,  #8                         // row counter
        sub             x2,  x0,  #1                    // x2 -> left neighbour of row 0
1:      ld1r            {v0.8b},  [x2], x1
        subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

// Vertical prediction: the row above the block is copied into all 8 rows.
function ff_pred8x8_vert_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        lsl             x1,  x1,  #1                    // step two rows at a time
        ld1             {v0.8b},  [x2], x1              // afterwards x2 -> row 1
        mov             w3,  #4
1:      subs            w3,  w3,  #1
        st1             {v0.8b},  [x0], x1              // even rows
        st1             {v0.8b},  [x2], x1              // odd rows
        b.ne            1b
        ret
endfunc

// Plane prediction for an 8x8 block.
// Same scheme as the 16x16 variant, with four weighted differences per
// direction; each output sample is (a + b * x + c * y) >> 5 saturated to
// 8 bits.
function ff_pred8x8_plane_neon, export=1
        sub             x3,  x0,  x1                    // x3 -> top row
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #4                    // x2 -> top[4]
        sub             x3,  x3,  #1                    // x3 -> top-left corner sample
        ld1             {v0.s}[0],  [x3]                // top[-1..2]  -> bytes 0-3
        ld1             {v2.s}[0],  [x2], x1            // top[4..7]   -> bytes 0-3
        ldcol.8         v0,  x3,  x1,  4,  hi=1         // left[-1..2] -> bytes 4-7
        add             x3,  x3,  x1                    // skip left[3]
        ldcol.8         v3,  x3,  x1,  4                // left[4..7]  -> bytes 0-3
        uaddl           v7.8h,  v2.8b,  v3.8b           // lane 3 = top[7] + left[7]
        rev32           v0.8b,  v0.8b                   // reverse each group of four
        trn1            v2.2s,  v2.2s,  v3.2s           // top[4..7] | left[4..7]
        usubl           v2.8h,  v2.8b,  v0.8b
        ld1             {v6.8h},  [x4]                  // weights 1,2,3,4,1,2,3,4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]                  // 1..8, used for the x ramp
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s           // lane 0 = H sum, lane 1 = V sum
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s           // 17 * sum
        rshrn           v5.4h,  v2.4s,  #5              // lane 0 = b, lane 1 = c
        addp            v2.4h,  v5.4h,  v5.4h           // b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h           // 3 * (b + c)
        rev64           v7.4h,  v7.4h                   // lane 0 = top[7] + left[7]
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h           // lane 0 = a
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr                   // v0 = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v5.h[0]         // b * x
        dup             v1.8h,  v2.h[0]
        dup             v2.8h,  v5.h[1]                 // c: step to next row
        add             v1.8h,  v1.8h,  v0.8h           // a + b * x, first row
        mov             w3,  #8
1:
        sqshrun         v0.8b,  v1.8h,  #5
        subs            w3,  w3,  #1
        add             v1.8h,  v1.8h,  v2.8h
        st1             {v0.8b},  [x0], x1
        b.ne            1b
        ret
endfunc

// The 8x8 DC variants below all finish at .L_pred8x8_dc_end, which writes
// v0.8b to rows 0-3 and v1.8b to rows 4-7.

// Fill the 8x8 block with the constant 128.
function ff_pred8x8_128_dc_neon, export=1
        movi            v0.8b,  #128
        movi            v1.8b,  #128
        b               .L_pred8x8_dc_end
endfunc

// DC from the top row only: the left four columns get the mean of
// top[0..3], the right four columns the mean of top[4..7], on every row.
function ff_pred8x8_top_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        ld1             {v0.8b},  [x2]
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h           // lane 0/1 = sums of top[0..3]/[4..7]
        zip1            v0.8h,  v0.8h,  v0.8h
        rshrn           v2.8b,  v0.8h,  #2              // (sum + 2) >> 2
        zip1            v0.8b,  v2.8b,  v2.8b
        zip1            v1.8b,  v2.8b,  v2.8b
        b               .L_pred8x8_dc_end
endfunc

// DC from the left column only: rows 0-3 get the mean of left[0..3],
// rows 4-7 the mean of left[4..7].
function ff_pred8x8_left_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 -> left column
        ldcol.8         v0,  x2,  x1
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h           // lane 0/1 = sums of left[0..3]/[4..7]
        rshrn           v2.8b,  v0.8h,  #2              // (sum + 2) >> 2
        dup             v1.8b,  v2.b[1]
        dup             v0.8b,  v2.b[0]
        b               .L_pred8x8_dc_end
endfunc

// DC from both the top row and the left column, computed per 4x4 quadrant.
function ff_pred8x8_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> left column
        sub             x2,  x0,  x1                    // x2 -> top row
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1
        uaddlp          v0.4h,  v0.8b
        uaddlp          v1.4h,  v1.8b
        trn1            v2.2s,  v0.2s,  v1.2s
        trn2            v3.2s,  v0.2s,  v1.2s
        addp            v4.4h,  v2.4h,  v3.4h           // four-sample sums
        addp            v5.4h,  v4.4h,  v4.4h           // eight-sample sums
        rshrn           v6.8b,  v5.8h,  #3              // (sum + 4) >> 3
        rshrn           v7.8b,  v4.8h,  #2              // (sum + 2) >> 2
        dup             v0.8b,  v6.b[0]
        dup             v2.8b,  v7.b[2]
        dup             v1.8b,  v7.b[3]
        dup             v3.8b,  v6.b[1]
        zip1            v0.2s,  v0.2s,  v2.2s           // rows 0-3: left | right quadrant
        zip1            v1.2s,  v1.2s,  v3.2s           // rows 4-7: left | right quadrant
.L_pred8x8_dc_end:
        mov             w3,  #4
        add             x2,  x0,  x1,  lsl #2           // x2 -> row 4
6:      st1             {v0.8b},  [x0], x1              // upper half
        st1             {v1.8b},  [x2], x1              // lower half
        subs            w3,  w3,  #1
        b.ne            6b
        ret
endfunc

// DC using the top row and only the upper four left samples.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             x3,  x0,  #1                    // x3 -> left column
        sub             x2,  x0,  x1                    // x2 -> top row
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4                // left[0..3]
        zip1            v0.4s,  v0.4s,  v1.4s           // top[0..3], left[0..3], top[4..7], ...
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v0.4h,  v0.4h
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v3.b[0]
        dup             v6.8b,  v2.b[2]
        dup             v5.8b,  v2.b[0]
        zip1            v0.2s,  v4.2s,  v6.2s
        zip1            v1.2s,  v5.2s,  v6.2s
        b               .L_pred8x8_dc_end
endfunc

// DC using only the upper four left samples; rows 4-7 are filled with 128.
function ff_pred8x8_l00_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 -> left column
        ldcol.8         v0,  x2,  x1,  4                // left[0..3]
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h
        rshrn           v0.8b,  v0.8h,  #2              // (sum + 2) >> 2
        movi            v1.8b,  #128
        dup             v0.8b,  v0.b[0]
        b               .L_pred8x8_dc_end
endfunc

// DC using the top row and only the lower four left samples.
function ff_pred8x8_0lt_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        add             x3,  x0,  x1,  lsl #2           // row 4 ...
        sub             x3,  x3,  #1                    // ... x3 -> left[4]
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4,  hi=1         // left[4..7] -> bytes 4-7
        zip1            v0.4s,  v0.4s,  v1.4s
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v0.4h,  v0.4h
        rshrn           v2.8b,  v0.8h,  #2
        rshrn           v3.8b,  v1.8h,  #3
        dup             v4.8b,  v2.b[0]
        dup             v5.8b,  v2.b[3]
        dup             v6.8b,  v2.b[2]
        dup             v7.8b,  v3.b[1]
        zip1            v0.2s,  v4.2s,  v6.2s
        zip1            v1.2s,  v5.2s,  v7.2s
        b               .L_pred8x8_dc_end
endfunc

// DC using only the lower four left samples; rows 0-3 are filled with 128.
function ff_pred8x8_0l0_dc_neon, export=1
        add             x2,  x0,  x1,  lsl #2           // row 4 ...
        sub             x2,  x2,  #1                    // ... x2 -> left[4]
        ldcol.8         v1,  x2,  x1,  4
        uaddlp          v2.4h,  v1.8b
        addp            v2.4h,  v2.4h,  v2.4h
        rshrn           v1.8b,  v2.8h,  #2              // (sum + 2) >> 2
        movi            v0.8b,  #128
        dup             v1.8b,  v1.b[0]
        b               .L_pred8x8_dc_end
endfunc

/* ------------------------------------------------------------------------
 * 10-bit variants: samples are 16-bit halfwords, so the left neighbour
 * sits two bytes before x0 and a 16-sample row spans two vector registers.
 * ---------------------------------------------------------------------- */

/*
 * ldcol.16 rd, rs, rt, n=4, hi=0
 * Gather one halfword per row into consecutive halfword lanes of \rd.
 *   rs : address register, post-incremented by \rt after every load
 *   rt : per-load increment (the row stride)
 *   n  : 4 -> four lanes chosen by \hi, 8 -> all eight lanes
 *   hi : 0 -> lanes 0-3, 1 -> lanes 4-7
 * Lanes that are not loaded keep their previous contents.
 */
.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
.if \n >= 4 && \hi == 0
  .irp lane, 0, 1, 2, 3
        ld1             {\rd\().h}[\lane],  [\rs], \rt
  .endr
.endif
.if \n == 8 || \hi == 1
  .irp lane, 4, 5, 6, 7
        ld1             {\rd\().h}[\lane],  [\rs], \rt
  .endr
.endif
.endm

// slower than C
/*
function ff_pred16x16_128_dc_neon_10, export=1
        movi            v0.8h, #2, lsl #8               // 512, 1 << (bit_depth - 1)

        b               .L_pred16x16_dc_10_end
endfunc
*/

// Fill the 16x16 block with the rounded mean of the 16 samples above it.
function ff_pred16x16_top_dc_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> top row

        ld1             {v0.8h, v1.8h}, [x2]

        add             v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h                      // sum of the 16 top samples

        urshr           v0.4h,  v0.4h,  #4              // (sum + 8) >> 4
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc

// slower than C
/*
function ff_pred16x16_left_dc_neon_10, export=1
        sub             x2,  x0,  #2                    // access to the "left" column
        ldcol.16        v0,  x2,  x1,  8
        ldcol.16        v1,  x2,  x1,  8                // load "left" column

        add             v0.8h,  v0.8h,  v1.8h
        addv            h0,  v0.8h

        urshr           v0.4h,  v0.4h,  #4
        dup             v0.8h,  v0.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
*/

// Fill the 16x16 block with the rounded mean of the 16 top and 16 left
// samples.  The tail at .L_pred16x16_dc_10_end is shared with
// ff_pred16x16_top_dc_neon_10: it stores v0.8h twice per row for 16 rows.
function ff_pred16x16_dc_neon_10, export=1
        sub             x3,  x0,  #2                    // x3 -> left column
        sub             x2,  x0,  x1                    // x2 -> top row

        ld1             {v0.8h, v1.8h}, [x2]            // top[0..15]
        ldcol.16        v2,  x3,  x1,  8                // left[0..7]
        ldcol.16        v3,  x3,  x1,  8                // left[8..15]

        add             v0.8h,  v0.8h,  v1.8h
        add             v2.8h,  v2.8h,  v3.8h
        add             v0.8h,  v0.8h,  v2.8h
        addv            h0,  v0.8h                      // sum of all 32 neighbours

        urshr           v0.4h,  v0.4h,  #5              // (sum + 16) >> 5
        dup             v0.8h,  v0.h[0]
.L_pred16x16_dc_10_end:
        mov             v1.16b, v0.16b                  // second half of each row
        mov             w3,  #8                         // 8 iterations, two rows each
6:      subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            6b
        ret
endfunc

// Horizontal prediction: each of the 16 rows is filled with the sample
// directly left of it.
function ff_pred16x16_hor_neon_10, export=1
        add             x3,  x0,  #16                   // x3 -> samples 8-15 of row 0
        sub             x2,  x0,  #2                    // x2 -> left neighbour of row 0

        mov             w4,  #16                        // row counter
1:      ld1r            {v0.8h},  [x2], x1
        subs            w4,  w4,  #1
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x3], x1
        b.ne            1b
        ret
endfunc

// Vertical prediction: the row above the block is copied into all 16 rows.
function ff_pred16x16_vert_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        lsl             x1,  x1,  #1                    // step two rows at a time

        ld1             {v0.8h, v1.8h}, [x2], x1        // afterwards x2 -> row 1

        mov             w3,  #8
1:      subs            w3,  w3,  #1
        st1             {v0.8h, v1.8h}, [x0], x1        // even rows
        st1             {v0.8h, v1.8h}, [x2], x1        // odd rows

        b.ne            1b
        ret
endfunc

// Plane prediction for a 16x16 block of 10-bit samples.
// Same scheme as ff_pred16x16_plane_neon; the per-row accumulators are kept
// in 32-bit lanes (v16/v17) and the result is clamped to 1023.
function ff_pred16x16_plane_neon_10, export=1
        sub             x3,  x0,  x1                    // x3 -> top row
        movrel          x4,  p16weight
        add             x2,  x3,  #16                   // x2 -> top[8]
        sub             x3,  x3,  #2                    // x3 -> top-left corner sample
        ld1             {v0.8h},  [x3]                  // top[-1..6]
        ld1             {v2.8h},  [x2], x1              // top[8..15]
        ldcol.16        v1,  x3,  x1,  8                // left[-1..6]
        add             x3,  x3,  x1                    // skip left[7]
        ldcol.16        v3,  x3,  x1,  8                // left[8..15]

        // rev64 + ext #8 together reverse all eight halfwords
        rev64           v16.8h, v0.8h
        rev64           v17.8h, v1.8h
        ext             v0.16b, v16.16b, v16.16b, #8    // top[6..-1]
        ext             v1.16b, v17.16b, v17.16b, #8    // left[6..-1]

        add             v7.8h,  v2.8h,  v3.8h           // lane 7 = top[15] + left[15]
        sub             v2.8h,  v2.8h,  v0.8h           // top[8+i]  - top[6-i]
        sub             v3.8h,  v3.8h,  v1.8h           // left[8+i] - left[6-i]
        ld1             {v0.8h},  [x4]                  // weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h           // lane 0 = H sum, lane 1 = V sum
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h           // 5 * sum
        rshrn           v4.4h,  v2.4s,  #6              // lane 0 = b, lane 1 = c
        trn2            v5.4h,  v4.4h,  v4.4h           // lane 0 = c
        add             v2.4h,  v4.4h,  v5.4h           // b + c
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // lane 0 = top[15] + left[15]
        sub             v3.4h,  v3.4h,  v2.4h           // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h           // lane 0 = a (32-bit)
        ext             v0.16b, v0.16b, v0.16b, #14
        sxtl            v6.4s,  v5.4h                   // c

        mov             v0.h[0],  wzr                   // v0 = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v4.h[0]         // b * x for x = 0..7
        dup             v16.4s, v2.s[0]
        dup             v17.4s, v2.s[0]
        dup             v2.8h,  v4.h[0]                 // b
        dup             v3.4s,  v6.s[0]                 // c
        sshll           v2.4s,  v2.4h,  #3              // b * 8
        saddw           v16.4s, v16.4s, v0.4h           // a + b * x, x = 0..3
        saddw2          v17.4s, v17.4s, v0.8h           // a + b * x, x = 4..7
        sub             v3.4s,  v3.4s,  v2.4s           // c - 8 * b: step to next row

        mov             w3,  #16
        mvni            v4.8h,  #0xFC,  lsl #8          // 1023 for clipping
1:
        sqshrun         v0.4h,  v16.4s, #5              // columns 0-7
        sqshrun2        v0.8h,  v17.4s, #5
        add             v16.4s, v16.4s, v2.4s
        add             v17.4s, v17.4s, v2.4s
        sqshrun         v1.4h,  v16.4s, #5              // columns 8-15
        sqshrun2        v1.8h,  v17.4s, #5
        add             v16.4s, v16.4s, v3.4s
        add             v17.4s, v17.4s, v3.4s

        subs            w3,  w3,  #1

        smin            v0.8h,  v0.8h,  v4.8h
        smin            v1.8h,  v1.8h,  v4.8h

        st1             {v0.8h, v1.8h}, [x0], x1
        b.ne            1b
        ret
endfunc

// Horizontal prediction: each of the 8 rows is filled with the sample
// directly left of it.
function ff_pred8x8_hor_neon_10, export=1
        mov             w3,  #8                         // row counter
        sub             x2,  x0,  #2                    // x2 -> left neighbour of row 0

1:      ld1r            {v0.8h},  [x2], x1
        subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

// Vertical prediction: the row above the block is copied into all 8 rows.
function ff_pred8x8_vert_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        lsl             x1,  x1,  #1                    // step two rows at a time

        ld1             {v0.8h},  [x2], x1              // afterwards x2 -> row 1
        mov             w3,  #4
1:      subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1              // even rows
        st1             {v0.8h},  [x2], x1              // odd rows
        b.ne            1b
        ret
endfunc

// Plane prediction for an 8x8 block of 10-bit samples.
// Same scheme as ff_pred8x8_plane_neon; the per-row accumulators are kept
// in 32-bit lanes and the result is clamped to 1023.
function ff_pred8x8_plane_neon_10, export=1
        sub             x3,  x0,  x1                    // x3 -> top row
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #8                    // x2 -> top[4]
        sub             x3,  x3,  #2                    // x3 -> top-left corner sample
        ld1             {v0.d}[0],  [x3]                // top[-1..2]  -> lanes 0-3
        ld1             {v2.d}[0],  [x2], x1            // top[4..7]   -> lanes 0-3
        ldcol.16        v0,  x3,  x1,  hi=1             // left[-1..2] -> lanes 4-7
        add             x3,  x3,  x1                    // skip left[3]
        ldcol.16        v3,  x3,  x1,  4                // left[4..7]  -> lanes 0-3
        add             v7.8h,  v2.8h,  v3.8h           // lane 3 = top[7] + left[7]
        rev64           v0.8h,  v0.8h                   // reverse each group of four
        trn1            v2.2d,  v2.2d,  v3.2d           // top[4..7] | left[4..7]
        sub             v2.8h,  v2.8h,  v0.8h
        ld1             {v6.8h},  [x4]                  // weights 1,2,3,4,1,2,3,4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]                  // 1..8, used for the x ramp
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s           // lane 0 = H sum, lane 1 = V sum
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s           // 17 * sum
        rshrn           v5.4h,  v2.4s,  #5              // lane 0 = b, lane 1 = c
        addp            v2.4h,  v5.4h,  v5.4h           // b + c
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h           // 3 * (b + c)
        rev64           v7.4h,  v7.4h                   // lane 0 = top[7] + left[7]
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (first weight)
        shl             v2.4h,  v7.4h,  #4
        ssubl           v2.4s,  v2.4h,  v3.4h           // lane 0 = a (32-bit)
        ext             v0.16b, v0.16b, v0.16b, #14
        mov             v0.h[0],  wzr                   // v0 = 0, 1, ..., 7
        dup             v1.4s,  v2.s[0]
        dup             v2.4s,  v2.s[0]
        dup             v3.8h,  v5.h[1]                 // c: step to next row
        smlal           v1.4s,  v0.4h,  v5.h[0]         // a + b * x, x = 0..3
        smlal2          v2.4s,  v0.8h,  v5.h[0]         // a + b * x, x = 4..7
        mov             w3,  #8
        mvni            v4.8h,  #0xFC,  lsl #8          // 1023 for clipping
1:
        sqshrun         v0.4h,  v1.4s,  #5
        sqshrun2        v0.8h,  v2.4s,  #5

        saddw           v1.4s,  v1.4s,  v3.4h
        saddw           v2.4s,  v2.4s,  v3.4h

        subs            w3,  w3,  #1

        smin            v0.8h,  v0.8h,  v4.8h

        st1             {v0.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

// The 10-bit 8x8 DC variants below all finish at .L_pred8x8_dc_10_end,
// which writes v0.8h to rows 0-3 and v1.8h to rows 4-7.

// Fill the 8x8 block with the constant 512.
function ff_pred8x8_128_dc_neon_10, export=1
        movi            v0.8h,  #2,  lsl #8             // 512, 1 << (bit_depth - 1)
        movi            v1.8h,  #2,  lsl #8
        b               .L_pred8x8_dc_10_end
endfunc

// DC from the top row only: the left four columns get the mean of
// top[0..3], the right four columns the mean of top[4..7], on every row.
function ff_pred8x8_top_dc_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        ld1             {v0.8h},  [x2]

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h           // lane 0/1 = sums of top[0..3]/[4..7]
        zip1            v0.4h,  v0.4h,  v0.4h
        urshr           v2.4h,  v0.4h,  #2              // (sum + 2) >> 2
        zip1            v0.8h,  v2.8h,  v2.8h
        zip1            v1.8h,  v2.8h,  v2.8h
        b               .L_pred8x8_dc_10_end
endfunc

// DC from the left column only: rows 0-3 get the mean of left[0..3],
// rows 4-7 the mean of left[4..7].
function ff_pred8x8_left_dc_neon_10, export=1
        sub             x2,  x0,  #2                    // x2 -> left column
        ldcol.16        v0,  x2,  x1,  8

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v0.4h,  v0.4h,  v0.4h           // lane 0/1 = sums of left[0..3]/[4..7]
        urshr           v2.4h,  v0.4h,  #2              // (sum + 2) >> 2
        dup             v1.8h,  v2.h[1]
        dup             v0.8h,  v2.h[0]
        b               .L_pred8x8_dc_10_end
endfunc

// DC from both the top row and the left column, computed per 4x4 quadrant.
function ff_pred8x8_dc_neon_10, export=1
        sub             x3,  x0,  #2                    // x3 -> left column
        sub             x2,  x0,  x1                    // x2 -> top row

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  8

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        trn1            v2.2s,  v0.2s,  v1.2s
        trn2            v3.2s,  v0.2s,  v1.2s
        addp            v4.4h,  v2.4h,  v3.4h           // four-sample sums
        addp            v5.4h,  v4.4h,  v4.4h           // eight-sample sums
        urshr           v6.4h,  v5.4h,  #3              // (sum + 4) >> 3
        urshr           v7.4h,  v4.4h,  #2              // (sum + 2) >> 2
        dup             v0.8h,  v6.h[0]
        dup             v2.8h,  v7.h[2]
        dup             v1.8h,  v7.h[3]
        dup             v3.8h,  v6.h[1]
        zip1            v0.2d,  v0.2d,  v2.2d           // rows 0-3: left | right quadrant
        zip1            v1.2d,  v1.2d,  v3.2d           // rows 4-7: left | right quadrant
.L_pred8x8_dc_10_end:
        mov             w3,  #4
        add             x2,  x0,  x1,  lsl #2           // x2 -> row 4

6:      subs            w3,  w3,  #1
        st1             {v0.8h},  [x0], x1              // upper half
        st1             {v1.8h},  [x2], x1              // lower half
        b.ne            6b
        ret
endfunc

// DC using the top row and only the upper four left samples.
function ff_pred8x8_l0t_dc_neon_10, export=1
        sub             x3,  x0,  #2                    // x3 -> left column
        sub             x2,  x0,  x1                    // x2 -> top row

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  4                // left[0..3]

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.4h,  v1.4h,  v1.4h
        addp            v0.4h,  v0.4h,  v0.4h           // lane 0/1 = sums of top[0..3]/[4..7]
        addp            v1.4h,  v1.4h,  v1.4h           // sum of left[0..3] in every lane
        add             v1.4h,  v1.4h,  v0.4h

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3              // the pred4x4 part

        dup             v4.4h,  v3.h[0]
        dup             v5.4h,  v2.h[0]
        dup             v6.4h,  v2.h[1]

        zip1            v0.2d,  v4.2d,  v6.2d
        zip1            v1.2d,  v5.2d,  v6.2d
        b               .L_pred8x8_dc_10_end
endfunc

// DC using only the upper four left samples; rows 4-7 are filled with 512.
function ff_pred8x8_l00_dc_neon_10, export=1
        sub             x2,  x0,  #2                    // x2 -> left column

        ldcol.16        v0,  x2,  x1,  4                // left[0..3]

        addp            v0.4h,  v0.4h,  v0.4h
        addp            v0.4h,  v0.4h,  v0.4h
        urshr           v0.4h,  v0.4h,  #2              // (sum + 2) >> 2

        movi            v1.8h,  #2,  lsl #8             // 512
        dup             v0.8h,  v0.h[0]
        b               .L_pred8x8_dc_10_end
endfunc

// DC using the top row and only the lower four left samples.
function ff_pred8x8_0lt_dc_neon_10, export=1
        sub             x2,  x0,  x1                    // x2 -> top row
        add             x3,  x0,  x1,  lsl #2           // row 4 ...
        sub             x3,  x3,  #2                    // ... x3 -> left[4]

        ld1             {v0.8h},  [x2]
        ldcol.16        v1,  x3,  x1,  hi=1             // left[4..7] -> lanes 4-7

        addp            v0.8h,  v0.8h,  v0.8h
        addp            v1.8h,  v1.8h,  v1.8h
        addp            v0.4h,  v0.4h,  v0.4h
        addp            v1.4h,  v1.4h,  v1.4h
        zip1            v0.2s,  v0.2s,  v1.2s
        add             v1.4h,  v0.4h,  v1.4h

        urshr           v2.4h,  v0.4h,  #2
        urshr           v3.4h,  v1.4h,  #3

        dup             v4.4h,  v2.h[0]
        dup             v5.4h,  v2.h[3]
        dup             v6.4h,  v2.h[1]
        dup             v7.4h,  v3.h[1]

        zip1            v0.2d,  v4.2d,  v6.2d
        zip1            v1.2d,  v5.2d,  v7.2d
        b               .L_pred8x8_dc_10_end
endfunc

// DC using only the lower four left samples; rows 0-3 are filled with 512.
function ff_pred8x8_0l0_dc_neon_10, export=1
        add             x2,  x0,  x1,  lsl #2           // row 4 ...
        sub             x2,  x2,  #2                    // ... x2 -> left[4]

        ldcol.16        v1,  x2,  x1,  4

        addp            v2.8h,  v1.8h,  v1.8h
        addp            v2.4h,  v2.4h,  v2.4h
        urshr           v1.4h,  v2.4h,  #2              // (sum + 2) >> 2

        movi            v0.8h,  #2,  lsl #8             // 512
        dup             v1.8h,  v1.h[0]
        b               .L_pred8x8_dc_10_end
endfunc