h264idct_neon.S (16269B)
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

/*
 * ff_h264_idct_add_neon
 *
 * 4x4 inverse transform of one coefficient block; the result is rounded,
 * added to a 4x4 block of 8-bit pixels and stored back with unsigned
 * saturation.
 *
 * In:   x0 = destination pixels (4 rows, 4 bytes used per row)
 *       x1 = sixteen 16-bit coefficients (32 bytes)
 *       w2 = row stride of the destination in bytes (sign-extended here)
 * Out:  destination rows updated; the 32 coefficient bytes are zeroed
 * Regs: x1 is restored to its entry value before returning.  Besides x0
 *       and x2 only vector registers v0-v7, v16-v19 and v30 are written
 *       (assuming transpose_4x4H touches only the registers passed to it).
 *       The add16/add16intra/add8 loops in this file keep state in
 *       x4-x15/x19/x20 across calls and depend on that.
 */
function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon:
        AARCH64_VALID_CALL_TARGET
        ld1             {v0.4h, v1.4h, v2.4h, v3.4h},  [x1]
        sxtw            x2,  w2
        movi            v30.8h, #0                  // zero source for clearing the block

        // first 1-D pass, interleaved with clearing the coefficient block
        add             v4.4h,  v0.4h,  v2.4h
        sshr            v16.4h, v1.4h,  #1
        st1             {v30.8h},    [x1], #16      // clear coefficients 0..7
        sshr            v17.4h, v3.4h,  #1
        st1             {v30.8h},    [x1], #16      // clear coefficients 8..15
        sub             v5.4h,  v0.4h,  v2.4h
        sub             v6.4h,  v16.4h, v3.4h
        add             v7.4h,  v1.4h,  v17.4h
        add             v0.4h,  v4.4h,  v7.4h
        add             v1.4h,  v5.4h,  v6.4h
        sub             v2.4h,  v5.4h,  v6.4h
        sub             v3.4h,  v4.4h,  v7.4h

        transpose_4x4H  v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7

        // second 1-D pass, interleaved with loading the destination rows
        add             v4.4h,  v0.4h,  v2.4h
        ld1             {v18.s}[0], [x0], x2        // row 0
        sshr            v16.4h,  v3.4h,  #1
        sshr            v17.4h,  v1.4h,  #1
        ld1             {v18.s}[1], [x0], x2        // row 1
        sub             v5.4h,  v0.4h,  v2.4h
        ld1             {v19.s}[1], [x0], x2        // row 2 goes to the high lane ...
        add             v6.4h,  v16.4h, v1.4h
        ins             v4.d[1],  v5.d[0]
        sub             v7.4h,  v17.4h, v3.4h
        ld1             {v19.s}[0], [x0], x2        // ... and row 3 to the low lane
        ins             v6.d[1],  v7.d[0]
        sub             x0,  x0,  x2, lsl #2        // rewind x0 by four rows
        add             v0.8h,  v4.8h,  v6.8h       // v0 = result rows 0 (low), 1 (high)
        sub             v1.8h,  v4.8h,  v6.8h       // v1 = result rows 3 (low), 2 (high)

        srshr           v0.8h,  v0.8h,  #6          // rounding shift: (x + 32) >> 6
        srshr           v1.8h,  v1.8h,  #6

        uaddw           v0.8h,  v0.8h,  v18.8b      // add the existing pixels
        uaddw           v1.8h,  v1.8h,  v19.8b

        sqxtun          v0.8b,  v0.8h               // saturate to unsigned 8 bit
        sqxtun          v1.8b,  v1.8h

        st1             {v0.s}[0],  [x0], x2        // row 0
        st1             {v0.s}[1],  [x0], x2        // row 1
        st1             {v1.s}[1],  [x0], x2        // row 2
        st1             {v1.s}[0],  [x0], x2        // row 3

        sub             x1,  x1,  #32               // undo the two post-increments on x1
        ret
endfunc

/*
 * ff_h264_idct_dc_add_neon
 *
 * DC-only variant for a 4x4 block: the first coefficient is rounded
 * ((x + 32) >> 6), added to every pixel of the 4x4 destination block and
 * the result is stored with unsigned saturation.
 *
 * In:   x0 = destination pixels, x1 = coefficient block, w2 = stride
 * Out:  destination rows updated; only the first coefficient is zeroed
 * Regs: writes x0, x2, w3 and v0-v4; x1 is left unchanged.
 */
function ff_h264_idct_dc_add_neon, export=1
.L_ff_h264_idct_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        sxtw            x2,  w2
        mov             w3,       #0
        ld1r            {v2.8h},  [x1]              // broadcast the DC coefficient
        strh            w3,       [x1]              // and clear it in memory
        srshr           v2.8h,  v2.8h,  #6
        ld1             {v0.s}[0],  [x0], x2
        ld1             {v0.s}[1],  [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0],  [x0], x2
        ld1             {v1.s}[1],  [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2        // rewind x0 by four rows
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc

/*
 * ff_h264_idct_add16_neon
 *
 * Walks 16 consecutive 4x4 blocks.  For block i, with
 * nnz = (byte at x4)[scan8[i]] and dc = first coefficient of the block:
 *   nnz == 0            -> block is skipped
 *   nnz == 1 && dc != 0 -> ff_h264_idct_dc_add_neon
 *   otherwise           -> ff_h264_idct_add_neon
 *
 * In:   x0 = dest, x1 = block_offset (32-bit entries), x2 = block,
 *       w3 = stride, x4 = byte table indexed through scan8
 * Regs: the return address is parked in x12 and the loop state lives in
 *       x5-x7, x9, x10, x13-x15; this relies on the two callees leaving
 *       those registers alone.
 */
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // nnz
        subs            w3,  w3,  #1
        b.lt            2f                      // nnz == 0: nothing to do
        ldrsh           w3,  [x1]               // dc (loads leave the flags alone)
        add             x0,  x0,  x6
        ccmp            w3,  #0,  #4,  eq       // nnz == 1 ? compare dc with 0 : force Z
        csel            x15, x13, x14, ne       // ne only when nnz == 1 && dc != 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block of 16 coefficients
        b.ne            1b
        ret             x12
endfunc

/*
 * ff_h264_idct_add16intra_neon
 *
 * Same walk and arguments as ff_h264_idct_add16_neon with a different
 * per-block decision:
 *   nnz != 0            -> ff_h264_idct_add_neon
 *   nnz == 0 && dc != 0 -> ff_h264_idct_dc_add_neon
 *   nnz == 0 && dc == 0 -> block is skipped
 */
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1           // scan8[i]
        ldrsw           x0,  [x5], #4           // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]   // nnz
        add             x0,  x0,  x6
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // dc
        csel            x15, x13, x14, eq       // nnz == 0 ? dc_add : full idct
        ccmp            w3,  #0,  #0,  eq       // nnz == 0 ? compare dc with 0 : force ne
        b.eq            2f                      // nnz == 0 && dc == 0: skip
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block of 16 coefficients
        b.ne            1b
        ret             x12
endfunc

/*
 * ff_h264_idct_add8_neon
 *
 * Processes block indices 16..19 into dest[0] and 32..35 into dest[1],
 * using the same per-block decision as ff_h264_idct_add16intra_neon.
 *
 * In:   x0 = pointer to two destination pointers, x1 = block_offset,
 *       x2 = block, w3 = stride, x4 = byte table indexed through scan8
 * Regs: x19/x20 are saved and restored on the stack.
 * NOTE(review): 0x40 bytes of stack are reserved although only the 16
 *       bytes for x19/x20 are written - confirm whether the larger frame
 *       is intentional.
 */
function ff_h264_idct_add8_neon, export=1
        stp             x19, x20, [sp, #-0x40]!
        mov             x12, x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, .L_ff_h264_idct_dc_add_neon
        movrel          x14, .L_ff_h264_idct_add_neon
        movrel          x7,  scan8, 16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq
        ccmp            w3,  #0,  #0,  eq
        b.eq            2f
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq     // mov x10, #16
        csel            x6,  x15, x6,  eq     // switch to dest[1] for the second group
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp], #0x40
        ret             x12
endfunc

/*
 * idct8x8_cols pass
 *
 * One 1-D pass of the 8x8 inverse transform over eight vectors of eight
 * 16-bit lanes.  v16, v17 and v19 are used as temporaries.
 *
 * pass 0: inputs 0-5 are in v24-v29; inputs 6 and 7 are loaded from [x1]
 *         into v30/v31 here and those 32 bytes are then cleared from v19,
 *         which the caller must have set to zero.  x1 advances by 32.
 *         Results are left in v24-v29, v18, v31.
 * pass 1: inputs are in v24-v29, v18, v31 (the layout pass 0 leaves and
 *         the caller transposes in place); results are left in v24-v31.
 *
 * The aliases va/vb swap between v18 and v30 so that the shared middle
 * section can be used by both passes.
 */
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8h, v26.8h, #1
        add             v16.8h, v24.8h, v28.8h
        ld1             {v30.8h, v31.8h}, [x1]
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        sub             v17.8h,  v24.8h, v28.8h
        sshr            v19.8h,  v30.8h, #1
        sub             v18.8h,  v18.8h,  v30.8h
        add             v19.8h,  v19.8h,  v26.8h
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8h, v26.8h, #1
        sshr            v19.8h, v18.8h, #1
        add             v16.8h, v24.8h, v28.8h
        sub             v17.8h, v24.8h, v28.8h
        sub             v30.8h, v30.8h, v18.8h
        add             v19.8h, v19.8h, v26.8h
  .endif
        add             v26.8h, v17.8h, va.8h
        sub             v28.8h, v17.8h, va.8h
        add             v24.8h, v16.8h, v19.8h
        sub             vb.8h,  v16.8h, v19.8h
        sub             v16.8h, v29.8h, v27.8h
        add             v17.8h, v31.8h, v25.8h
        sub             va.8h,  v31.8h, v25.8h
        add             v19.8h, v29.8h, v27.8h
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v25.8h, #1
        sshr            v27.8h, v27.8h, #1
        sshr            v29.8h, v29.8h, #1
        sshr            v31.8h, v31.8h, #1
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v16.8h, #2
        sshr            v27.8h, v17.8h, #2
        sshr            v29.8h, va.8h,  #2
        sshr            v31.8h, v19.8h, #2
        sub             v19.8h, v19.8h, v25.8h
        sub             va.8h,  v27.8h, va.8h
        add             v17.8h, v17.8h, v29.8h
        add             v16.8h, v16.8h, v31.8h
  .if \pass == 0
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v18.8h
        sub             v18.8h, v26.8h, v18.8h
        add             v26.8h, v28.8h, v17.8h
        add             v27.8h, v30.8h, v16.8h
        sub             v29.8h, v28.8h, v17.8h
        sub             v28.8h, v30.8h, v16.8h
  .else
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v30.8h
        sub             v30.8h, v26.8h, v30.8h
        add             v26.8h, v28.8h, v17.8h
        sub             v29.8h, v28.8h, v17.8h
        add             v27.8h, v18.8h, v16.8h
        sub             v28.8h, v18.8h, v16.8h
  .endif
        .unreq          va
        .unreq          vb
.endm

/*
 * ff_h264_idct8_add_neon
 *
 * 8x8 inverse transform of one coefficient block; the result is rounded
 * ((x + 32) >> 6), added to an 8x8 block of 8-bit pixels and stored with
 * unsigned saturation.
 *
 * In:   x0 = destination pixels (8 rows of 8 bytes)
 *       x1 = sixty-four 16-bit coefficients (128 bytes)
 *       w2 = row stride of the destination in bytes (sign-extended here)
 * Out:  destination rows updated; the 128 coefficient bytes are zeroed
 * Regs: x1 is restored to its entry value; x0, x2, x3 and vector
 *       registers v0-v7, v16-v19, v24-v31 are written.
 */
function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon:
        AARCH64_VALID_CALL_TARGET
        movi            v19.8h,   #0                // zero source for clearing the block
        sxtw            x2,       w2
        ld1             {v24.8h, v25.8h}, [x1]      // rows 0, 1
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        ld1             {v26.8h, v27.8h}, [x1]      // rows 2, 3
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        ld1             {v28.8h, v29.8h}, [x1]      // rows 4, 5
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16

        idct8x8_cols    0                           // also loads and clears rows 6, 7
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        mov             x3,  x0                     // x3 = store pointer, x0 = load pointer
        srshr           v24.8h, v24.8h, #6
        ld1             {v0.8b},     [x0], x2
        srshr           v25.8h, v25.8h, #6
        ld1             {v1.8b},     [x0], x2
        srshr           v26.8h, v26.8h, #6
        ld1             {v2.8b},     [x0], x2
        srshr           v27.8h, v27.8h, #6
        ld1             {v3.8b},     [x0], x2
        srshr           v28.8h, v28.8h, #6
        ld1             {v4.8b},     [x0], x2
        srshr           v29.8h, v29.8h, #6
        ld1             {v5.8b},     [x0], x2
        srshr           v30.8h, v30.8h, #6
        ld1             {v6.8b},     [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v7.8b},     [x0], x2
        uaddw           v24.8h, v24.8h, v0.8b
        uaddw           v25.8h, v25.8h, v1.8b
        uaddw           v26.8h, v26.8h, v2.8b
        sqxtun          v0.8b,  v24.8h
        uaddw           v27.8h, v27.8h, v3.8b
        sqxtun          v1.8b,  v25.8h
        uaddw           v28.8h, v28.8h, v4.8b
        sqxtun          v2.8b,  v26.8h
        st1             {v0.8b},     [x3], x2
        uaddw           v29.8h, v29.8h, v5.8b
        sqxtun          v3.8b,  v27.8h
        st1             {v1.8b},     [x3], x2
        uaddw           v30.8h, v30.8h, v6.8b
        sqxtun          v4.8b,  v28.8h
        st1             {v2.8b},     [x3], x2
        uaddw           v31.8h, v31.8h, v7.8b
        sqxtun          v5.8b,  v29.8h
        st1             {v3.8b},     [x3], x2
        sqxtun          v6.8b,  v30.8h
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b},     [x3], x2
        st1             {v5.8b},     [x3], x2
        st1             {v6.8b},     [x3], x2
        st1             {v7.8b},     [x3], x2

        sub             x1,  x1,  #128              // undo the eight post-increments on x1
        ret
endfunc

/*
 * ff_h264_idct8_dc_add_neon
 *
 * DC-only variant for an 8x8 block: the first coefficient is rounded
 * ((x + 32) >> 6), added to every pixel of the 8x8 destination block and
 * the result is stored with unsigned saturation.
 *
 * In:   x0 = destination pixels, x1 = coefficient block, w2 = stride
 * Out:  destination rows updated; only the first coefficient is zeroed
 * Regs: writes x0, x2, w3, v0-v7 and v24-v31; x1 is left unchanged.
 */
function ff_h264_idct8_dc_add_neon, export=1
.L_ff_h264_idct8_dc_add_neon:
        AARCH64_VALID_CALL_TARGET
        mov             w3,       #0
        sxtw            x2,       w2
        ld1r            {v31.8h}, [x1]              // broadcast the DC coefficient
        strh            w3,       [x1]              // and clear it in memory
        ld1             {v0.8b},  [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v1.8b},     [x0], x2
        ld1             {v2.8b},     [x0], x2
        uaddw           v24.8h, v31.8h, v0.8b
        ld1             {v3.8b},     [x0], x2
        uaddw           v25.8h, v31.8h, v1.8b
        ld1             {v4.8b},     [x0], x2
        uaddw           v26.8h, v31.8h, v2.8b
        ld1             {v5.8b},     [x0], x2
        uaddw           v27.8h, v31.8h, v3.8b
        ld1             {v6.8b},     [x0], x2
        uaddw           v28.8h, v31.8h, v4.8b
        ld1             {v7.8b},     [x0], x2
        uaddw           v29.8h, v31.8h, v5.8b
        uaddw           v30.8h, v31.8h, v6.8b
        uaddw           v31.8h, v31.8h, v7.8b       // last use of the broadcast DC value
        sqxtun          v0.8b,  v24.8h
        sqxtun          v1.8b,  v25.8h
        sqxtun          v2.8b,  v26.8h
        sqxtun          v3.8b,  v27.8h
        sub             x0,  x0,  x2, lsl #3        // rewind x0 by eight rows
        st1             {v0.8b},     [x0], x2
        sqxtun          v4.8b,  v28.8h
        st1             {v1.8b},     [x0], x2
        sqxtun          v5.8b,  v29.8h
        st1             {v2.8b},     [x0], x2
        sqxtun          v6.8b,  v30.8h
        st1             {v3.8b},     [x0], x2
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b},     [x0], x2
        st1             {v5.8b},     [x0], x2
        st1             {v6.8b},     [x0], x2
        st1             {v7.8b},     [x0], x2
        ret
endfunc

/*
 * ff_h264_idct8_add4_neon
 *
 * Walks four 8x8 blocks (scan8 / block_offset indices 0, 4, 8, 12).  For
 * each, with nnz = (byte at x4)[scan8[i]] and dc = first coefficient:
 *   nnz == 0            -> block is skipped
 *   nnz == 1 && dc != 0 -> ff_h264_idct8_dc_add_neon
 *   otherwise           -> ff_h264_idct8_add_neon
 *
 * In:   x0 = dest, x1 = block_offset (32-bit entries), x2 = block,
 *       w3 = stride, x4 = byte table indexed through scan8
 * Regs: the return address is parked in x12.  The stride is placed in w2
 *       once before the loop; both callees only sign-extend w2 into x2,
 *       so its low 32 bits stay valid for every iteration.
 */
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w2,  w3         // stride
        movrel          x7,  scan8
        mov             w10, #16
        movrel          x13, .L_ff_h264_idct8_dc_add_neon
        movrel          x14, .L_ff_h264_idct8_add_neon
1:      ldrb            w9,  [x7], #4           // scan8[i], i advances by 4
        ldrsw           x0,  [x5], #16          // block_offset[i]
        ldrb            w9,  [x4, w9, uxtw]     // nnz
        subs            w9,  w9,  #1
        b.lt            2f                      // nnz == 0: nothing to do
        ldrsh           w11,  [x1]              // dc
        add             x0,  x6,  x0
        ccmp            w11, #0,  #4,  eq       // nnz == 1 ? compare dc with 0 : force Z
        csel            x15, x13, x14, ne       // ne only when nnz == 1 && dc != 0
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next block of 64 coefficients
        b.ne            1b
        ret             x12
endfunc

// Byte offsets into the table passed in x4, one entry per 4x4 block index.
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst