looprestoration_tmpl.S
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x7,  x8,  [x3]
        ldp             x9,  x3,  [x3, #16]
        ldp             x10, x11, [x4]
        ldp             x12, x4,  [x4, #16]

        mov             x13, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2            // src + stride
        csel            x2,  x1,  x2,  le       // if (h <= 1) x2 = x1
        add             x13, x0,  x13, lsl #1

        movi            v30.8h, #3
        movi            v31.4s, #3
1:
        ld1             {v0.8h, v1.8h}, [x10], #32
        ld1             {v2.8h, v3.8h}, [x11], #32
        ld1             {v4.8h, v5.8h}, [x12], #32
        ld1             {v6.8h, v7.8h}, [x4],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x9], #48
        ld1             {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
        ext             v8.16b,  v0.16b,  v1.16b,  #2   // [0][1]
        ext             v9.16b,  v2.16b,  v3.16b,  #2   // [1][1]
        ext             v10.16b, v4.16b,  v5.16b,  #2   // [2][1]
        ext             v11.16b, v0.16b,  v1.16b,  #4   // [0][2]
        ext             v12.16b, v2.16b,  v3.16b,  #4   // [1][2]
        ext             v13.16b, v4.16b,  v5.16b,  #4   // [2][2]

        add             v14.8h,  v2.8h,   v8.8h         // [1][0] + [0][1]
        add             v15.8h,  v9.8h,   v10.8h        // [1][1] + [2][1]

        add             v28.8h,  v0.8h,   v11.8h        // [0][0] + [0][2]
        add             v14.8h,  v14.8h,  v12.8h        // () + [1][2]
        add             v29.8h,  v4.8h,   v13.8h        // [2][0] + [2][2]

        ext             v8.16b,  v6.16b,  v7.16b,  #2   // [3][1]
        ext             v11.16b, v6.16b,  v7.16b,  #4   // [3][2]

        add             v14.8h,  v14.8h,  v15.8h        // mid
        add             v15.8h,  v28.8h,  v29.8h        // corners

        add             v28.8h,  v4.8h,   v9.8h         // [2][0] + [1][1]
        add             v29.8h,  v10.8h,  v8.8h         // [2][1] + [3][1]

        add             v2.8h,   v2.8h,   v12.8h        // [1][0] + [1][2]
        add             v28.8h,  v28.8h,  v13.8h        // () + [2][2]
        add             v4.8h,   v6.8h,   v11.8h        // [3][0] + [3][2]

        add             v0.8h,   v28.8h,  v29.8h        // mid
        add             v2.8h,   v2.8h,   v4.8h         // corners

        shl             v4.8h,   v14.8h,  #2
        mla             v4.8h,   v15.8h,  v30.8h        // * 3 -> a

        shl             v0.8h,   v0.8h,   #2
        mla             v0.8h,   v2.8h,   v30.8h        // * 3 -> a

        ext             v8.16b,  v16.16b, v17.16b, #4   // [0][1]
        ext             v9.16b,  v17.16b, v18.16b, #4
        ext             v10.16b, v16.16b, v17.16b, #8   // [0][2]
        ext             v11.16b, v17.16b, v18.16b, #8
        ext             v12.16b, v19.16b, v20.16b, #4   // [1][1]
        ext             v13.16b, v20.16b, v21.16b, #4
        add             v8.4s,   v8.4s,   v19.4s        // [0][1] + [1][0]
        add             v9.4s,   v9.4s,   v20.4s
        add             v16.4s,  v16.4s,  v10.4s        // [0][0] + [0][2]
        add             v17.4s,  v17.4s,  v11.4s
        ext             v14.16b, v19.16b, v20.16b, #8   // [1][2]
        ext             v15.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s        // () + [2][0]
        add             v17.4s,  v17.4s,  v23.4s
        add             v28.4s,  v12.4s,  v14.4s        // [1][1] + [1][2]
        add             v29.4s,  v13.4s,  v15.4s
        ext             v10.16b, v22.16b, v23.16b, #4   // [2][1]
        ext             v11.16b, v23.16b, v24.16b, #4
        add             v8.4s,   v8.4s,   v28.4s        // mid (incomplete)
        add             v9.4s,   v9.4s,   v29.4s

        add             v19.4s,  v19.4s,  v14.4s        // [1][0] + [1][2]
        add             v20.4s,  v20.4s,  v15.4s
        add             v14.4s,  v22.4s,  v12.4s        // [2][0] + [1][1]
        add             v15.4s,  v23.4s,  v13.4s

        ext             v12.16b, v22.16b, v23.16b, #8   // [2][2]
        ext             v13.16b, v23.16b, v24.16b, #8
        ext             v28.16b, v25.16b, v26.16b, #4   // [3][1]
        ext             v29.16b, v26.16b, v27.16b, #4
        add             v8.4s,   v8.4s,   v10.4s        // () + [2][1] = mid
        add             v9.4s,   v9.4s,   v11.4s
        add             v14.4s,  v14.4s,  v10.4s        // () + [2][1]
        add             v15.4s,  v15.4s,  v11.4s
        ext             v10.16b, v25.16b, v26.16b, #8   // [3][2]
        ext             v11.16b, v26.16b, v27.16b, #8
        add             v16.4s,  v16.4s,  v12.4s        // () + [2][2] = corner
        add             v17.4s,  v17.4s,  v13.4s

        add             v12.4s,  v12.4s,  v28.4s        // [2][2] + [3][1]
        add             v13.4s,  v13.4s,  v29.4s
        add             v25.4s,  v25.4s,  v10.4s        // [3][0] + [3][2]
        add             v26.4s,  v26.4s,  v11.4s

        add             v14.4s,  v14.4s,  v12.4s        // mid
        add             v15.4s,  v15.4s,  v13.4s
        add             v19.4s,  v19.4s,  v25.4s        // corner
        add             v20.4s,  v20.4s,  v26.4s

.if \bpc == 8
        ld1             {v25.8b}, [x1], #8              // src
        ld1             {v26.8b}, [x2], #8
.else
        ld1             {v25.8h}, [x1], #16             // src
        ld1             {v26.8h}, [x2], #16
.endif

        shl             v8.4s,   v8.4s,   #2
        shl             v9.4s,   v9.4s,   #2
        mla             v8.4s,   v16.4s,  v31.4s        // * 3 -> b
        mla             v9.4s,   v17.4s,  v31.4s

.if \bpc == 8
        uxtl            v25.8h,  v25.8b                 // src
        uxtl            v26.8h,  v26.8b
.endif

        shl             v14.4s,  v14.4s,  #2
        shl             v15.4s,  v15.4s,  #2
        mla             v14.4s,  v19.4s,  v31.4s        // * 3 -> b
        mla             v15.4s,  v20.4s,  v31.4s

        umlsl           v8.4s,   v4.4h,   v25.4h        // b - a * src
        umlsl2          v9.4s,   v4.8h,   v25.8h
        umlsl           v14.4s,  v0.4h,   v26.4h        // b - a * src
        umlsl2          v15.4s,  v0.8h,   v26.8h
        mov             v0.16b,  v1.16b
        rshrn           v8.4h,   v8.4s,   #9
        rshrn2          v8.8h,   v9.4s,   #9
        mov             v2.16b,  v3.16b
        rshrn           v14.4h,  v14.4s,  #9
        rshrn2          v14.8h,  v15.4s,  #9
        subs            w5,  w5,  #8
        mov             v4.16b,  v5.16b
        st1             {v8.8h},  [x0],  #16
        mov             v6.16b,  v7.16b
        st1             {v14.8h}, [x13], #16

        b.le            3f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        mov             v25.16b, v27.16b
        ld1             {v1.8h}, [x10], #16
        ld1             {v3.8h}, [x11], #16
        ld1             {v5.8h}, [x12], #16
        ld1             {v7.8h}, [x4],  #16
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x9], #32
        ld1             {v26.4s, v27.4s}, [x3], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
        ldp             x7,  x8,  [x1]
        ldr             x1,  [x1, #16]
        ldp             x9,  x10, [x2]
        ldr             x2,  [x2, #16]

        dup             v31.8h,  w4
        dup             v30.8h,  w5

        movi            v6.8h,   #3
        movi            v7.4s,   #3
1:
        ld1             {v0.8h, v1.8h}, [x9],  #32
        ld1             {v2.8h, v3.8h}, [x10], #32
        ld1             {v4.8h, v5.8h}, [x2],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
        ext             v25.16b, v0.16b,  v1.16b,  #2   // -stride
        ext             v26.16b, v2.16b,  v3.16b,  #2   // 0
        ext             v27.16b, v4.16b,  v5.16b,  #2   // +stride
        ext             v28.16b, v0.16b,  v1.16b,  #4   // +1-stride
        ext             v29.16b, v2.16b,  v3.16b,  #4   // +1
        add             v2.8h,   v2.8h,   v25.8h        // -1, -stride
        ext             v25.16b, v4.16b,  v5.16b,  #4   // +1+stride
        add             v26.8h,  v26.8h,  v27.8h        // 0, +stride
        add             v0.8h,   v0.8h,   v28.8h        // -1-stride, +1-stride
        add             v2.8h,   v2.8h,   v26.8h
        add             v4.8h,   v4.8h,   v25.8h        // -1+stride, +1+stride
        add             v2.8h,   v2.8h,   v29.8h        // +1
        add             v0.8h,   v0.8h,   v4.8h

        ext             v25.16b, v16.16b, v17.16b, #4   // -stride
        ext             v26.16b, v17.16b, v18.16b, #4
        shl             v2.8h,   v2.8h,   #2
        ext             v27.16b, v16.16b, v17.16b, #8   // +1-stride
        ext             v28.16b, v17.16b, v18.16b, #8
        ext             v29.16b, v19.16b, v20.16b, #4   // 0
        ext             v4.16b,  v20.16b, v21.16b, #4
        mla             v2.8h,   v0.8h,   v6.8h         // * 3 -> a
        add             v25.4s,  v25.4s,  v19.4s        // -stride, -1
        add             v26.4s,  v26.4s,  v20.4s
        add             v16.4s,  v16.4s,  v27.4s        // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v28.4s
        ext             v27.16b, v19.16b, v20.16b, #8   // +1
        ext             v28.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s        // -1+stride
        add             v17.4s,  v17.4s,  v23.4s
        add             v29.4s,  v29.4s,  v27.4s        // 0, +1
        add             v4.4s,   v4.4s,   v28.4s
        add             v25.4s,  v25.4s,  v29.4s
        add             v26.4s,  v26.4s,  v4.4s
        ext             v27.16b, v22.16b, v23.16b, #4   // +stride
        ext             v28.16b, v23.16b, v24.16b, #4
        ext             v29.16b, v22.16b, v23.16b, #8   // +1+stride
        ext             v4.16b,  v23.16b, v24.16b, #8
.if \bpc == 8
        ld1             {v19.8b}, [x0]                  // src
.else
        ld1             {v19.8h}, [x0]                  // src
.endif
        add             v25.4s,  v25.4s,  v27.4s        // +stride
        add             v26.4s,  v26.4s,  v28.4s
        add             v16.4s,  v16.4s,  v29.4s        // +1+stride
        add             v17.4s,  v17.4s,  v4.4s
        shl             v25.4s,  v25.4s,  #2
        shl             v26.4s,  v26.4s,  #2
        mla             v25.4s,  v16.4s,  v7.4s         // * 3 -> b
        mla             v26.4s,  v17.4s,  v7.4s
.if \bpc == 8
        uxtl            v19.8h,  v19.8b                 // src
.endif
        mov             v0.16b,  v1.16b
        umlsl           v25.4s,  v2.4h,   v19.4h        // b - a * src
        umlsl2          v26.4s,  v2.8h,   v19.8h
        mov             v2.16b,  v3.16b
        rshrn           v25.4h,  v25.4s,  #9
        rshrn2          v25.8h,  v26.4s,  #9

        subs            w3,  w3,  #8

        // weighted1
        mov             v4.16b,  v5.16b

        ld1             {v1.8h}, [x9],  #16
        ld1             {v3.8h}, [x10], #16
        smull           v26.4s,  v25.4h,  v31.4h        // v = t1 * w1
        smull2          v27.4s,  v25.8h,  v31.8h
        ld1             {v5.8h}, [x2],  #16
        rshrn           v26.4h,  v26.4s,  #11
        rshrn2          v26.8h,  v27.4s,  #11
        usqadd          v19.8h,  v26.8h
.if \bpc == 8
        mov             v16.16b, v18.16b
        sqxtun          v26.8b,  v19.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8b}, [x0], #8
.else
        mov             v16.16b, v18.16b
        umin            v26.8h,  v19.8h,  v30.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8h}, [x0], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x1], #32
        b               2b

3:
        ret
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x3,  x7,  [x3]
        ldp             x4,  x8,  [x4]
        mov             x10, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2            // src + stride
        csel            x2,  x1,  x2,  le       // if (h <= 1) x2 = x1
        add             x10, x0,  x10, lsl #1
        movi            v4.8h,   #5
        movi            v5.4s,   #5
        movi            v6.8h,   #6
        movi            v7.4s,   #6
1:
        ld1             {v0.8h, v1.8h}, [x4], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b,  v1.16b,  #4   // +1-stride
        ext             v25.16b, v2.16b,  v3.16b,  #4   // +1+stride
        ext             v22.16b, v0.16b,  v1.16b,  #2   // -stride
        ext             v23.16b, v2.16b,  v3.16b,  #2   // +stride
        add             v0.8h,   v0.8h,   v24.8h        // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h        // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h        // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        mul             v8.8h,   v25.8h,  v4.8h         // * 5
        mla             v8.8h,   v23.8h,  v6.8h         // * 6

        ext             v22.16b, v16.16b, v17.16b, #4   // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4   // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8   // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8   // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,   v0.8h,   v4.8h         // * 5
        mla             v0.8h,   v2.8h,   v6.8h         // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x1], #8
        ld1             {v30.8b}, [x2], #8
.else
        ld1             {v31.8h}, [x1], #16
        ld1             {v30.8h}, [x2], #16
.endif
        add             v16.4s,  v16.4s,  v26.4s        // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s        // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        mul             v9.4s,   v19.4s,  v5.4s         // * 5
        mla             v9.4s,   v24.4s,  v7.4s         // * 6
        mul             v10.4s,  v20.4s,  v5.4s         // * 5
        mla             v10.4s,  v25.4s,  v7.4s         // * 6

        add             v22.4s,  v22.4s,  v24.4s        // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s         // * 5
        mla             v16.4s,  v22.4s,  v7.4s         // * 6
        mul             v17.4s,  v17.4s,  v5.4s         // * 5
        mla             v17.4s,  v23.4s,  v7.4s         // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlsl           v16.4s,  v0.4h,   v31.4h        // b - a * src
        umlsl2          v17.4s,  v0.8h,   v31.8h
        umlsl           v9.4s,   v8.4h,   v30.4h        // b - a * src
        umlsl2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8
        subs            w5,  w5,  #8
        mov             v2.16b,  v3.16b
        st1             {v16.8h}, [x0],  #16
        st1             {v9.8h},  [x10], #16

        b.le            9f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        ld1             {v1.8h}, [x4], #16
        ld1             {v3.8h}, [x8], #16
        ld1             {v17.4s, v18.4s}, [x3], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        str             d10, [sp, #0x10]
        stp             d14, d15, [sp, #0x20]

        dup             v14.8h,  w6
        dup             v15.8h,  w7

        ldp             x2,  x7,  [x2]
        ldp             x3,  x8,  [x3]
        cmp             w5,  #1
        add             x1,  x0,  x1            // src + stride
        // if (h <= 1), set the pointer to the second row to any dummy buffer
        // we can clobber (x2 in this case)
        csel            x1,  x2,  x1,  le
        movi            v4.8h,   #5
        movi            v5.4s,   #5
        movi            v6.8h,   #6
        movi            v7.4s,   #6
1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x2], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b,  v1.16b,  #4   // +1-stride
        ext             v25.16b, v2.16b,  v3.16b,  #4   // +1+stride
        ext             v22.16b, v0.16b,  v1.16b,  #2   // -stride
        ext             v23.16b, v2.16b,  v3.16b,  #2   // +stride
        add             v0.8h,   v0.8h,   v24.8h        // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h        // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h        // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        mul             v8.8h,   v25.8h,  v4.8h         // * 5
        mla             v8.8h,   v23.8h,  v6.8h         // * 6

        ext             v22.16b, v16.16b, v17.16b, #4   // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4   // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8   // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8   // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,   v0.8h,   v4.8h         // * 5
        mla             v0.8h,   v2.8h,   v6.8h         // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x0]
        ld1             {v30.8b}, [x1]
.else
        ld1             {v31.8h}, [x0]
        ld1             {v30.8h}, [x1]
.endif
        add             v16.4s,  v16.4s,  v26.4s        // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s        // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        mul             v9.4s,   v19.4s,  v5.4s         // * 5
        mla             v9.4s,   v24.4s,  v7.4s         // * 6
        mul             v10.4s,  v20.4s,  v5.4s         // * 5
        mla             v10.4s,  v25.4s,  v7.4s         // * 6

        add             v22.4s,  v22.4s,  v24.4s        // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s         // * 5
        mla             v16.4s,  v22.4s,  v7.4s         // * 6
        mul             v17.4s,  v17.4s,  v5.4s         // * 5
        mla             v17.4s,  v23.4s,  v7.4s         // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlsl           v16.4s,  v0.4h,   v31.4h        // b - a * src
        umlsl2          v17.4s,  v0.8h,   v31.8h
        umlsl           v9.4s,   v8.4h,   v30.4h        // b - a * src
        umlsl2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8

        subs            w4,  w4,  #8

        // weighted1
        mov             v2.16b,  v3.16b

        ld1             {v1.8h}, [x3], #16
        ld1             {v3.8h}, [x8], #16
        smull           v22.4s,  v16.4h,  v14.4h        // v
        smull2          v23.4s,  v16.8h,  v14.8h
        mov             v16.16b, v18.16b
        smull           v24.4s,  v9.4h,   v14.4h
        smull2          v25.4s,  v9.8h,   v14.8h
        mov             v19.16b, v21.16b
        rshrn           v22.4h,  v22.4s,  #11
        rshrn2          v22.8h,  v23.4s,  #11
        rshrn           v23.4h,  v24.4s,  #11
        rshrn2          v23.8h,  v25.4s,  #11
        usqadd          v31.8h,  v22.8h
        usqadd          v30.8h,  v23.8h
.if \bpc == 8
        sqxtun          v22.8b,  v31.8h
        sqxtun          v23.8b,  v30.8h
        st1             {v22.8b}, [x0], #8
        st1             {v23.8b}, [x1], #8
.else
        umin            v22.8h,  v31.8h,  v15.8h
        umin            v23.8h,  v30.8h,  v15.8h
        st1             {v22.8h}, [x0], #16
        st1             {v23.8h}, [x1], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x20]
        ldr             d10, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
function sgr_weighted2_\bpc\()bpc_neon, export=1
        cmp             w5,  #2
        add             x10, x0,  x1
        add             x12, x2,  #2*FILTER_OUT_STRIDE
        add             x13, x3,  #2*FILTER_OUT_STRIDE
        ld2r            {v30.8h, v31.8h}, [x6]          // wt[0], wt[1]
.if \bpc == 16
        dup             v29.8h,  w7
.endif
        mov             x8,  #4*FILTER_OUT_STRIDE
        lsl             x1,  x1,  #1
        add             w9,  w4,  #7
        bic             x9,  x9,  #7                    // Aligned width
.if \bpc == 8
        sub             x1,  x1,  x9
.else
        sub             x1,  x1,  x9,  lsl #1
.endif
        sub             x8,  x8,  x9,  lsl #1
        mov             w9,  w4
        b.lt            2f
1:
.if \bpc == 8
        ld1             {v0.8b},  [x0]
        ld1             {v16.8b}, [x10]
.else
        ld1             {v0.8h},  [x0]
        ld1             {v16.8h}, [x10]
.endif
        ld1             {v1.8h},  [x2],  #16
        ld1             {v17.8h}, [x12], #16
        ld1             {v2.8h},  [x3],  #16
        ld1             {v18.8h}, [x13], #16
        subs            w4,  w4,  #8
.if \bpc == 8
        uxtl            v0.8h,   v0.8b
        uxtl            v16.8h,  v16.8b
.endif
        smull           v3.4s,   v1.4h,   v30.4h        // wt[0] * t1
        smlal           v3.4s,   v2.4h,   v31.4h        // wt[1] * t2
        smull2          v4.4s,   v1.8h,   v30.8h        // wt[0] * t1
        smlal2          v4.4s,   v2.8h,   v31.8h        // wt[1] * t2
        smull           v19.4s,  v17.4h,  v30.4h        // wt[0] * t1
        smlal           v19.4s,  v18.4h,  v31.4h        // wt[1] * t2
        smull2          v20.4s,  v17.8h,  v30.8h        // wt[0] * t1
        smlal2          v20.4s,  v18.8h,  v31.8h        // wt[1] * t2
        rshrn           v3.4h,   v3.4s,   #11
        rshrn2          v3.8h,   v4.4s,   #11
        rshrn           v19.4h,  v19.4s,  #11
        rshrn2          v19.8h,  v20.4s,  #11
        usqadd          v0.8h,   v3.8h
        usqadd          v16.8h,  v19.8h
.if \bpc == 8
        sqxtun          v3.8b,   v0.8h
        sqxtun          v19.8b,  v16.8h
        st1             {v3.8b},  [x0],  #8
        st1             {v19.8b}, [x10], #8
.else
        umin            v3.8h,   v0.8h,   v29.8h
        umin            v19.8h,  v16.8h,  v29.8h
        st1             {v3.8h},  [x0],  #16
        st1             {v19.8h}, [x10], #16
.endif
        b.gt            1b

        subs            w5,  w5,  #2
        cmp             w5,  #1
        b.lt            0f
        mov             w4,  w9
        add             x0,  x0,  x1
        add             x10, x10, x1
        add             x2,  x2,  x8
        add             x12, x12, x8
        add             x3,  x3,  x8
        add             x13, x13, x8
        b.eq            2f
        b               1b

2:
.if \bpc == 8
        ld1             {v0.8b}, [x0]
.else
        ld1             {v0.8h}, [x0]
.endif
        ld1             {v1.8h}, [x2], #16
        ld1             {v2.8h}, [x3], #16
        subs            w4,  w4,  #8
.if \bpc == 8
        uxtl            v0.8h,   v0.8b
.endif
        smull           v3.4s,   v1.4h,   v30.4h        // wt[0] * t1
        smlal           v3.4s,   v2.4h,   v31.4h        // wt[1] * t2
        smull2          v4.4s,   v1.8h,   v30.8h        // wt[0] * t1
        smlal2          v4.4s,   v2.8h,   v31.8h        // wt[1] * t2
        rshrn           v3.4h,   v3.4s,   #11
        rshrn2          v3.8h,   v4.4s,   #11
        usqadd          v0.8h,   v3.8h
.if \bpc == 8
        sqxtun          v3.8b,   v0.8h
        st1             {v3.8b}, [x0], #8
.else
        umin            v3.8h,   v0.8h,   v29.8h
        st1             {v3.8h}, [x0], #16
.endif
        b.gt            2b
0:
        ret
endfunc
.endm