looprestoration_tmpl.S
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter_row1_Xbpc_neon(int16_t *tmp,
//                                             const pixel *src,
//                                             const int32_t **a, const int16_t **b,
//                                             const int w);
function sgr_finish_filter_row1_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldr             r4,  [sp, #100]
        ldrd            r6,  r7,  [r2]
        ldr             r2,  [r2, #8]
        ldrd            r8,  r9,  [r3]
        ldr             r3,  [r3, #8]
        vmov.i16        q14, #3
        vmov.i32        q15, #3
1:
        vld1.16         {q0},       [r8, :128]!
        vld1.16         {q1},       [r9, :128]!
        vld1.16         {q2},       [r3, :128]!
        vld1.32         {q8,  q9},  [r6, :128]!
        vld1.32         {q10, q11}, [r7, :128]!
        vld1.32         {q12, q13}, [r2, :128]!

2:
        subs            r4,  r4,  #4
        vext.8          d6,  d0,  d1,  #2   // -stride
        vext.8          d7,  d2,  d3,  #2   // 0
        vext.8          d8,  d4,  d5,  #2   // +stride
        vext.8          d9,  d0,  d1,  #4   // +1-stride
        vext.8          d10, d2,  d3,  #4   // +1
        vext.8          d11, d4,  d5,  #4   // +1+stride
        vadd.i16        d2,  d2,  d6        // -1, -stride
        vadd.i16        d7,  d7,  d8        // 0, +stride
        vadd.i16        d0,  d0,  d9        // -1-stride, +1-stride
        vadd.i16        d2,  d2,  d7
        vadd.i16        d4,  d4,  d11       // -1+stride, +1+stride
        vadd.i16        d2,  d2,  d10       // +1
        vadd.i16        d0,  d0,  d4

        vext.8          q3,  q8,  q9,  #4   // -stride
        vshl.i16        d2,  d2,  #2
        vext.8          q4,  q8,  q9,  #8   // +1-stride
        vext.8          q5,  q10, q11, #4   // 0
        vext.8          q6,  q10, q11, #8   // +1
        vmla.i16        d2,  d0,  d28       // * 3 -> a
        vadd.i32        q3,  q3,  q10       // -stride, -1
        vadd.i32        q8,  q8,  q4        // -1-stride, +1-stride
        vadd.i32        q5,  q5,  q6        // 0, +1
        vadd.i32        q8,  q8,  q12       // -1+stride
        vadd.i32        q3,  q3,  q5
        vext.8          q7,  q12, q13, #4   // +stride
        vext.8          q10, q12, q13, #8   // +1+stride
.if \bpc == 8
        vld1.32         {d24[0]}, [r1, :32]! // src
.else
        vld1.16         {d24},  [r1, :64]!  // src
.endif
        vadd.i32        q3,  q3,  q7        // +stride
        vadd.i32        q8,  q8,  q10       // +1+stride
        vshl.i32        q3,  q3,  #2
        vmla.i32        q3,  q8,  q15       // * 3 -> b
.if \bpc == 8
        vmovl.u8        q12, d24            // src
.endif
        vmov            d0,  d1
        vmlsl.u16       q3,  d2,  d24       // b - a * src
        vmov            d2,  d3
        vrshrn.i32      d6,  q3,  #9
        vmov            d4,  d5
        vst1.16         {d6},  [r0]!

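        // Loop tail: exit once the whole row has been filtered; otherwise
        // shift the already-loaded a/b data along and fetch the next
        // 4 columns before looping.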
        ble             3f
        vmov            q8,  q9
        vmov            q10, q11
        vmov            q12, q13
        vld1.16         {d1},  [r8, :64]!
        vld1.16         {d3},  [r9, :64]!
        vld1.16         {d5},  [r3, :64]!
        vld1.32         {q9},  [r6, :128]!
        vld1.32         {q11}, [r7, :128]!
        vld1.32         {q13}, [r2, :128]!
        b               2b

3:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src, const ptrdiff_t stride,
//                                               const int32_t **a, const int16_t **b,
//                                               const int w, const int h);
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]
        ldr             r6,  [sp, #108]
        ldrd            r8,  r9,  [r3]
        ldrd            r10, r11, [r4]
        mov             r7,  #2*FILTER_OUT_STRIDE
        add             r2,  r1,  r2
        add             r7,  r7,  r0
        mov             lr,  r5

1:
        vld1.16         {q0,  q1},  [r10, :128]!
        vld1.16         {q2,  q3},  [r11, :128]!
        vld1.32         {q8,  q9},  [r8, :128]!
        vld1.32         {q11, q12}, [r9, :128]!
        vld1.32         {q10},      [r8, :128]!
        vld1.32         {q13},      [r9, :128]!

2:
        vmov.i16        q14, #5
        vmov.i16        q15, #6
        subs            r5,  r5,  #8
        vext.8          q4,  q0,  q1,  #4   // +1-stride
        vext.8          q5,  q2,  q3,  #4   // +1+stride
        vext.8          q6,  q0,  q1,  #2   // -stride
        vext.8          q7,  q2,  q3,  #2   // +stride
        vadd.i16        q0,  q0,  q4        // -1-stride, +1-stride
        vadd.i16        q5,  q2,  q5        // -1+stride, +1+stride
        vadd.i16        q2,  q6,  q7        // -stride, +stride
        vadd.i16        q0,  q0,  q5

        vext.8          q4,  q8,  q9,  #8   // +1-stride
        vext.8          q5,  q9,  q10, #8
        vext.8          q6,  q11, q12, #8   // +1+stride
        vext.8          q7,  q12, q13, #8
        vmul.i16        q0,  q0,  q14       // * 5
        vmla.i16        q0,  q2,  q15       // * 6
        vadd.i32        q4,  q4,  q8        // -1-stride, +1-stride
        vadd.i32        q5,  q5,  q9
        vadd.i32        q6,  q6,  q11       // -1+stride, +1+stride
        vadd.i32        q7,  q7,  q12
        vadd.i32        q4,  q4,  q6
        vadd.i32        q5,  q5,  q7
        vext.8          q6,  q8,  q9,  #4   // -stride
        vext.8          q7,  q9,  q10, #4
        vext.8          q8,  q11, q12, #4   // +stride
        vext.8          q11, q12, q13, #4

.if \bpc == 8
        vld1.8          {d4},  [r1, :64]!
.else
        vld1.16         {q2},  [r1, :128]!
.endif

        vmov.i32        q14, #5
        vmov.i32        q15, #6

        vadd.i32        q6,  q6,  q8        // -stride, +stride
        vadd.i32        q7,  q7,  q11
        vmul.i32        q4,  q4,  q14       // * 5
        vmla.i32        q4,  q6,  q15       // * 6
        vmul.i32        q5,  q5,  q14       // * 5
        vmla.i32        q5,  q7,  q15       // * 6

.if \bpc == 8
        vmovl.u8        q2,  d4
.endif
        vmlsl.u16       q4,  d0,  d4        // b - a * src
        vmlsl.u16       q5,  d1,  d5        // b - a * src
        vmov            q0,  q1
        vrshrn.i32      d8,  q4,  #9
        vrshrn.i32      d9,  q5,  #9
        vmov            q2,  q3
        vst1.16         {q4},  [r0, :128]!

        ble             3f
        vmov            q8,  q10
        vmov            q11, q13
        vld1.16         {q1},  [r10, :128]!
        vld1.16         {q3},  [r11, :128]!
        vld1.32         {q9,  q10}, [r8, :128]!
        vld1.32         {q12, q13}, [r9, :128]!
        b               2b

3:
        subs            r6,  r6,  #1
        ble             0f
        mov             r5,  lr
        ldrd            r8,  r9,  [r3]
        ldrd            r10, r11, [r4]
        mov             r0,  r7
        mov             r1,  r2

        vld1.32         {q8,  q9},  [r9, :128]!
        vld1.16         {q0,  q1},  [r11, :128]!
        vld1.32         {q10},      [r9, :128]!

        vmov.i16        q12, #5
        vmov.i16        q13, #6

4:
        subs            r5,  r5,  #8
        vext.8          q3,  q0,  q1,  #4   // +1
        vext.8          q2,  q0,  q1,  #2   // 0
        vadd.i16        q0,  q0,  q3        // -1, +1

        vext.8          q4,  q8,  q9,  #4   // 0
        vext.8          q5,  q9,  q10, #4
        vext.8          q6,  q8,  q9,  #8   // +1
        vext.8          q7,  q9,  q10, #8
        vmul.i16        q2,  q2,  q13       // * 6
        vmla.i16        q2,  q0,  q12       // * 5 -> a
.if \bpc == 8
        vld1.8          {d22}, [r1, :64]!
.else
        vld1.16         {q11}, [r1, :128]!
.endif
        vadd.i32        q8,  q8,  q6        // -1, +1
        vadd.i32        q9,  q9,  q7
.if \bpc == 8
        vmovl.u8        q11, d22
.endif
        vmul.i32        q4,  q4,  q15       // * 6
        vmla.i32        q4,  q8,  q14       // * 5 -> b
        vmul.i32        q5,  q5,  q15       // * 6
        vmla.i32        q5,  q9,  q14       // * 5 -> b

        vmlsl.u16       q4,  d4,  d22       // b - a * src
        vmlsl.u16       q5,  d5,  d23
        vmov            q0,  q1
        vrshrn.i32      d8,  q4,  #8
        vrshrn.i32      d9,  q5,  #8
        vmov            q8,  q10
        vst1.16         {q4},  [r0, :128]!

        ble             5f
        vld1.16         {q1},  [r11, :128]!
        vld1.32         {q9,  q10}, [r9, :128]!
        b               4b

5:
0:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_weighted_row1_Xbpc_neon(pixel *dst,
//                                        const int16_t *t1, const int w,
//                                        const int w1, const int bitdepth_max);
function sgr_weighted_row1_\bpc\()bpc_neon, export=1
        push            {lr}
.if \bpc == 16
        ldr             lr,  [sp, #4]
.endif
        vdup.16         d31, r3
.if \bpc == 16
        vmov.i16        q13, #0
        vdup.16         q14, lr
.endif

1:
.if \bpc == 8
        vld1.8          {d0},  [r0, :64]
.else
        vld1.16         {q0},  [r0, :128]
.endif
        vld1.16         {q1},  [r1, :128]!
        subs            r2,  r2,  #8
        vmull.s16       q2,  d2,  d31       // v
        vmull.s16       q3,  d3,  d31       // v
        vrshrn.i32      d4,  q2,  #11
        vrshrn.i32      d5,  q3,  #11
.if \bpc == 8
        vaddw.u8        q2,  q2,  d0
        vqmovun.s16     d2,  q2
        vst1.8          {d2},  [r0, :64]!
.else
        vadd.i16        q2,  q2,  q0
        vmax.s16        q2,  q2,  q13
        vmin.u16        q2,  q2,  q14
        vst1.16         {q2},  [r0, :128]!
.endif
        bgt             1b
0:
        pop             {pc}
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
function sgr_weighted2_\bpc\()bpc_neon, export=1
        push            {r4-r8,lr}
        ldrd            r4,  r5,  [sp, #24]
.if \bpc == 8
        ldr             r6,  [sp, #32]
.else
        ldrd            r6,  r7,  [sp, #32]
.endif
        cmp             r5,  #2
        add             r8,  r0,  r1
        add             r12, r2,  #2*FILTER_OUT_STRIDE
        add             lr,  r3,  #2*FILTER_OUT_STRIDE
        vld2.16         {d30[], d31[]}, [r6] // wt[0], wt[1]
.if \bpc == 16
        vdup.16         q14, r7
.endif
        blt             2f
1:
.if \bpc == 8
        vld1.8          {d0},  [r0, :64]
        vld1.8          {d16}, [r8, :64]
.else
        vld1.16         {q0},  [r0, :128]
        vld1.16         {q8},  [r8, :128]
.endif
        vld1.16         {q1},  [r2, :128]!
        vld1.16         {q9},  [r12, :128]!
        vld1.16         {q2},  [r3, :128]!
        vld1.16         {q10}, [lr, :128]!
        subs            r4,  r4,  #8
        vmull.s16       q3,  d2,  d30       // wt[0] * t1
        vmlal.s16       q3,  d4,  d31       // wt[1] * t2
        vmull.s16       q12, d3,  d30       // wt[0] * t1
        vmlal.s16       q12, d5,  d31       // wt[1] * t2
        vmull.s16       q11, d18, d30       // wt[0] * t1
        vmlal.s16       q11, d20, d31       // wt[1] * t2
        vmull.s16       q13, d19, d30       // wt[0] * t1
        vmlal.s16       q13, d21, d31       // wt[1] * t2
        vrshrn.i32      d6,  q3,  #11
        vrshrn.i32      d7,  q12, #11
        vrshrn.i32      d22, q11, #11
        vrshrn.i32      d23, q13, #11
.if \bpc == 8
        vaddw.u8        q3,  q3,  d0
        vaddw.u8        q11, q11, d16
        vqmovun.s16     d6,  q3
        vqmovun.s16     d22, q11
        vst1.8          {d6},  [r0, :64]!
        vst1.8          {d22}, [r8, :64]!
.else
        vmov.i16        q13, #0
        vadd.i16        q3,  q3,  q0
        vadd.i16        q11, q11, q8
        vmax.s16        q3,  q3,  q13
        vmax.s16        q11, q11, q13
        vmin.u16        q3,  q3,  q14
        vmin.u16        q11, q11, q14
        vst1.16         {q3},  [r0, :128]!
        vst1.16         {q11}, [r8, :128]!
.endif
        bgt             1b
        b               0f

2:
.if \bpc == 8
        vld1.8          {d0},  [r0, :64]
.else
        vld1.16         {q0},  [r0, :128]
.endif
        vld1.16         {q1},  [r2, :128]!
        vld1.16         {q2},  [r3, :128]!
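        // h == 1 tail: only one dst row left; apply the same wt[] weighting,
        // rounding and clipping as in the two-row loop above.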
        subs            r4,  r4,  #8
        vmull.s16       q3,  d2,  d30       // wt[0] * t1
        vmlal.s16       q3,  d4,  d31       // wt[1] * t2
        vmull.s16       q11, d3,  d30       // wt[0] * t1
        vmlal.s16       q11, d5,  d31       // wt[1] * t2
        vrshrn.i32      d6,  q3,  #11
        vrshrn.i32      d7,  q11, #11
.if \bpc == 8
        vaddw.u8        q3,  q3,  d0
        vqmovun.s16     d6,  q3
        vst1.8          {d6},  [r0, :64]!
.else
        vmov.i16        q13, #0
        vadd.i16        q3,  q3,  q0
        vmax.s16        q3,  q3,  q13
        vmin.u16        q3,  q3,  q14
        vst1.16         {q3},  [r0, :128]!
.endif
        bgt             2b
0:
        pop             {r4-r8,pc}
endfunc
.endm
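
// For orientation, a plain-C sketch of the per-pixel math the functions in
// this macro compute (reference only; a[]/b[] below stand for the row
// pointers passed in, x is the output column, and the variable names are
// illustrative, not part of any exported API):
//
//   // sgr_finish_filter_row1: 3x3 box, centre cross weighted 4 (the
//   // vshl #2 above), corners weighted 3; "a" is built from the int16_t
//   // **b rows, "b" from the int32_t **a rows.
//   int va = 4 * (b[1][x-1] + b[1][x] + b[1][x+1] + b[0][x] + b[2][x])
//          + 3 * (b[0][x-1] + b[0][x+1] + b[2][x-1] + b[2][x+1]);
//   int vb = 4 * (a[1][x-1] + a[1][x] + a[1][x+1] + a[0][x] + a[2][x])
//          + 3 * (a[0][x-1] + a[0][x+1] + a[2][x-1] + a[2][x+1]);
//   tmp[x] = (vb - va * src[x] + (1 << 8)) >> 9;         // vrshrn.i32 #9
//
//   // sgr_finish_filter2_2rows, first output row: the two a/b rows bracket
//   // the output line (-stride/+stride in the comments above); horizontal
//   // neighbours weighted 5, vertical ones 6.
//   int va = 5 * (b[0][x-1] + b[0][x+1] + b[1][x-1] + b[1][x+1])
//          + 6 * (b[0][x] + b[1][x]);
//   int vb = 5 * (a[0][x-1] + a[0][x+1] + a[1][x-1] + a[1][x+1])
//          + 6 * (a[0][x] + a[1][x]);
//   tmp[x] = (vb - va * src[x] + (1 << 8)) >> 9;
//
//   // second output row (the loop at 4: above): only the a[1]/b[1] row
//   // contributes; src2 stands for the next src row, one stride below.
//   int va = 5 * (b[1][x-1] + b[1][x+1]) + 6 * b[1][x];
//   int vb = 5 * (a[1][x-1] + a[1][x+1]) + 6 * a[1][x];
//   tmp[FILTER_OUT_STRIDE + x] = (vb - va * src2[x] + (1 << 7)) >> 8;
//
//   // sgr_weighted_row1 / sgr_weighted2: add the weighted filter output to
//   // dst and clip to the pixel range.
//   dst[x] = clip(dst[x] + ((w1 * t1[x] + (1 << 10)) >> 11));
//   dst[x] = clip(dst[x] + ((wt[0] * t1[x] + wt[1] * t2[x] + (1 << 10)) >> 11));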