refmvs.S (23923B)
1 /* 2 * Copyright © 2021, VideoLAN and dav1d authors 3 * Copyright © 2021, Martin Storsjo 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/arm/asm-offsets.h"
#include "src/arm/asm.S"
#include "util.S"

#define INVALID_MV 0x80008000

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
//
// Replicates the 12-byte refmvs_block *rmv across bw4 4-pixel units in each
// of bh4 rows of rr. v0/v1/v2 are built so that, stored back to back, they
// contain the 12-byte pattern repeated over 48 bytes (4 blocks). Dispatch on
// bw4 is done via a computed branch (br x3) into the store ladder below;
// wider cases fall through into the narrower ones.
function splat_mv_neon, export=1
        ld1             {v3.16b}, [x1]                  // load refmvs_block (12 bytes + 4 garbage)
        clz             w3, w3                          // index jump table by leading zeros of bw4
        movrel          x5, splat_tbl
        sub             w3, w3, #26                     // bw4 in {1,2,4,8,16,32} -> clz in {31..26} -> 0-based index
        ext             v2.16b, v3.16b, v3.16b, #12
        ldrsw           x3, [x5, w3, uxtw #2]           // fetch relative jump table entry
        add             w2, w2, w2, lsl #1              // bx4 * 3
        ext             v0.16b, v2.16b, v3.16b, #4      // pattern bytes [0..15]
        add             x3, x5, x3                      // resolve branch target
        ext             v1.16b, v2.16b, v3.16b, #8      // pattern bytes [16..31]
        lsl             w2, w2, #2                      // byte offset = bx4 * 12 = bx4 * sizeof(refmvs_block)
        ext             v2.16b, v2.16b, v3.16b, #12     // pattern bytes [32..47]
1:      // per-row loop: fetch next row pointer, offset by bx4, store pattern
        ldr             x1, [x0], #8                    // *rr++
        subs            w4, w4, #1                      // bh4--
        add             x1, x1, x2                      // &rr[row][bx4]
        br              x3

10:     // bw4 == 1: one 12-byte block (8 + 4 bytes)
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2, [x1, #8]
        b.gt            1b
        ret
20:     // bw4 == 2: two blocks (16 + 8 bytes)
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1, [x1, #16]
        b.gt            1b
        ret
320:    // bw4 == 32: 8 extra 48-byte groups, then fall through
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:    // bw4 == 16: 2 extra groups, then fall through
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:     // bw4 == 8: 1 extra group, then fall through
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:     // bw4 == 4: final 48-byte group
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret
endfunc

// Branch offsets indexed by clz(bw4) - 26, i.e. widest first.
jumptable splat_tbl
        .word 320b - splat_tbl
        .word 160b - splat_tbl
        .word 80b - splat_tbl
        .word 40b - splat_tbl
        .word 20b - splat_tbl
        .word 10b - splat_tbl
endjumptable

// Byte-permutation tables for TBL, selected per mv/ref validity case in
// save_tmvs_neon; 255 yields zero bytes.
const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

// Multipliers {1, 2} used to fold the two per-mv conditions into a case index.
const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void
// dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                      refmvs_block **rr, const uint8_t *ref_sign,
//                      int col_end8, int row_end8,
//                      int col_start8, int row_start8)
//
// Condenses rows of refmvs_block (12 bytes each) into refmvs_temporal_block
// (5 bytes each, hence the recurring *5 scaling). Two candidate blocks are
// processed per iteration where possible. The per-block store routines
// (10:/20:/40:/80:/160: below) are reached via blr and must preserve the
// flags set by the preceding cmp in the main loop.
function save_tmvs_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b, #0
        ld1             {v31.8b}, [x3]
        movrel          x8, save_tmvs_tbl
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b, v30.8b, v31.8b, #7      // [0, ref_sign]
        mov             w15, #5
        mov             w14, #12*2
        sxtw            x4, w4
        sxtw            x6, w6
        mul             w1, w1, w15                     // stride *= 5
        sub             w5, w5, w7                      // h = row_end8 - row_start8
        lsl             w7, w7, #1                      // row_start8 <<= 1
1:      // row loop
        mov             w15, #5
        and             w9, w7, #30                     // (y & 15) * 2
        ldr             x9, [x2, w9, uxtw #3]           // b = rr[(y & 15) * 2]
        add             x9, x9, #12                     // &b[... + 1]
        madd            x10, x4, x14, x9                // end_cand_b = &b[col_end8*2 + 1]
        madd            x9, x6, x14, x9                 // cand_b = &b[x*2 + 1]

        madd            x3, x6, x15, x0                 // &rp[x]

2:      // candidate loop: load up to two candidate blocks per iteration
        ldrb            w11, [x9, #10]                  // cand_b->bs
        ld1             {v0.16b}, [x9]                  // cand_b->mv
        add             x11, x8, w11, uxtw #3           // save_tmvs_tbl entry for this bs
        ldr             h1, [x9, #8]                    // cand_b->ref
        ldr             w12, [x11]                      // bw8
        mov             x15, x8                         // default 2nd entry (bw8 = 16*12) if no 2nd block
        add             x9, x9, w12, uxtw #1            // cand_b += bw8*2
        cmp             x9, x10
        mov             v2.8b, v0.8b
        b.ge            3f                              // no second candidate in this row

        ldrb            w15, [x9, #10]                  // cand_b->bs
        add             x16, x9, #8
        ld1             {v4.16b}, [x9]                  // cand_b->mv
        add             x15, x8, w15, uxtw #3
        ld1             {v1.h}[1], [x16]                // cand_b->ref
        ldr             w12, [x15]                      // bw8
        add             x9, x9, w12, uxtw #1            // cand_b += bw8*2
        trn1            v2.2d, v0.2d, v4.2d             // mv[0] of both candidates, for the range test

3:      // decide, per candidate, which mv (if any) is saved
        abs             v2.8h, v2.8h                    // abs(mv[].xy)
        tbl             v1.8b, {v31.16b}, v1.8b         // ref_sign[ref]
        ushr            v2.8h, v2.8h, #12               // abs(mv[].xy) >> 12
        umull           v1.8h, v1.8b, v29.8b            // ref_sign[ref] * {1, 2}
        cmeq            v2.4s, v2.4s, #0                // abs(mv[].xy) <= 4096
        xtn             v2.4h, v2.4s                    // abs() condition to 16 bit
        and             v1.8b, v1.8b, v2.8b             // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h, v1.4h, v1.4h             // Combine condition for [1] and [0]
        umov            w16, v1.h[0]                    // Extract case for first block
        umov            w17, v1.h[1]
        ldrsw           x11, [x11, #4]                  // Fetch jump table entry
        ldrsw           x15, [x15, #4]
        ldr             q1, [x13, w16, uxtw #4]         // Load permutation table based on case
        ldr             q5, [x13, w17, uxtw #4]
        add             x11, x8, x11                    // Find jump table target
        add             x15, x8, x15
        tbl             v0.16b, {v0.16b}, v1.16b        // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v5.16b, v4.16b, v4.16b, #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b, v0.16b, v1.16b, #4
        ext             v6.16b, v4.16b, v5.16b, #4

        blr             x11                             // store first candidate (flags preserved)
        b.ge            4f                              // if (cand_b >= end)
        mov             v0.16b, v4.16b                  // promote second candidate to first
        mov             v1.16b, v5.16b
        mov             v2.16b, v6.16b
        cmp             x9, x10
        blr             x15                             // store second candidate
        b.lt            2b                              // if (cand_b < end)

4:
        subs            w5, w5, #1                      // h--
        add             w7, w7, #2                      // y += 2
        add             x0, x0, x1                      // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

// Store stubs: write N output entries (5 bytes each) and advance x3.
// They must not touch the condition flags.
10:     // 1 entry
        AARCH64_VALID_CALL_TARGET
        add             x16, x3, #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3, x3, #5
        ret
20:     // 2 entries
        AARCH64_VALID_CALL_TARGET
        add             x16, x3, #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3, x3, #2*5
        ret
40:     // 4 entries
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1, [x3, #16]
        add             x3, x3, #4*5
        ret
80:     // 8 entries
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
        stur            q2, [x3, #(8*5-16)]
        add             x3, x3, #8*5
        ret
160:    // 16 entries
        AARCH64_VALID_CALL_TARGET
        add             x16, x3, #6*5
        add             x17, x3, #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry) after the first 12
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2, [x3, #(16*5-16)]
        add             x3, x3, #16*5
        ret
endfunc

// Pairs of [bw8 scaled by 12, store-stub offset], indexed by block size (bs).
jumptable save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
endjumptable

// void dav1d_load_tmvs_neon(const refmvs_frame *const rf, int tile_row_idx,
//                           const int col_start8, const int col_end8,
//                           const int row_start8, int row_end8)
//
// Pass 1 fills the rp_proj region with [INVALID_MV, 0] entries; pass 2
// (nloop) projects motion vectors from up to rf->n_mfmvs reference frames
// into rp_proj. Note x29 is saved and then reused as a data pointer
// (&rf->mfmv_ref2cur) until the restore at 11: below.
function load_tmvs_neon, export=1
        rf          .req x0
        tile_row_idx .req w1
        col_start8  .req w2
        col_end8    .req w3
        row_start8  .req w4
        row_end8    .req w5
        col_start8i .req w6
        col_end8i   .req w7
        rp_proj     .req x8
        stride5     .req x9
        wstride5    .req w9
        stp             x28, x27, [sp, #-96]!
        stp             x26, x25, [sp, #16]
        stp             x24, x23, [sp, #32]
        stp             x22, x21, [sp, #48]
        stp             x20, x19, [sp, #64]
        stp             x29, x30, [sp, #80]

        ldr             w15, [rf, #RMVSF_N_TILE_THREADS]
        ldp             w16, w17, [rf, #RMVSF_IW8]      // include rf->ih8 too
        sub             col_start8i, col_start8, #8     // col_start8 - 8
        add             col_end8i, col_end8, #8         // col_end8 + 8
        ldr             wstride5, [rf, #RMVSF_RP_STRIDE]
        ldr             rp_proj, [rf, #RMVSF_RP_PROJ]

        cmp             w15, #1
        csel            tile_row_idx, wzr, tile_row_idx, eq // if (rf->n_tile_threads == 1) tile_row_idx = 0

        bic             col_start8i, col_start8i, col_start8i, asr #31 // imax(col_start8 - 8, 0)
        cmp             col_end8i, w16
        csel            col_end8i, col_end8i, w16, lt   // imin(col_end8 + 8, rf->iw8)

        lsl             tile_row_idx, tile_row_idx, #4  // 16 * tile_row_idx

        cmp             row_end8, w17
        csel            row_end8, row_end8, w17, lt     // imin(row_end8, rf->ih8)

        add             wstride5, wstride5, wstride5, lsl #2 // stride * sizeof(refmvs_temporal_block)
        and             w15, row_start8, #15            // row_start8 & 15
        add             w10, col_start8, col_start8, lsl #2 // col_start8 * sizeof(refmvs_temporal_block)
        smaddl          rp_proj, tile_row_idx, wstride5, rp_proj // &rf->rp_proj[16 * stride * tile_row_idx]
        smaddl          x10, w15, wstride5, x10         // ((row_start8 & 15) * stride + col_start8) * sizeof(refmvs_temporal_block)
        mov             w15, #INVALID_MV
        sub             w11, col_end8, col_start8       // xfill loop count
        add             x10, x10, rp_proj               // &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride + col_start8]
        add             x15, x15, x15, lsl #40          // first 64b of 4 [INVALID_MV, 0] patterns
        mov             w17, #(INVALID_MV >> 8)         // last 32b of 4 patterns
        sub             w12, row_end8, row_start8       // yfill loop count
        ror             x16, x15, #48                   // second 64b of 4 patterns
        ldr             w19, [rf, #RMVSF_N_MFMVS]

5:      // yfill loop
        and             w13, w11, #-4                   // xfill 4x count by patterns
        mov             x14, x10                        // fill_ptr = row_ptr
        add             x10, x10, stride5               // row_ptr += stride
        sub             w12, w12, #1                    // y--

        cbz             w13, 3f

4:      // xfill loop 4x
        sub             w13, w13, #4                    // xfill 4x count -= 4
        stp             x15, x16, [x14]
        str             w17, [x14, #16]
        add             x14, x14, #20                   // fill_ptr += 4 * sizeof(refmvs_temporal_block)
        cbnz            w13, 4b

3:      // up to 3 residuals
        tbz             w11, #1, 1f
        str             x15, [x14]
        strh            w16, [x14, #8]
        add             x14, x14, #10                   // fill_ptr += 2 * sizeof(refmvs_temporal_block)

1:      // up to 1 residual
        tbz             w11, #0, 2f
        str             w15, [x14]
2:
        cbnz            w12, 5b                         // yfill loop

        cbz             w19, 11f                        // if (!rf->n_mfmvs) skip nloop

        add             x29, rf, #RMVSF_MFMV_REF2CUR    // x29 repurposed until restore at 11:
        mov             w10, #0                         // n = 0
        movi            v3.2s, #255                     // 0x3FFF >> 6, for MV clamp
        movrel          x1, div_mult_tbl

10:     // nloop: one iteration per motion-field reference frame
        ldrsb           w16, [x29, x10]                 // ref2cur = rf->mfmv_ref2cur[n]
        cmp             w16, #-32
        b.eq            9f                              // if (ref2cur == INVALID_REF2CUR) continue

        add             x17, x10, #(RMVSF_MFMV_REF - RMVSF_MFMV_REF2CUR) // n - (&rf->mfmv_ref - &rf->mfmv_ref2cur)
        mov             x20, #4
        ldrb            w17, [x29, x17]                 // ref = rf->mfmv_ref[n]
        ldr             x13, [x29, #(RMVSF_RP_REF - RMVSF_MFMV_REF2CUR)]
        sub             x21, x10, x10, lsl #3           // -(n * 7)
        smaddl          x20, row_start8, wstride5, x20  // row_start8 * stride * sizeof(refmvs_temporal_block) + 4
        mov             w12, row_start8                 // y = row_start8
        add             x28, x29, #(RMVSF_MFMV_REF2REF - RMVSF_MFMV_REF2CUR - 1) // &rf->mfmv_ref2ref - 1
        ldr             x13, [x13, x17, lsl #3]         // rf->rp_ref[ref]
        sub             x28, x28, x21                   // rf->mfmv_ref2ref[n] - 1
        sub             w17, w17, #4                    // ref_sign = ref - 4
        add             x13, x13, x20                   // r = &rf->rp_ref[ref][row_start8 * stride].ref
        dup             v0.2s, w17                      // ref_sign

5:      // yloop
        and             w14, w12, #-8                   // y_sb_align = y & ~7
        mov             w11, col_start8i                // x = col_start8i
        add             w15, w14, #8                    // y_sb_align + 8
        cmp             w14, row_start8
        csel            w14, w14, row_start8, gt        // imax(y_sb_align, row_start8)
        cmp             w15, row_end8
        csel            w15, w15, row_end8, lt          // imin(y_sb_align + 8, row_end8)

4:      // xloop
        add             x23, x13, x11, lsl #2           // partial &r[x] address
        ldrb            w22, [x23, x11]                 // b_ref = rb->ref
        cbz             w22, 6f                         // if (!b_ref) continue

        ldrb            w24, [x28, x22]                 // ref2ref = rf->mfmv_ref2ref[n][b_ref - 1]
        cbz             w24, 6f                         // if (!ref2ref) continue

        ldrh            w20, [x1, x24, lsl #1]          // div_mult[ref2ref]
        add             x23, x23, x11                   // &r[x]
        mul             w20, w20, w16                   // frac = ref2cur * div_mult[ref2ref]

        ldur            s1, [x23, #-4]                  // mv{y, x} = rb->mv
        fmov            s2, w20                         // frac
        sxtl            v1.4s, v1.4h
        mul             v1.2s, v1.2s, v2.s[0]           // offset{y, x} = frac * mv{y, x}

        ssra            v1.2s, v1.2s, #31               // offset{y, x} + (offset{y, x} >> 31)
        ldur            w25, [x23, #-4]                 // b_mv = rb->mv
        srshr           v1.2s, v1.2s, #14               // (offset{y, x} + (offset{y, x} >> 31) + 8192) >> 14

        abs             v2.2s, v1.2s                    // abs(offset{y, x})
        eor             v1.8b, v1.8b, v0.8b             // offset{y, x} ^ ref_sign

        sshr            v2.2s, v2.2s, #6                // abs(offset{y, x}) >> 6
        cmlt            v1.2s, v1.2s, #0                // sign(offset{y, x} ^ ref_sign): -1 or 0
        umin            v2.2s, v2.2s, v3.2s             // iclip(abs(offset{y, x}) >> 6, 0, 0x3FFF >> 6)

        neg             v4.2s, v2.2s
        bsl             v1.8b, v4.8b, v2.8b             // apply_sign(iclip(abs(offset{y, x}) >> 6, 0, 0x3FFF >> 6))
        fmov            x20, d1                         // offset{y, x}

        add             w21, w12, w20                   // pos_y = y + offset.y
        cmp             w21, w14                        // pos_y >= y_proj_start
        b.lt            1f
        cmp             w21, w15                        // pos_y < y_proj_end
        b.ge            1f
        add             x26, x11, x20, asr #32          // pos_x = x + offset.x
        and             w27, w21, #15                   // pos_y & 15
        add             x21, x26, x26, lsl #2           // pos_x * sizeof(refmvs_temporal_block)
        umaddl          x27, w27, wstride5, rp_proj     // &rp_proj[(pos_y & 15) * stride]
        add             x27, x27, x21                   // &rp_proj[(pos_y & 15) * stride + pos_x]

3:      // copy loop: write projected mv while following blocks match
        and             w20, w11, #-8                   // x_sb_align = x & ~7
        sub             w21, w20, #8                    // x_sb_align - 8
        cmp             w21, col_start8
        csel            w21, w21, col_start8, gt        // imax(x_sb_align - 8, col_start8)
        cmp             w26, w21                        // pos_x >= imax(x_sb_align - 8, col_start8)
        b.lt            2f
        add             w20, w20, #16                   // x_sb_align + 16
        cmp             w20, col_end8
        csel            w20, w20, col_end8, lt          // imin(x_sb_align + 16, col_end8)
        cmp             w26, w20                        // pos_x < imin(x_sb_align + 16, col_end8)
        b.ge            2f
        str             w25, [x27]                      // rp_proj[pos + pos_x].mv = rb->mv (b_mv)
        strb            w24, [x27, #4]                  // rp_proj[pos + pos_x].ref = ref2ref

2:      // search part of copy loop
        add             w11, w11, #1                    // x++
        cmp             w11, col_end8i                  // if (++x >= col_end8i) break xloop
        b.ge            8f

        ldrb            w20, [x23, #5]!                 // rb++; rb->ref
        cmp             w20, w22                        // if (rb->ref != b_ref) break
        b.ne            7f

        ldur            w21, [x23, #-4]                 // rb->mv.n
        cmp             w21, w25                        // if (rb->mv.n != b_mv.n) break
        b.ne            7f

        add             w26, w26, #1                    // pos_x++
        add             x27, x27, #5                    // advance &rp_proj[(pos_y & 15) * stride + pos_x]
        b               3b                              // copy loop

1:      // search loop: pos_y out of range, just skip identical blocks
        add             w11, w11, #1                    // x++
        cmp             w11, col_end8i                  // if (++x >= col_end8i) break xloop
        b.ge            8f

        ldrb            w20, [x23, #5]!                 // rb++; rb->ref
        cmp             w20, w22                        // if (rb->ref != b_ref) break
        b.ne            7f

        ldur            w21, [x23, #-4]                 // rb->mv.n
        cmp             w21, w25                        // if (rb->mv.n == b_mv.n) continue
        b.eq            1b                              // search loop
7:
        cmp             w11, col_end8i                  // x < col_end8i
        b.lt            4b                              // xloop

6:      // continue case of xloop
        add             w11, w11, #1                    // x++
        cmp             w11, col_end8i                  // x < col_end8i
        b.lt            4b                              // xloop
8:
        add             w12, w12, #1                    // y++
        add             x13, x13, stride5               // r += stride
        cmp             w12, row_end8                   // y < row_end8
        b.lt            5b                              // yloop
9:
        add             w10, w10, #1                    // n++
        cmp             w10, w19                        // n < rf->n_mfmvs
        b.lt            10b                             // nloop
11:
        ldp             x29, x30, [sp, #80]
        ldp             x20, x19, [sp, #64]
        ldp             x22, x21, [sp, #48]
        ldp             x24, x23, [sp, #32]
        ldp             x26, x25, [sp, #16]
        ldp             x28, x27, [sp], #96
        ret
        .unreq          rf
        .unreq          tile_row_idx
        .unreq          col_start8
        .unreq          col_end8
        .unreq          row_start8
        .unreq          row_end8
        .unreq          col_start8i
        .unreq          col_end8i
        .unreq          rp_proj
        .unreq          stride5
        .unreq          wstride5
endfunc

// div_mult_tbl[i] ~= 16384 / i (i = 1..31); used to scale mvs by the
// ratio of temporal distances.
const div_mult_tbl
        .hword          0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
        .hword          2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
        .hword          1024, 963, 910, 862, 819, 780, 744, 712
        .hword          682, 655, 630, 606, 585, 564, 546, 528
endconst