/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                       const int bx4, const int bw4, int bh4)
*/

function splat_mv_lsx
    vld          vr0, a1, 0         // 0 1 ... 11 ...
    clz.w        t4, a3
    vaddi.bu     vr1, vr0, 0
    addi.w       t4, t4, -26
    vextrins.w   vr1, vr0, 0x30     // 0 1 2 ... 11 0 1 2 3
    la.local     t5, .SPLAT_LSX_JRTABLE
    vbsrl.v      vr2, vr1, 4        // 4 5 6 7 ... 11 0 1 2 3 0 0 0 0
    alsl.d       t6, t4, t5, 1
    vextrins.w   vr2, vr0, 0x31     // 4 5 6 7 ... 11 0 1 2 3 4 5 6 7
    ld.h         t7, t6, 0
    vbsrl.v      vr3, vr2, 4        // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d        t8, t5, t7
    alsl.d       a2, a2, a2, 1
    vextrins.w   vr3, vr0, 0x32     // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w       a2, a2, 2          // bx4 * 12 == byte offset into each row
    jirl         $r0, t8, 0

.SPLAT_LSX_JRTABLE:
    .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W8_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W4_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W2_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W1_LSX  - .SPLAT_LSX_JRTABLE
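/*
 * For reference, the scalar function being replaced is roughly (a sketch,
 * not the verbatim dav1d source):
 *
 *     do {
 *         refmvs_block *const r = *rr++ + bx4;
 *         for (int x = 0; x < bw4; x++)
 *             r[x] = *rmv;
 *     } while (--bh4);
 *
 * bw4 is a power of two in [1, 32], so clz.w(bw4) - 26 maps
 * bw4 = 32, 16, 8, 4, 2, 1 to jump table indices 0..5.  vr1/vr2/vr3 hold
 * the 12-byte refmvs_block replicated across 48 bytes, so the loops below
 * can use full-width vector stores despite the 12-byte element size.
 */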
.SPLAT_W1_LSX:
    ld.d         t3, a0, 0
    addi.d       a0, a0, 8
    addi.d       a4, a4, -1
    add.d        t3, t3, a2

    fst.d        f1, t3, 0
    fst.s        f3, t3, 8
    blt          zero, a4, .SPLAT_W1_LSX
    b            .splat_end

.SPLAT_W2_LSX:
    ld.d         t3, a0, 0
    addi.d       a0, a0, 8
    addi.d       a4, a4, -1
    add.d        t3, t3, a2

    vst          vr1, t3, 0
    fst.d        f2, t3, 16
    blt          zero, a4, .SPLAT_W2_LSX
    b            .splat_end

.SPLAT_W4_LSX:
    ld.d         t3, a0, 0
    addi.d       a0, a0, 8
    addi.d       a4, a4, -1
    add.d        t3, t3, a2

    vst          vr1, t3, 0
    vst          vr2, t3, 16
    vst          vr3, t3, 32
    blt          zero, a4, .SPLAT_W4_LSX
    b            .splat_end

.SPLAT_W8_LSX:
    ld.d         t3, a0, 0
    addi.d       a0, a0, 8
    addi.d       a4, a4, -1
    add.d        t3, t3, a2

    vst          vr1, t3, 0
    vst          vr2, t3, 16
    vst          vr3, t3, 32

    vst          vr1, t3, 48
    vst          vr2, t3, 64
    vst          vr3, t3, 80
    blt          zero, a4, .SPLAT_W8_LSX
    b            .splat_end

.SPLAT_W16_LSX:
    ld.d         t3, a0, 0
    addi.d       a0, a0, 8
    addi.d       a4, a4, -1
    add.d        t3, t3, a2

.rept 2
    vst          vr1, t3, 0
    vst          vr2, t3, 16
    vst          vr3, t3, 32

    vst          vr1, t3, 48
    vst          vr2, t3, 64
    vst          vr3, t3, 80

    addi.d       t3, t3, 96
.endr

    blt          zero, a4, .SPLAT_W16_LSX
    b            .splat_end

.SPLAT_W32_LSX:
    ld.d         t3, a0, 0
    addi.d       a0, a0, 8
    addi.d       a4, a4, -1
    add.d        t3, t3, a2

.rept 4
    vst          vr1, t3, 0
    vst          vr2, t3, 16
    vst          vr3, t3, 32

    vst          vr1, t3, 48
    vst          vr2, t3, 64
    vst          vr3, t3, 80

    addi.d       t3, t3, 96
.endr

    blt          zero, a4, .SPLAT_W32_LSX

.splat_end:
endfunc
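/*
 * Fixed-point reciprocals: entry n is 16384 / n (0 for n == 0), the same
 * values as the div_mult[] table in the C code.  load_tmvs uses it to turn
 * the MV projection divide into a multiply and shift, roughly
 * mv * ref2cur * (16384 / ref2ref), rounded and shifted right by 14.
 */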
const la_div_mult
    .short     0, 16384,  8192,  5461,  4096,  3276,  2730,  2340
    .short  2048,  1820,  1638,  1489,  1365,  1260,  1170,  1092
    .short  1024,   963,   910,   862,   819,   780,   744,   712
    .short   682,   655,   630,   606,   585,   564,   546,   528
endconst

/*
 * Fill col_w temporal blocks per row with the invalid-MV pattern in t7.
 * temp reg: a6 a7
 */
.macro LOAD_SET_LOOP is_odd
    slli.d       a6, t6, 2
    add.d        a6, a6, t6         // col_w * 5
0:
    addi.d       a7, zero, 0        // x
.if \is_odd
    stx.w        t7, t3, a7
    addi.d       a7, a7, 5
    bge          a7, a6, 2f
.endif

1:
    stx.w        t7, t3, a7
    addi.d       a7, a7, 5
    stx.w        t7, t3, a7
    addi.d       a7, a7, 5
    blt          a7, a6, 1b
2:
    add.d        t3, t3, t2
    addi.d       t5, t5, 1
    blt          t5, a5, 0b
.endm

/*
 * static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
 *                         const int col_start8, const int col_end8,
 *                         const int row_start8, int row_end8)
 */
function load_tmvs_lsx
    addi.d       sp, sp, -80
    st.d         s0, sp, 0
    st.d         s1, sp, 8
    st.d         s2, sp, 16
    st.d         s3, sp, 24
    st.d         s4, sp, 32
    st.d         s5, sp, 40
    st.d         s6, sp, 48
    st.d         s7, sp, 56
    st.d         s8, sp, 64

    vld          vr16, a0, 16
    vld          vr0, a0, 48        // rf->mfmv_ref, rf->mfmv_ref2cur
    ld.w         s8, a0, 80         // [0] - rf->n_mfmvs
    vld          vr17, a0, 96       // [0] - rp_ref | [1] - rp_proj
    ld.d         t1, a0, 112        // stride
    ld.w         t0, a0, 128
    addi.w       t0, t0, -1
    bnez         t0, 1f
    addi.w       a1, zero, 0
1:
    addi.d       t0, a3, 8
    vinsgr2vr.w  vr1, t0, 0
    vinsgr2vr.w  vr1, a5, 1
    vmin.w       vr1, vr1, vr16     // [0] col_end8i [1] row_end8
    addi.d       t0, a2, -8
    bge          t0, zero, 2f
    addi.w       t0, zero, 0        // t0 col_start8i
2:
    vpickve2gr.d t4, vr17, 1        // rf->rp_proj
    slli.d       t2, t1, 2
    add.d        t2, t2, t1         // stride * 5
    slli.d       a1, a1, 4          // tile_row_idx * 16
    andi         t3, a4, 0xf
    add.d        t3, t3, a1         // tile_row_idx * 16 + (row_start8 & 15)
    mul.w        t3, t3, t2
    mul.w        t8, a1, t2
    vpickve2gr.w a5, vr1, 1
    addi.d       t5, a4, 0
    sub.d        t6, a3, a2         // col_end8 - col_start8
    li.w         t7, 0x80008000     // INVALID_MV
    slli.d       a7, a2, 2
    add.d        t3, t3, a2
    add.d        t3, t3, a7
    add.d        t3, t3, t4         // rp_proj
    andi         a6, t6, 1
    bnez         a6, 3f
    LOAD_SET_LOOP 0
    b            4f
3:
    LOAD_SET_LOOP 1
4:
    addi.d       a6, zero, 0        // n
    bge          a6, s8, .end_load
    add.d        t3, t8, t4         // rp_proj
    mul.w        t6, a4, t2
    addi.d       s7, zero, 40
    vpickve2gr.w t1, vr1, 0         // col_end8i
    addi.d       t5, a0, 58         // rf->mfmv_ref2ref - 1
    la.local     t8, la_div_mult
    vld          vr6, t8, 0
    vld          vr7, t8, 16
    vld          vr8, t8, 32
    vld          vr9, t8, 48
    li.w         t8, 0x3fff
    vreplgr2vr.h vr21, t8
    vxor.v       vr18, vr18, vr18   // zero
    vsub.h       vr20, vr18, vr21
    vpickev.b    vr12, vr7, vr6
    vpickod.b    vr13, vr7, vr6
    vpickev.b    vr14, vr9, vr8
    vpickod.b    vr15, vr9, vr8
    vpickve2gr.d s6, vr17, 0        // rf->rp_ref
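/*
 * Projection loop over the n_mfmvs temporal references.  A reference is
 * skipped entirely when its ref2cur is INVALID_REF2CUR; otherwise every
 * 8x8 block row in [row_start8, row_end8) is walked, each candidate MV is
 * scaled by ref2cur/ref2ref via the reciprocal table above, and the result
 * is scattered into rp_proj when the projected position falls inside the
 * allowed tile/superblock window.  The x loop is unrolled three times and
 * uses vseq.b/cto.d to detect runs of identical temporal blocks so the
 * projection is not recomputed for them.
 */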
5:
    vld          vr10, t5, 0        // ref2ref [1...7]
    vpickve2gr.b t8, vr0, 8         // ref2cur
    vbsrl.v      vr0, vr0, 1
    addi.w       t4, t8, 32
    beqz         t4, 8f             // INVALID_REF2CUR

    vreplgr2vr.h vr23, t8
    vshuf.b      vr6, vr14, vr12, vr10
    vshuf.b      vr7, vr15, vr13, vr10
    vilvl.b      vr8, vr7, vr6
    vmulwev.w.h  vr6, vr8, vr23
    vmulwod.w.h  vr7, vr8, vr23

    vpickve2gr.b s0, vr0, 4         // ref
    slli.d       t8, s0, 3
    ldx.d        s1, s6, t8         // rf->rp_ref[ref]
    addi.d       s0, s0, -4         // ref_sign
    vreplgr2vr.h vr19, s0
    add.d        s1, s1, t6         // &rf->rp_ref[ref][row_start8 * stride]
    addi.d       s2, a4, 0          // y
    vilvl.w      vr8, vr7, vr6
    vilvh.w      vr9, vr7, vr6
6:  // for (int y = row_start8;
    andi         s3, s2, 0xff8

    addi.d       s4, s3, 8
    blt          a4, s3, 0f
    addi.d       s3, a4, 0          // y_proj_start
0:
    blt          s4, a5, 0f
    addi.d       s4, a5, 0          // y_proj_end
0:
    addi.d       s5, t0, 0          // x
7:  // for (int x = col_start8i;
    slli.d       a7, s5, 2
    add.d        a7, a7, s5
    add.d        a7, s1, a7         // rb
    vld          vr3, a7, 0         // [rb]
    vpickve2gr.b t4, vr3, 4         // b_ref
    beqz         t4, .end_x
    vreplve.b    vr11, vr10, t4
    vpickve2gr.b t7, vr11, 4        // ref2ref
    beqz         t7, .end_x
    vsllwil.w.h  vr4, vr3, 0
    vreplgr2vr.w vr6, t4
    vshuf.w      vr6, vr9, vr8      // frac
    vmul.w       vr5, vr6, vr4
    vsrai.w      vr4, vr5, 31
    vadd.w       vr4, vr4, vr5
    vssrarni.h.w vr4, vr4, 14
    vclip.h      vr4, vr4, vr20, vr21 // offset
    vxor.v       vr5, vr4, vr19     // offset.x ^ ref_sign
    vori.b       vr5, vr5, 0x1      // offset.x ^ ref_sign
    vabsd.h      vr4, vr4, vr18
    vsrli.h      vr4, vr4, 6        // abs(offset.x) >> 6
    vsigncov.h   vr4, vr5, vr4      // apply_sign
    vpickve2gr.h s0, vr4, 0
    add.d        s0, s2, s0         // pos_y
    blt          s0, s3, .n_posy
    bge          s0, s4, .n_posy
    andi         s0, s0, 0xf
    mul.w        s0, s0, t2         // pos
    vpickve2gr.h t7, vr4, 1
    add.d        t7, t7, s5         // pos_x
    add.d        s0, t3, s0         // rp_proj + pos

.loop_posx:
    andi         t4, s5, 0xff8      // x_sb_align

    blt          t7, a2, .n_posx
    addi.d       t8, t4, -8
    blt          t7, t8, .n_posx

    bge          t7, a3, .n_posx
    addi.d       t4, t4, 16
    bge          t7, t4, .n_posx

    slli.d       t4, t7, 2
    add.d        t4, t4, t7         // pos_x * 5
    add.d        t4, s0, t4         // rp_proj[pos + pos_x]
    vstelm.w     vr3, t4, 0, 0
    vstelm.b     vr11, t4, 4, 4

.n_posx:
    addi.d       s5, s5, 1          // x + 1
    bge          s5, t1, .ret_posx
    addi.d       a7, a7, 5          // rb + 1
    vld          vr4, a7, 0         // [rb]
    vseq.b       vr5, vr4, vr3

    vpickve2gr.d t8, vr5, 0
    cto.d        t8, t8
    blt          t8, s7, 7b

    addi.d       t7, t7, 1          // pos_x + 1

    /* Core computing loop, second unrolled iteration */
    andi         t4, s5, 0xff8      // x_sb_align

    blt          t7, a2, .n_posx
    addi.d       t8, t4, -8
    blt          t7, t8, .n_posx

    bge          t7, a3, .n_posx
    addi.d       t4, t4, 16
    bge          t7, t4, .n_posx

    slli.d       t4, t7, 2
    add.d        t4, t4, t7         // pos_x * 5
    add.d        t4, s0, t4         // rp_proj[pos + pos_x]
    vstelm.w     vr3, t4, 0, 0
    vstelm.b     vr11, t4, 4, 4

    addi.d       s5, s5, 1          // x + 1
    bge          s5, t1, .ret_posx
    addi.d       a7, a7, 5          // rb + 1
    vld          vr4, a7, 0         // [rb]
    vseq.b       vr5, vr4, vr3

    vpickve2gr.d t8, vr5, 0
    cto.d        t8, t8
    blt          t8, s7, 7b

    addi.d       t7, t7, 1          // pos_x + 1

    /* Core computing loop, third unrolled iteration */
    andi         t4, s5, 0xff8      // x_sb_align

    blt          t7, a2, .n_posx
    addi.d       t8, t4, -8
    blt          t7, t8, .n_posx

    bge          t7, a3, .n_posx
    addi.d       t4, t4, 16
    bge          t7, t4, .n_posx

    slli.d       t4, t7, 2
    add.d        t4, t4, t7         // pos_x * 5
    add.d        t4, s0, t4         // rp_proj[pos + pos_x]
    vstelm.w     vr3, t4, 0, 0
    vstelm.b     vr11, t4, 4, 4

    addi.d       s5, s5, 1          // x + 1
    bge          s5, t1, .ret_posx
    addi.d       a7, a7, 5          // rb + 1
    vld          vr4, a7, 0         // [rb]
    vseq.b       vr5, vr4, vr3

    vpickve2gr.d t8, vr5, 0
    cto.d        t8, t8
    blt          t8, s7, 7b

    addi.d       t7, t7, 1          // pos_x + 1

    b            .loop_posx

.n_posy:
    addi.d       s5, s5, 1          // x + 1
    bge          s5, t1, .ret_posx
    addi.d       a7, a7, 5          // rb + 1
    vld          vr4, a7, 0         // [rb]
    vseq.b       vr5, vr4, vr3

    vpickve2gr.d t8, vr5, 0
    cto.d        t8, t8
    blt          t8, s7, 7b

    addi.d       s5, s5, 1          // x + 1
    bge          s5, t1, .ret_posx
    addi.d       a7, a7, 5          // rb + 1
    vld          vr4, a7, 0         // [rb]
    vseq.b       vr5, vr4, vr3

    vpickve2gr.d t8, vr5, 0
    cto.d        t8, t8
    blt          t8, s7, 7b

    b            .n_posy

.end_x:
    addi.d       s5, s5, 1          // x + 1
    blt          s5, t1, 7b

.ret_posx:
    add.d        s1, s1, t2         // r + stride
    addi.d       s2, s2, 1          // y + 1
    blt          s2, a5, 6b
8:
    addi.d       a6, a6, 1          // n + 1
    addi.d       t5, t5, 7          // mfmv_ref2ref(offset) + 7
    blt          a6, s8, 5b

.end_load:
    ld.d         s0, sp, 0
    ld.d         s1, sp, 8
    ld.d         s2, sp, 16
    ld.d         s3, sp, 24
    ld.d         s4, sp, 32
    ld.d         s5, sp, 40
    ld.d         s6, sp, 48
    ld.d         s7, sp, 56
    ld.d         s8, sp, 64
    addi.d       sp, sp, 80
endfunc
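/*
 * Shuffle controls for save_tmvs.  Each 16-byte row of mv_tbls is a
 * vshuf.b mask building a refmvs_temporal_block: row 0 (all 255) yields
 * zeros (no usable MV), row 1 selects mv[0]/ref[0] (bytes 0-3 and 8), and
 * rows 2 and 3 select mv[1]/ref[1] (bytes 4-7 and 9), so mv[1] wins when
 * both MVs qualify, matching the C order of checks.  mask_mult weights the
 * two per-MV conditions by 1 and 2 to form the row index, and
 * mask_mv0/mask_mv1 extend the permuted 5-byte {mv, ref} pattern across
 * additional vectors for the wide stores.
 */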
const mv_tbls
    .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
    .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
    .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
    .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult
    .byte 1, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst

const mask_mv0
    .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
endconst

const mask_mv1
    .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
endconst

// void dav1d_save_tmvs_lsx(refmvs_temporal_block *rp, ptrdiff_t stride,
//                          refmvs_block **rr, const uint8_t *ref_sign,
//                          int col_end8, int row_end8,
//                          int col_start8, int row_start8)
function save_tmvs_lsx
    addi.d       sp, sp, -0x28
    st.d         s0, sp, 0x00
    st.d         s1, sp, 0x08
    st.d         s2, sp, 0x10
    st.d         s3, sp, 0x18
    st.d         s4, sp, 0x20
    move         t0, ra

    vxor.v       vr10, vr10, vr10
    vld          vr11, a3, 0        // Load ref_sign[0] .. ref_sign[7]
    la.local     t2, .save_tmvs_tbl
    la.local     s1, mask_mult
    la.local     t7, mv_tbls
    vld          vr9, s1, 0         // Load mask_mult
    vslli.d      vr11, vr11, 8      // 0, ref_sign[0], ..., ref_sign[6]
    la.local     s3, mask_mv0
    vld          vr8, s3, 0         // Load mask_mv0
    la.local     s4, mask_mv1
    vld          vr7, s4, 0         // Load mask_mv1
    li.d         s0, 5
    li.d         t8, 12 * 2
    mul.d        a1, a1, s0         // stride *= 5
    sub.d        a5, a5, a7         // h = row_end8 - row_start8
    slli.d       a7, a7, 1          // row_start8 <<= 1
1:
    li.d         s0, 5
    andi         t3, a7, 30         // (y & 15) * 2
    slli.d       s4, t3, 3
    ldx.d        t3, a2, s4         // b = rr[(y & 15) * 2]
    addi.d       t3, t3, 12         // &b[... + 1]
    mul.d        s4, a4, t8
    add.d        t4, s4, t3         // end_cand_b = &b[col_end8*2 + 1]
    mul.d        s3, a6, t8
    add.d        t3, s3, t3         // cand_b = &b[x*2 + 1]
    mul.d        s4, a6, s0
    add.d        a3, s4, a0         // &rp[x]
2:
    /* First cand_b */
    ld.b         t5, t3, 10         // cand_b->bs
    vld          vr0, t3, 0         // cand_b->mv and ref
    alsl.d       t5, t5, t2, 2      // bt2 index
    ld.h         s3, t3, 8          // cand_b->ref
    ld.h         t6, t5, 0          // bt2
    move         s0, t2
    alsl.d       t3, t6, t3, 1      // Next cand_b += bt2 * 2
    vor.v        vr2, vr0, vr0
    vinsgr2vr.h  vr1, s3, 0
    move         t1, t3
    bge          t3, t4, 3f

    /* Next cand_b */
    ld.b         s0, t3, 10         // cand_b->bs
    vld          vr4, t3, 0         // cand_b->mv and ref
    alsl.d       s0, s0, t2, 2      // bt2 index
    ld.h         s4, t3, 8          // cand_b->ref
    ld.h         t6, s0, 0          // bt2
    alsl.d       t3, t6, t3, 1      // Next cand_b += bt2 * 2
    vpackev.d    vr2, vr4, vr0      // a0.mv[0] a0.mv[1] a1.mv[0] a1.mv[1]
    vinsgr2vr.h  vr1, s4, 1         // a0.ref[0] a0.ref[1] a1.ref[0] a1.ref[1]
3:
    vabsd.h      vr2, vr2, vr10     // abs(mv[].xy)
    vsle.b       vr16, vr10, vr1    // keep only refs >= 0
    vand.v       vr1, vr16, vr1
    vshuf.b      vr1, vr11, vr11, vr1 // ref_sign[ref]
    vsrli.h      vr2, vr2, 12       // abs(mv[].xy) >> 12
    vilvl.b      vr1, vr1, vr1
    vmulwev.h.bu vr1, vr1, vr9      // ref_sign[ref] * {1, 2}

    vseqi.w      vr2, vr2, 0        // abs(mv[].xy) < 4096
    vpickev.h    vr2, vr2, vr2      // abs() condition to 16 bit

    vand.v       vr1, vr2, vr1      // h[0-3] contains conditions for mv[0-1]
    vhaddw.wu.hu vr1, vr1, vr1      // Combine condition for [1] and [0]
    vpickve2gr.wu s1, vr1, 0        // Extract case for first block
    vpickve2gr.wu s2, vr1, 1

    ld.hu        t5, t5, 2          // Fetch jump table entry
    ld.hu        s0, s0, 2
    alsl.d       s3, s1, t7, 4      // Load permutation table based on case
    vld          vr1, s3, 0
    alsl.d       s4, s2, t7, 4
    vld          vr5, s4, 0
    sub.d        t5, t2, t5         // Find jump table target
    sub.d        s0, t2, s0

    vshuf.b      vr0, vr0, vr0, vr1 // Permute cand_b to output refmvs_temporal_block
    vshuf.b      vr4, vr4, vr4, vr5
    vsle.b       vr16, vr10, vr1    // zero lanes where the mask is 255
    vand.v       vr0, vr16, vr0

    vsle.b       vr17, vr10, vr5
    vand.v       vr4, vr17, vr4
    // vr1 follows on vr0, with another 3 full repetitions of the pattern.
    vshuf.b      vr1, vr0, vr0, vr8 // 1, 2, 3, ..., 15, 16
    vshuf.b      vr5, vr4, vr4, vr8 // 1, 2, 3, ..., 15, 16
    // vr2 ends with 3 complete repetitions of the pattern.
    vshuf.b      vr2, vr1, vr0, vr7
    vshuf.b      vr6, vr5, vr4, vr7 // 4, 5, 6, 7, ..., 12, 13, 14, 15, 16, 17, 18, 19

    jirl         ra, t5, 0
    bge          t1, t4, 4f         // if (cand_b >= end)
    vor.v        vr0, vr4, vr4
    vor.v        vr1, vr5, vr5
    vor.v        vr2, vr6, vr6
    jirl         ra, s0, 0
    blt          t3, t4, 2b         // if (cand_b < end)

4:
    addi.d       a5, a5, -1         // h--
    addi.d       a7, a7, 2          // y += 2
    add.d        a0, a0, a1         // rp += stride
    blt          zero, a5, 1b

    ld.d         s0, sp, 0x00
    ld.d         s1, sp, 0x08
    ld.d         s2, sp, 0x10
    ld.d         s3, sp, 0x18
    ld.d         s4, sp, 0x20
    addi.d       sp, sp, 0x28

    move         ra, t0
    jirl         zero, ra, 0x00

10:
    addi.d       s1, a3, 4
    vstelm.w     vr0, a3, 0, 0      // .mv
    vstelm.b     vr0, s1, 0, 4      // .ref
    addi.d       a3, a3, 5
    jirl         zero, ra, 0x00
20:
    addi.d       s1, a3, 8
    vstelm.d     vr0, a3, 0, 0      // .mv
    vstelm.h     vr0, s1, 0, 4      // .ref
    addi.d       a3, a3, 2 * 5
    jirl         zero, ra, 0x00
40:
    vst          vr0, a3, 0
    vstelm.w     vr1, a3, 0x10, 0
    addi.d       a3, a3, 4 * 5
    jirl         zero, ra, 0x00

80:
    vst          vr0, a3, 0
    vst          vr1, a3, 0x10      // This writes 6 full entries plus 2 extra bytes
    vst          vr2, a3, 5 * 8 - 16 // Write the last few, overlapping with the first write
    addi.d       a3, a3, 8 * 5
    jirl         zero, ra, 0x00
160:
    addi.d       s1, a3, 6 * 5
    addi.d       s2, a3, 12 * 5
    vst          vr0, a3, 0
    vst          vr1, a3, 0x10      // This writes 6 full entries plus 2 extra bytes
    vst          vr0, a3, 6 * 5
    vst          vr1, a3, 6 * 5 + 16 // Write another 6 full entries, slightly overlapping with the first set
    vstelm.d     vr0, s2, 0, 0      // Write 8 bytes (one full entry) after the first 12
    vst          vr2, a3, 5 * 16 - 16 // Write the last 3 entries
    addi.d       a3, a3, 16 * 5
    jirl         zero, ra, 0x00
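/*
 * Two .hword entries per block size: the step to advance cand_b
 * (blocks * sizeof(refmvs_block)), and the distance from this table back
 * to the store routine for that width; the caller above forms the jump
 * target as table_base - entry (sub.d t5, t2, t5).
 */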
.save_tmvs_tbl:
    .hword 16 * 12                  // bt2 * 12, 12 is sizeof(refmvs_block)
    .hword .save_tmvs_tbl - 160b
    .hword 16 * 12
    .hword .save_tmvs_tbl - 160b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 8 * 12
    .hword .save_tmvs_tbl - 80b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 4 * 12
    .hword .save_tmvs_tbl - 40b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 2 * 12
    .hword .save_tmvs_tbl - 20b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
    .hword 1 * 12
    .hword .save_tmvs_tbl - 10b
endfunc