tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

refmvs.S (23923B)


      1 /*
      2 * Copyright © 2021, VideoLAN and dav1d authors
      3 * Copyright © 2021, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm-offsets.h"
     29 #include "src/arm/asm.S"
     30 #include "util.S"
     31 
     32 #define INVALID_MV 0x80008000
     33 
     34 // void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
     35 //                          int bx4, int bw4, int bh4)
     36 
     37 function splat_mv_neon, export=1
        // Broadcast the 12-byte refmvs_block at *x1 (rmv) across bw4 4x4-block
        // columns in each of bh4 rows pointed to by rr[]. v0..v2 are built to
        // hold 48 bytes = 4 back-to-back copies of the 12-byte pattern, and a
        // jump table dispatches on bw4 to an unrolled store routine.
     38        ld1             {v3.16b},  [x1]                   // bytes 0-11 = rmv
     39        clz             w3,  w3                           // clz(bw4): 26..31 for bw4 = 32..1
     40        movrel          x5,  splat_tbl
     41        sub             w3,  w3,  #26                     // table index 0..5
     42        ext             v2.16b,  v3.16b,  v3.16b,  #12    // rotate so v2 ends with bytes 0..11
     43        ldrsw           x3,  [x5, w3, uxtw #2]            // signed offset from splat_tbl
     44        add             w2,  w2,  w2,  lsl #1             // bx4 *= 3
     45        ext             v0.16b,  v2.16b,  v3.16b,  #4     // v0..v2 = pattern repeated 4x (48 bytes)
     46        add             x3,  x5,  x3                      // resolve jump target
     47        ext             v1.16b,  v2.16b,  v3.16b,  #8
     48        lsl             w2,  w2,  #2                      // bx4 * 12 = byte offset into each row
     49        ext             v2.16b,  v2.16b,  v3.16b,  #12
     50 1:
     51        ldr             x1,  [x0],  #8                    // x1 = *rr++ (next row base)
     52        subs            w4,  w4,  #1                      // bh4--; flags consumed by b.gt below
     53        add             x1,  x1,  x2                      // &row[bx4]
     54        br              x3                                // dispatch on bw4
     55 
        // Store routines; wider cases fall through into narrower ones.
        // None of them alter NZCV, so b.gt still sees the subs above.
     56 10:
     57        AARCH64_VALID_JUMP_TARGET
     58        st1             {v0.8b}, [x1]                     // bw4 == 1: 8 + 4 = 12 bytes
     59        str             s2,  [x1, #8]
     60        b.gt            1b
     61        ret
     62 20:
     63        AARCH64_VALID_JUMP_TARGET
     64        st1             {v0.16b}, [x1]                    // bw4 == 2: 16 + 8 = 24 bytes
     65        str             d1,  [x1, #16]
     66        b.gt            1b
     67        ret
     68 320:
     69        AARCH64_VALID_JUMP_TARGET
     70        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     71        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     72        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     73        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     74 160:
     75        AARCH64_VALID_JUMP_TARGET
     76        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     77        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     78 80:
     79        AARCH64_VALID_JUMP_TARGET
     80        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
     81 40:
     82        AARCH64_VALID_JUMP_TARGET
     83        st1             {v0.16b, v1.16b, v2.16b}, [x1]    // 48 bytes = 4 blocks
     84        b.gt            1b
     85        ret
     86 endfunc
     87 
     88 jumptable splat_tbl
        // Indexed by clz(bw4) - 26, i.e. bw4 = 32, 16, 8, 4, 2, 1 in order.
        // Entries are signed offsets relative to splat_tbl; splat_mv_neon adds
        // them back to the table address before the computed br.
     89        .word 320b  - splat_tbl
     90        .word 160b  - splat_tbl
     91        .word 80b   - splat_tbl
     92        .word 40b   - splat_tbl
     93        .word 20b   - splat_tbl
     94        .word 10b   - splat_tbl
     95 endjumptable
     96 
     97 const mv_tbls, align=4
        // TBL permutation masks for save_tmvs_neon, indexed by the 2-bit
        // validity case for {mv[0], mv[1]} (out-of-range index 255 yields 0):
        //   row 0 (neither valid):     all zeros
        //   row 1 (only mv[0] valid):  bytes 0-3 (mv[0]) + byte 8 (ref[0]),
        //                              repeated as 5-byte groups
        //   rows 2-3 (mv[1] valid):    bytes 4-7 (mv[1]) + byte 9 (ref[1]),
        //                              repeated as 5-byte groups
     98        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
     99        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
    100        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
    101        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
    102 endconst
    103 
    104 const mask_mult, align=4
        // {1, 2} multipliers: umull against ref_sign[ref] in save_tmvs_neon
        // packs the two per-mv validity flags into a 2-bit mv_tbls index.
    105        .byte           1, 2, 1, 2, 0, 0, 0, 0
    106 endconst
    107 
    108 // void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
    109 //                           refmvs_block **rr, const uint8_t *ref_sign,
    110 //                           int col_end8, int row_end8,
    111 //                           int col_start8, int row_start8)
    112 function save_tmvs_neon, export=1
    113        AARCH64_SIGN_LINK_REGISTER
    114        stp             x29, x30, [sp, #-16]!     // frame: save fp/lr (blr below clobbers lr)
    115        mov             x29, sp
    116 
        // Register map: x8 = save_tmvs_tbl, x13 = mv_tbls, v29 = mask_mult {1,2},
        // v31 = [0, ref_sign[0..6]] byte LUT, w5 = remaining rows, w7 = y*2,
        // x3 = rp write pointer (5-byte refmvs_temporal_block entries).
        // Each pass of loop 2 reads up to TWO candidate blocks and stores them
        // via the 10/20/40/80/160 helpers, reached with blr; those helpers
        // contain only stores and non-flag-setting adds, so NZCV survives them.
    117        movi            v30.8b,  #0
    118        ld1             {v31.8b}, [x3]
    119        movrel          x8,  save_tmvs_tbl
    120        movrel          x16, mask_mult
    121        movrel          x13, mv_tbls
    122        ld1             {v29.8b}, [x16]
    123        ext             v31.8b,  v30.8b,  v31.8b,  #7 // [0, ref_sign]
    124        mov             w15, #5
    125        mov             w14, #12*2                // 2 * sizeof(refmvs_block)
    126        sxtw            x4,  w4
    127        sxtw            x6,  w6
    128        mul             w1,  w1,  w15             // stride *= 5
    129        sub             w5,  w5,  w7              // h = row_end8 - row_start8
    130        lsl             w7,  w7,  #1              // row_start8 <<= 1
    131 1:
    132        mov             w15, #5
    133        and             w9,  w7,  #30             // (y & 15) * 2
    134        ldr             x9,  [x2, w9, uxtw #3]    // b = rr[(y & 15) * 2]
    135        add             x9,  x9,  #12             // &b[... + 1]
    136        madd            x10, x4,  x14,  x9        // end_cand_b = &b[col_end8*2 + 1]
    137        madd            x9,  x6,  x14,  x9        // cand_b = &b[x*2 + 1]
    138 
    139        madd            x3,  x6,  x15,  x0        // &rp[x]
    140 
    141 2:
    142        ldrb            w11, [x9, #10]            // cand_b->bs
    143        ld1             {v0.16b}, [x9]            // cand_b->mv
    144        add             x11, x8,  w11, uxtw #3    // &save_tmvs_tbl[bs] (8 bytes/entry)
    145        ldr             h1,  [x9, #8]             // cand_b->ref
    146        ldr             w12, [x11]                // bw8
    147        mov             x15, x8                   // default: keeps the unconditional ldrsw x15 below in-bounds when no 2nd block
    148        add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
    149        cmp             x9,  x10
    150        mov             v2.8b,   v0.8b
    151        b.ge            3f
    152 
    153        ldrb            w15, [x9, #10]            // cand_b->bs
    154        add             x16, x9,  #8
    155        ld1             {v4.16b}, [x9]            // cand_b->mv
    156        add             x15, x8,  w15, uxtw #3
    157        ld1             {v1.h}[1], [x16]          // cand_b->ref
    158        ldr             w12, [x15]                // bw8
    159        add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
    160        trn1            v2.2d,   v0.2d,   v4.2d
    161 
    162 3:
    163        abs             v2.8h,   v2.8h            // abs(mv[].xy)
    164        tbl             v1.8b, {v31.16b}, v1.8b   // ref_sign[ref]
    165        ushr            v2.8h,   v2.8h,   #12     // abs(mv[].xy) >> 12
    166        umull           v1.8h,   v1.8b,   v29.8b  // ref_sign[ref] * {1, 2}
    167        cmeq            v2.4s,   v2.4s,   #0      // abs(mv[].xy) <= 4096
    168        xtn             v2.4h,   v2.4s            // abs() condition to 16 bit
    169        and             v1.8b,   v1.8b,   v2.8b   // h[0-3] contains conditions for mv[0-1]
    170        addp            v1.4h,   v1.4h,   v1.4h   // Combine condition for [1] and [0]
    171        umov            w16, v1.h[0]              // Extract case for first block
    172        umov            w17, v1.h[1]
    173        ldrsw           x11, [x11, #4]            // Fetch jump table entry
    174        ldrsw           x15, [x15, #4]
    175        ldr             q1, [x13, w16, uxtw #4]   // Load permutation table base on case
    176        ldr             q5, [x13, w17, uxtw #4]
    177        add             x11, x8,  x11             // Find jump table target
    178        add             x15, x8,  x15
    179        tbl             v0.16b, {v0.16b}, v1.16b  // Permute cand_b to output refmvs_temporal_block
    180        tbl             v4.16b, {v4.16b}, v5.16b
    181 
    182        // v1 follows on v0, with another 3 full repetitions of the pattern.
    183        ext             v1.16b,  v0.16b,  v0.16b,  #1
    184        ext             v5.16b,  v4.16b,  v4.16b,  #1
    185        // v2 ends with 3 complete repetitions of the pattern.
    186        ext             v2.16b,  v0.16b,  v1.16b,  #4
    187        ext             v6.16b,  v4.16b,  v5.16b,  #4
    188 
        // Store first block. The helper preserves NZCV, so the b.ge below
        // still consumes the "cmp x9, x10" result from label 2/3 above.
    189        blr             x11
    190        b.ge            4f  // if (cand_b >= end)
    191        mov             v0.16b,  v4.16b           // promote 2nd block's data to v0..v2
    192        mov             v1.16b,  v5.16b
    193        mov             v2.16b,  v6.16b
    194        cmp             x9,  x10
    195        blr             x15                       // store second block
    196        b.lt            2b  // if (cand_b < end)
    197 
    198 4:
    199        subs            w5,  w5,  #1              // h--
    200        add             w7,  w7,  #2              // y += 2
    201        add             x0,  x0,  x1              // rp += stride
    202        b.gt            1b
    203 
    204        ldp             x29, x30, [sp], #16
    205        AARCH64_VALIDATE_LINK_REGISTER
    206        ret
    207 
        // Store helpers: write N 5-byte entries at x3, advance x3 by N*5.
        // Called via blr; must not modify NZCV (stores/adds only).
    208 10:
    209        AARCH64_VALID_CALL_TARGET
    210        add             x16, x3,  #4              // 1 entry: 4 + 1 bytes
    211        st1             {v0.s}[0], [x3]
    212        st1             {v0.b}[4], [x16]
    213        add             x3,  x3,  #5
    214        ret
    215 20:
    216        AARCH64_VALID_CALL_TARGET
    217        add             x16, x3,  #8              // 2 entries: 8 + 2 bytes
    218        st1             {v0.d}[0], [x3]
    219        st1             {v0.h}[4], [x16]
    220        add             x3,  x3,  #2*5
    221        ret
    222 40:
    223        AARCH64_VALID_CALL_TARGET
    224        st1             {v0.16b}, [x3]            // 4 entries: 16 + 4 bytes
    225        str             s1, [x3, #16]
    226        add             x3,  x3,  #4*5
    227        ret
    228 80:
    229        AARCH64_VALID_CALL_TARGET
    230        // This writes 6 full entries plus 2 extra bytes
    231        st1             {v0.16b, v1.16b}, [x3]
    232        // Write the last few, overlapping with the first write.
    233        stur            q2, [x3, #(8*5-16)]
    234        add             x3,  x3,  #8*5
    235        ret
    236 160:
    237        AARCH64_VALID_CALL_TARGET
    238        add             x16, x3,  #6*5
    239        add             x17, x3,  #12*5
    240        // This writes 6 full entries plus 2 extra bytes
    241        st1             {v0.16b, v1.16b}, [x3]
    242        // Write another 6 full entries, slightly overlapping with the first set
    243        st1             {v0.16b, v1.16b}, [x16]
    244        // Write 8 bytes (one full entry) after the first 12
    245        st1             {v0.8b}, [x17]
    246        // Write the last 3 entries
    247        str             q2, [x3, #(16*5-16)]
    248        add             x3,  x3,  #16*5
    249        ret
    250 endfunc
    251 
    252 jumptable save_tmvs_tbl
        // Indexed by block size (cand_b->bs) * 8. Each 8-byte entry is a pair:
        //   word 0: bw8 * 12 (bw8 * sizeof(refmvs_block)); the caller shifts
        //           this left by 1 to advance cand_b by bw8*2 blocks
        //   word 1: offset of the matching store helper, relative to
        //           save_tmvs_tbl (resolved with ldrsw + add in save_tmvs_neon)
    253        .word 16 * 12
    254        .word 160b - save_tmvs_tbl
    255        .word 16 * 12
    256        .word 160b - save_tmvs_tbl
    257        .word 8 * 12
    258        .word 80b  - save_tmvs_tbl
    259        .word 8 * 12
    260        .word 80b  - save_tmvs_tbl
    261        .word 8 * 12
    262        .word 80b  - save_tmvs_tbl
    263        .word 8 * 12
    264        .word 80b  - save_tmvs_tbl
    265        .word 4 * 12
    266        .word 40b  - save_tmvs_tbl
    267        .word 4 * 12
    268        .word 40b  - save_tmvs_tbl
    269        .word 4 * 12
    270        .word 40b  - save_tmvs_tbl
    271        .word 4 * 12
    272        .word 40b  - save_tmvs_tbl
    273        .word 2 * 12
    274        .word 20b  - save_tmvs_tbl
    275        .word 2 * 12
    276        .word 20b  - save_tmvs_tbl
    277        .word 2 * 12
    278        .word 20b  - save_tmvs_tbl
    279        .word 2 * 12
    280        .word 20b  - save_tmvs_tbl
    281        .word 2 * 12
    282        .word 20b  - save_tmvs_tbl
    283        .word 1 * 12
    284        .word 10b  - save_tmvs_tbl
    285        .word 1 * 12
    286        .word 10b  - save_tmvs_tbl
    287        .word 1 * 12
    288        .word 10b  - save_tmvs_tbl
    289        .word 1 * 12
    290        .word 10b  - save_tmvs_tbl
    291        .word 1 * 12
    292        .word 10b  - save_tmvs_tbl
    293        .word 1 * 12
    294        .word 10b  - save_tmvs_tbl
    295        .word 1 * 12
    296        .word 10b  - save_tmvs_tbl
    297 endjumptable
    298 
    299 // void dav1d_load_tmvs_neon(const refmvs_frame *const rf, int tile_row_idx,
    300 //                           const int col_start8, const int col_end8,
    301 //                           const int row_start8, int row_end8)
    302 function load_tmvs_neon, export=1
    303        rf              .req x0
    304        tile_row_idx    .req w1
    305        col_start8      .req w2
    306        col_end8        .req w3
    307        row_start8      .req w4
    308        row_end8        .req w5
    309        col_start8i     .req w6
    310        col_end8i       .req w7
    311        rp_proj         .req x8
    312        stride5         .req x9
    313        wstride5        .req w9
        // Phase 1 fills this tile row's 16-row slice of rf->rp_proj with
        // INVALID_MV entries; phase 2 (nloop) projects temporal MVs from each
        // of the rf->n_mfmvs reference frames into that slice. The 96-byte
        // frame saves all callee-saved GPRs x19-x28 plus x29/x30, since x29
        // is reused below as a scratch base pointer into the rf struct.
    314        stp             x28, x27, [sp, #-96]!
    315        stp             x26, x25, [sp, #16]
    316        stp             x24, x23, [sp, #32]
    317        stp             x22, x21, [sp, #48]
    318        stp             x20, x19, [sp, #64]
    319        stp             x29, x30, [sp, #80]
    320 
    321        ldr             w15, [rf, #RMVSF_N_TILE_THREADS]
    322        ldp             w16, w17, [rf, #RMVSF_IW8]          // include rf->ih8 too
    323        sub             col_start8i, col_start8, #8         // col_start8 - 8
    324        add             col_end8i, col_end8, #8             // col_end8 + 8
    325        ldr             wstride5, [rf, #RMVSF_RP_STRIDE]
    326        ldr             rp_proj, [rf, #RMVSF_RP_PROJ]
    327 
    328        cmp             w15, #1
    329        csel            tile_row_idx, wzr, tile_row_idx, eq // if (rf->n_tile_threads == 1) tile_row_idx = 0
    330 
    331        bic             col_start8i, col_start8i, col_start8i, asr #31  // imax(col_start8 - 8, 0)
    332        cmp             col_end8i, w16
    333        csel            col_end8i, col_end8i, w16, lt       // imin(col_end8 + 8, rf->iw8)
    334 
    335        lsl             tile_row_idx, tile_row_idx, #4      // 16 * tile_row_idx
    336 
    337        cmp             row_end8, w17
    338        csel            row_end8, row_end8, w17, lt         // imin(row_end8, rf->ih8)
    339 
    340        add             wstride5, wstride5, wstride5, lsl #2    // stride * sizeof(refmvs_temporal_block)
    341        and             w15, row_start8, #15                // row_start8 & 15
    342        add             w10, col_start8, col_start8, lsl #2 // col_start8 * sizeof(refmvs_temporal_block)
    343        smaddl          rp_proj, tile_row_idx, wstride5, rp_proj    // &rf->rp_proj[16 * stride * tile_row_idx]
    344        smaddl          x10, w15, wstride5, x10             // ((row_start8 & 15) * stride + col_start8) * sizeof(refmvs_temporal_block)
    345        mov             w15, #INVALID_MV
    346        sub             w11, col_end8, col_start8           // xfill loop count
    347        add             x10, x10, rp_proj                   // &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride + col_start8]
    348        add             x15, x15, x15, lsl #40              // first 64b of 4 [INVALID_MV, 0]... patterns
    349        mov             w17, #(INVALID_MV >> 8)             // last 32b of 4 patterns
    350        sub             w12, row_end8, row_start8           // yfill loop count
    351        ror             x16, x15, #48                       // second 64b of 4 patterns
    352        ldr             w19, [rf, #RMVSF_N_MFMVS]
    353 
    354 5:      // yfill loop
    355        and             w13, w11, #-4           // xfill 4x count by patterns
    356        mov             x14, x10                // fill_ptr = row_ptr
    357        add             x10, x10, stride5       // row_ptr += stride
    358        sub             w12, w12, #1            // y--
    359 
    360        cbz             w13, 3f
    361 
    362 4:      // xfill loop 4x
    363        sub             w13, w13, #4            // xfill 4x count -= 4
    364        stp             x15, x16, [x14]         // 20 bytes = 4 invalid entries per pass
    365        str             w17, [x14, #16]
    366        add             x14, x14, #20           // fill_ptr += 4 * sizeof(refmvs_temporal_block)
    367        cbnz            w13, 4b
    368 
    369 3:      // up to 3 residuals
    370        tbz             w11, #1, 1f
    371        str             x15, [x14]
    372        strh            w16, [x14, #8]
    373        add             x14, x14, #10           // fill_ptr += 2 * sizeof(refmvs_temporal_block)
    374 
    375 1:      // up to 1 residual
    376        tbz             w11, #0, 2f
    377        str             w15, [x14]
    378 2:
    379        cbnz            w12, 5b                 // yfill loop
    380 
    381        cbz             w19, 11f                // if (!rf->n_mfmvs) skip nloop
    382 
    383        add             x29, rf, #RMVSF_MFMV_REF2CUR    // x29 reused as struct base (restored at exit)
    384        mov             w10, #0                 // n = 0
    385        movi            v3.2s, #255             // 0x3FFF >> 6, for MV clamp
    386        movrel          x1, div_mult_tbl
    387 
    388 10:     // nloop
    389        ldrsb           w16, [x29, x10]         // ref2cur = rf->mfmv_ref2cur[n]
    390        cmp             w16, #-32
    391        b.eq            9f                      // if (ref2cur == INVALID_REF2CUR) continue
    392 
    393        add             x17, x10, #(RMVSF_MFMV_REF - RMVSF_MFMV_REF2CUR)    // n - (&rf->mfmv_ref - &rf->mfmv_ref2cur)
    394        mov             x20, #4
    395        ldrb            w17, [x29, x17]         // ref = rf->mfmv_ref[n]
    396        ldr             x13, [x29, #(RMVSF_RP_REF - RMVSF_MFMV_REF2CUR)]
    397        sub             x21, x10, x10, lsl #3   // -(n * 7)
    398        smaddl          x20, row_start8, wstride5, x20  // row_start8 * stride * sizeof(refmvs_temporal_block) + 4
    399        mov             w12, row_start8         // y = row_start8
    400        add             x28, x29, #(RMVSF_MFMV_REF2REF - RMVSF_MFMV_REF2CUR - 1)    // &rf->mfmv_ref2ref - 1
    401        ldr             x13, [x13, x17, lsl #3] // rf->rp_ref[ref]
    402        sub             x28, x28, x21           // rf->mfmv_ref2ref[n] - 1
    403        sub             w17, w17, #4            // ref_sign = ref - 4
    404        add             x13, x13, x20           // r = &rf->rp_ref[ref][row_start8 * stride].ref
    405        dup             v0.2s, w17              // ref_sign
    406 
    407 5:      // yloop
    408        and             w14, w12, #-8           // y_sb_align = y & ~7
    409        mov             w11, col_start8i        // x = col_start8i
    410        add             w15, w14, #8            // y_sb_align + 8
    411        cmp             w14, row_start8
    412        csel            w14, w14, row_start8, gt    // imax(y_sb_align, row_start8)
    413        cmp             w15, row_end8
    414        csel            w15, w15, row_end8, lt  // imin(y_sb_align + 8, row_end8)
    415 
    416 4:      // xloop
    417        add             x23, x13, x11, lsl #2   // partial &r[x] address
    418        ldrb            w22, [x23, x11]         // b_ref = rb->ref (x23 + x11 = base + x11*5, .ref byte)
    419        cbz             w22, 6f                 // if (!b_ref) continue
    420 
    421        ldrb            w24, [x28, x22]         // ref2ref = rf->mfmv_ref2ref[n][b_ref - 1]
    422        cbz             w24, 6f                 // if (!ref2ref) continue
    423 
    424        ldrh            w20, [x1, x24, lsl #1]  // div_mult[ref2ref]
    425        add             x23, x23, x11           // &r[x]
    426        mul             w20, w20, w16           // frac = ref2cur * div_mult[ref2ref]
    427 
    428        ldur            s1, [x23, #-4]          // mv{y, x} = rb->mv
    429        fmov            s2, w20                 // frac
    430        sxtl            v1.4s, v1.4h
    431        mul             v1.2s, v1.2s, v2.s[0]   // offset{y, x} = frac * mv{y, x}
    432 
    433        ssra            v1.2s, v1.2s, #31       // offset{y, x} + (offset{y, x} >> 31)
    434        ldur            w25, [x23, #-4]         // b_mv = rb->mv
    435        srshr           v1.2s, v1.2s, #14       // (offset{y, x} + (offset{y, x} >> 31) + 8192) >> 14
    436 
    437        abs             v2.2s, v1.2s            // abs(offset{y, x})
    438        eor             v1.8b, v1.8b, v0.8b     // offset{y, x} ^ ref_sign
    439 
    440        sshr            v2.2s, v2.2s, #6        // abs(offset{y, x}) >> 6
    441        cmlt            v1.2s, v1.2s, #0        // sign(offset{y, x} ^ ref_sign): -1 or 0
    442        umin            v2.2s, v2.2s, v3.2s     // iclip(abs(offset{y, x}) >> 6, 0, 0x3FFF >> 6)
    443 
    444        neg             v4.2s, v2.2s
    445        bsl             v1.8b, v4.8b, v2.8b     // apply_sign(iclip(abs(offset{y, x}) >> 6, 0, 0x3FFF >> 6))
    446        fmov            x20, d1                 // offset{y, x}
    447 
    448        add             w21, w12, w20           // pos_y = y + offset.y
    449        cmp             w21, w14                // pos_y >= y_proj_start
    450        b.lt            1f
    451        cmp             w21, w15                // pos_y < y_proj_end
    452        b.ge            1f
    453        add             x26, x11, x20, asr #32  // pos_x = x + offset.x
    454        and             w27, w21, #15           // pos_y & 15
    455        add             x21, x26, x26, lsl #2   // pos_x * sizeof(refmvs_temporal_block)
    456        umaddl          x27, w27, wstride5, rp_proj // &rp_proj[(pos_y & 15) * stride]
    457        add             x27, x27, x21           // &rp_proj[(pos_y & 15) * stride + pos_x]
    458 
    459 3:      // copy loop
    460        and             w20, w11, #-8           // x_sb_align = x & ~7
    461        sub             w21, w20, #8            // x_sb_align - 8
    462        cmp             w21, col_start8
    463        csel            w21, w21, col_start8, gt    // imax(x_sb_align - 8, col_start8)
    464        cmp             w26, w21                // pos_x >= imax(x_sb_align - 8, col_start8)
    465        b.lt            2f
    466        add             w20, w20, #16           // x_sb_align + 16
    467        cmp             w20, col_end8
    468        csel            w20, w20, col_end8, lt  // imin(x_sb_align + 16, col_end8)
    469        cmp             w26, w20                // pos_x < imin(x_sb_align + 16, col_end8)
    470        b.ge            2f
    471        str             w25, [x27]              // rp_proj[pos + pos_x].mv = rb->mv (b_mv)
    472        strb            w24, [x27, #4]          // rp_proj[pos + pos_x].ref = ref2ref
    473 
    474 2:      // search part of copy loop
    475        add             w11, w11, #1            // x++
    476        cmp             w11, col_end8i          // if (++x >= col_end8i) break xloop
    477        b.ge            8f
    478 
    479        ldrb            w20, [x23, #5]!         // rb++; rb->ref
    480        cmp             w20, w22                // if (rb->ref != b_ref) break
    481        b.ne            7f
    482 
    483        ldur            w21, [x23, #-4]         // rb->mv.n
    484        cmp             w21, w25                // if (rb->mv.n != b_mv.n) break
    485        b.ne            7f
    486 
    487        add             w26, w26, #1            // pos_x++
    488        add             x27, x27, #5            // advance &rp_proj[(pos_y & 15) * stride + pos_x]
    489        b               3b                      // copy loop
    490 
    491 1:      // search loop
    492        add             w11, w11, #1            // x++
    493        cmp             w11, col_end8i          // if (++x >= col_end8i) break xloop
    494        b.ge            8f
    495 
    496        ldrb            w20, [x23, #5]!         // rb++; rb->ref
    497        cmp             w20, w22                // if (rb->ref != b_ref) break
    498        b.ne            7f
    499 
    500        ldur            w21, [x23, #-4]         // rb->mv.n
    501        cmp             w21, w25                // if (rb->mv.n == b_mv.n) continue
    502        b.eq            1b                      // search loop
    503 7:
    504        cmp             w11, col_end8i          // x < col_end8i
    505        b.lt            4b                      // xloop
    506 
    507 6:      // continue case of xloop
    508        add             w11, w11, #1            // x++
    509        cmp             w11, col_end8i          // x < col_end8i
    510        b.lt            4b                      // xloop
    511 8:
    512        add             w12, w12, #1            // y++
    513        add             x13, x13, stride5       // r += stride
    514        cmp             w12, row_end8           // y < row_end8
    515        b.lt            5b                      // yloop
    516 9:
    517        add             w10, w10, #1            // n++
    518        cmp             w10, w19                // n < rf->n_mfmvs
    519        b.lt            10b                     // nloop
    520 11:
    521        ldp             x29, x30, [sp, #80]
    522        ldp             x20, x19, [sp, #64]
    523        ldp             x22, x21, [sp, #48]
    524        ldp             x24, x23, [sp, #32]
    525        ldp             x26, x25, [sp, #16]
    526        ldp             x28, x27, [sp], #96
    527        ret
    528        .unreq          rf
    529        .unreq          tile_row_idx
    530        .unreq          col_start8
    531        .unreq          col_end8
    532        .unreq          row_start8
    533        .unreq          row_end8
    534        .unreq          col_start8i
    535        .unreq          col_end8i
    536        .unreq          rp_proj
    537        .unreq          stride5
    538        .unreq          wstride5
    539 endfunc
    540 
    541 const div_mult_tbl
        // div_mult_tbl[d] = floor(16384 / d) for d = 1..31 (entry 0 unused):
        // load_tmvs_neon multiplies by this instead of dividing by the
        // frame-distance ref2ref (result is later rounded and shifted by 14).
    542        .hword             0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
    543        .hword          2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
    544        .hword          1024,   963,  910,  862,  819,  780,  744,  712
    545        .hword           682,   655,  630,  606,  585,  564,  546,  528
    546 endconst