tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef_tmpl.S (20656B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2020, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 .macro dir_table w, stride
     32 const directions\w
     33        .byte           -1 * \stride + 1, -2 * \stride + 2
     34        .byte            0 * \stride + 1, -1 * \stride + 2
     35        .byte            0 * \stride + 1,  0 * \stride + 2
     36        .byte            0 * \stride + 1,  1 * \stride + 2
     37        .byte            1 * \stride + 1,  2 * \stride + 2
     38        .byte            1 * \stride + 0,  2 * \stride + 1
     39        .byte            1 * \stride + 0,  2 * \stride + 0
     40        .byte            1 * \stride + 0,  2 * \stride - 1
     41 // Repeated, to avoid & 7
     42        .byte           -1 * \stride + 1, -2 * \stride + 2
     43        .byte            0 * \stride + 1, -1 * \stride + 2
     44        .byte            0 * \stride + 1,  0 * \stride + 2
     45        .byte            0 * \stride + 1,  1 * \stride + 2
     46        .byte            1 * \stride + 1,  2 * \stride + 2
     47        .byte            1 * \stride + 0,  2 * \stride + 1
     48 endconst
     49 .endm
     50 
     51 .macro tables
     52 dir_table 8, 16
     53 dir_table 4, 8
     54 
     55 const pri_taps
     56        .byte           4, 2, 3, 3
     57 endconst
     58 .endm
     59 
     60 .macro load_px d1, d2, w
     61 .if \w == 8
     62        add             x6,  x2,  w9, sxtb #1       // x + off
     63        sub             x9,  x2,  w9, sxtb #1       // x - off
     64        ld1             {\d1\().8h}, [x6]           // p0
     65        ld1             {\d2\().8h}, [x9]           // p1
     66 .else
     67        add             x6,  x2,  w9, sxtb #1       // x + off
     68        sub             x9,  x2,  w9, sxtb #1       // x - off
     69        ld1             {\d1\().4h}, [x6]           // p0
     70        add             x6,  x6,  #2*8              // += stride
     71        ld1             {\d2\().4h}, [x9]           // p1
     72        add             x9,  x9,  #2*8              // += stride
     73        ld1             {\d1\().d}[1], [x6]         // p0
     74        ld1             {\d2\().d}[1], [x9]         // p1
     75 .endif
     76 .endm
     77 .macro handle_pixel s1, s2, thresh_vec, shift, tap, min
     78 .if \min
     79        umin            v2.8h,   v2.8h,  \s1\().8h
     80        smax            v3.8h,   v3.8h,  \s1\().8h
     81        umin            v2.8h,   v2.8h,  \s2\().8h
     82        smax            v3.8h,   v3.8h,  \s2\().8h
     83 .endif
     84        uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
     85        uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
     86        ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
     87        ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
     88        uqsub           v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
     89        uqsub           v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
     90        sub             v18.8h, \s1\().8h,  v0.8h   // diff = p0 - px
     91        sub             v22.8h, \s2\().8h,  v0.8h   // diff = p1 - px
     92        neg             v16.8h, v17.8h              // -clip
     93        neg             v20.8h, v21.8h              // -clip
     94        smin            v18.8h, v18.8h, v17.8h      // imin(diff, clip)
     95        smin            v22.8h, v22.8h, v21.8h      // imin(diff, clip)
     96        dup             v19.8h, \tap                // taps[k]
     97        smax            v18.8h, v18.8h, v16.8h      // constrain() = imax(imin(diff, clip), -clip)
     98        smax            v22.8h, v22.8h, v20.8h      // constrain() = imax(imin(diff, clip), -clip)
     99        mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain()
    100        mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain()
    101 .endm
    102 
    103 // void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
    104 //                                   const uint16_t *tmp, int pri_strength,
    105 //                                   int sec_strength, int dir, int damping,
    106 //                                   int h, size_t edges);
    107 .macro filter_func w, bpc, pri, sec, min, suffix
    108 function cdef_filter\w\suffix\()_\bpc\()bpc_neon
    109 .if \bpc == 8
    110        ldr             w8,  [sp]                   // edges
    111        cmp             w8,  #0xf
    112        b.eq            cdef_filter\w\suffix\()_edged_8bpc_neon
    113 .endif
    114 .if \pri
    115 .if \bpc == 16
    116        ldr             w9,  [sp, #8]               // bitdepth_max
    117        clz             w9,  w9
    118        sub             w9,  w9,  #24               // -bitdepth_min_8
    119        neg             w9,  w9                     // bitdepth_min_8
    120 .endif
    121        movrel          x8,  pri_taps
    122 .if \bpc == 16
    123        lsr             w9,  w3,  w9                // pri_strength >> bitdepth_min_8
    124        and             w9,  w9,  #1                // (pri_strength >> bitdepth_min_8) & 1
    125 .else
    126        and             w9,  w3,  #1
    127 .endif
    128        add             x8,  x8,  w9, uxtw #1
    129 .endif
    130        movrel          x9,  directions\w
    131        add             x5,  x9,  w5, uxtw #1
    132        movi            v30.4h,   #15
    133        dup             v28.4h,   w6                // damping
    134 
    135 .if \pri
    136        dup             v25.8h, w3                  // threshold
    137 .endif
    138 .if \sec
    139        dup             v27.8h, w4                  // threshold
    140 .endif
    141        trn1            v24.4h, v25.4h, v27.4h
    142        clz             v24.4h, v24.4h              // clz(threshold)
    143        sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
    144        uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
    145        neg             v24.4h, v24.4h              // -shift
    146 .if \sec
    147        dup             v26.8h, v24.h[1]
    148 .endif
    149 .if \pri
    150        dup             v24.8h, v24.h[0]
    151 .endif
    152 
    153 1:
    154 .if \w == 8
    155        ld1             {v0.8h}, [x2]               // px
    156 .else
    157        add             x12, x2,  #2*8
    158        ld1             {v0.4h},   [x2]             // px
    159        ld1             {v0.d}[1], [x12]            // px
    160 .endif
    161 
    162        movi            v1.8h,  #0                  // sum
    163 .if \min
    164        mov             v2.16b, v0.16b              // min
    165        mov             v3.16b, v0.16b              // max
    166 .endif
    167 
    168        // Instead of loading sec_taps 2, 1 from memory, just set it
    169        // to 2 initially and decrease for the second round.
    170        // This is also used as loop counter.
    171        mov             w11, #2                     // sec_taps[0]
    172 
    173 2:
    174 .if \pri
    175        ldrb            w9,  [x5]                   // off1
    176 
    177        load_px         v4,  v5, \w
    178 .endif
    179 
    180 .if \sec
    181        add             x5,  x5,  #4                // +2*2
    182        ldrb            w9,  [x5]                   // off2
    183        load_px         v6,  v7,  \w
    184 .endif
    185 
    186 .if \pri
    187        ldrb            w10, [x8]                   // *pri_taps
    188 
    189        handle_pixel    v4,  v5,  v25.8h, v24.8h, w10, \min
    190 .endif
    191 
    192 .if \sec
    193        add             x5,  x5,  #8                // +2*4
    194        ldrb            w9,  [x5]                   // off3
    195        load_px         v4,  v5,  \w
    196 
    197        handle_pixel    v6,  v7,  v27.8h, v26.8h, w11, \min
    198 
    199        handle_pixel    v4,  v5,  v27.8h, v26.8h, w11, \min
    200 
    201        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
    202 .else
    203        add             x5,  x5,  #1                // x5 += 1
    204 .endif
    205        subs            w11, w11, #1                // sec_tap-- (value)
    206 .if \pri
    207        add             x8,  x8,  #1                // pri_taps++ (pointer)
    208 .endif
    209        b.ne            2b
    210 
    211        cmlt            v4.8h,  v1.8h,  #0          // -(sum < 0)
    212        add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
    213        srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
    214        add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
    215 .if \min
    216        smin            v0.8h,  v0.8h,  v3.8h
    217        smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
    218 .endif
    219 .if \bpc == 8
    220        xtn             v0.8b,  v0.8h
    221 .endif
    222 .if \w == 8
    223        add             x2,  x2,  #2*16             // tmp += tmp_stride
    224        subs            w7,  w7,  #1                // h--
    225 .if \bpc == 8
    226        st1             {v0.8b}, [x0], x1
    227 .else
    228        st1             {v0.8h}, [x0], x1
    229 .endif
    230 .else
    231 .if \bpc == 8
    232        st1             {v0.s}[0], [x0], x1
    233 .else
    234        st1             {v0.d}[0], [x0], x1
    235 .endif
    236        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
    237        subs            w7,  w7,  #2                // h -= 2
    238 .if \bpc == 8
    239        st1             {v0.s}[1], [x0], x1
    240 .else
    241        st1             {v0.d}[1], [x0], x1
    242 .endif
    243 .endif
    244 
    245        // Reset pri_taps and directions back to the original point
    246        sub             x5,  x5,  #2
    247 .if \pri
    248        sub             x8,  x8,  #2
    249 .endif
    250 
    251        b.gt            1b
    252        ret
    253 endfunc
    254 .endm
    255 
    256 .macro filter w, bpc
    257 filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
    258 filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
    259 filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
    260 
    261 function cdef_filter\w\()_\bpc\()bpc_neon, export=1
    262        cbnz            w3,  1f // pri_strength
    263        b               cdef_filter\w\()_sec_\bpc\()bpc_neon     // only sec
    264 1:
    265        cbnz            w4,  1f // sec_strength
    266        b               cdef_filter\w\()_pri_\bpc\()bpc_neon     // only pri
    267 1:
    268        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
    269 endfunc
    270 .endm
    271 
    272 const div_table
    273        .short         840, 420, 280, 210, 168, 140, 120, 105
    274 endconst
    275 
    276 const alt_fact
    277        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
    278 endconst
    279 
    280 .macro cost_alt d1, d2, s1, s2, s3, s4
    281        smull           v22.4s,  \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
    282        smull2          v23.4s,  \s1\().8h, \s1\().8h
    283        smull           v24.4s,  \s2\().4h, \s2\().4h
    284        smull           v25.4s,  \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
    285        smull2          v26.4s,  \s3\().8h, \s3\().8h
    286        smull           v27.4s,  \s4\().4h, \s4\().4h
    287        mul             v22.4s,  v22.4s,  v29.4s      // sum_alt[n]^2*fact
    288        mla             v22.4s,  v23.4s,  v30.4s
    289        mla             v22.4s,  v24.4s,  v31.4s
    290        mul             v25.4s,  v25.4s,  v29.4s      // sum_alt[n]^2*fact
    291        mla             v25.4s,  v26.4s,  v30.4s
    292        mla             v25.4s,  v27.4s,  v31.4s
    293        addv            \d1, v22.4s                   // *cost_ptr
    294        addv            \d2, v25.4s                   // *cost_ptr
    295 .endm
    296 
    297 .macro find_best s1, s2, s3
    298 .ifnb \s2
    299        mov             w5,  \s2\().s[0]
    300 .endif
    301        cmp             w4,  w1                       // cost[n] > best_cost
    302        csel            w0,  w3,  w0,  gt             // best_dir = n
    303        csel            w1,  w4,  w1,  gt             // best_cost = cost[n]
    304 .ifnb \s2
    305        add             w3,  w3,  #1                  // n++
    306        cmp             w5,  w1                       // cost[n] > best_cost
    307        mov             w4,  \s3\().s[0]
    308        csel            w0,  w3,  w0,  gt             // best_dir = n
    309        csel            w1,  w5,  w1,  gt             // best_cost = cost[n]
    310        add             w3,  w3,  #1                  // n++
    311 .endif
    312 .endm
    313 
    314 // Steps for loading and preparing each row
    315 .macro dir_load_step1 s1, bpc
    316 .if \bpc == 8
    317        ld1             {\s1\().8b}, [x0], x1
    318 .else
    319        ld1             {\s1\().8h}, [x0], x1
    320 .endif
    321 .endm
    322 
    323 .macro dir_load_step2 s1, bpc
    324 .if \bpc == 8
    325        usubl           \s1\().8h,  \s1\().8b, v31.8b
    326 .else
    327        ushl            \s1\().8h,  \s1\().8h, v8.8h
    328 .endif
    329 .endm
    330 
    331 .macro dir_load_step3 s1, bpc
    332 // Nothing for \bpc == 8
    333 .if \bpc != 8
    334        sub             \s1\().8h,  \s1\().8h, v31.8h
    335 .endif
    336 .endm
    337 
    338 // int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
    339 //                                   unsigned *const var)
    340 .macro find_dir bpc
    341 function cdef_find_dir_\bpc\()bpc_neon, export=1
    342 .if \bpc == 16
    343        str             d8,  [sp, #-0x10]!
    344        clz             w3,  w3                       // clz(bitdepth_max)
    345        sub             w3,  w3,  #24                 // -bitdepth_min_8
    346        dup             v8.8h,   w3
    347 .endif
    348        sub             sp,  sp,  #32 // cost
    349        mov             w3,  #8
    350 .if \bpc == 8
    351        movi            v31.16b, #128
    352 .else
    353        movi            v31.8h,  #128
    354 .endif
    355        movi            v30.16b, #0
    356        movi            v1.8h,   #0 // v0-v1 sum_diag[0]
    357        movi            v3.8h,   #0 // v2-v3 sum_diag[1]
    358        movi            v5.8h,   #0 // v4-v5 sum_hv[0-1]
    359        movi            v7.8h,   #0 // v6-v7 sum_alt[0]
    360        dir_load_step1  v26, \bpc       // Setup first row early
    361        movi            v17.8h,  #0 // v16-v17 sum_alt[1]
    362        movi            v18.8h,  #0 // v18-v19 sum_alt[2]
    363        dir_load_step2  v26, \bpc
    364        movi            v19.8h,  #0
    365        dir_load_step3  v26, \bpc
    366        movi            v21.8h,  #0 // v20-v21 sum_alt[3]
    367 
    368 .irpc i, 01234567
    369        addv            h25,     v26.8h               // [y]
    370        rev64           v27.8h,  v26.8h
    371        addp            v28.8h,  v26.8h,  v30.8h      // [(x >> 1)]
    372        add             v5.8h,   v5.8h,   v26.8h      // sum_hv[1]
    373        ext             v27.16b, v27.16b, v27.16b, #8 // [-x]
    374        rev64           v29.4h,  v28.4h               // [-(x >> 1)]
    375        ins             v4.h[\i], v25.h[0]            // sum_hv[0]
    376 .if \i < 6
    377        ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
    378        ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
    379        add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
    380        add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
    381 .else
    382        add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
    383 .endif
    384 .if \i == 0
    385        mov             v20.16b, v26.16b              // sum_alt[3]
    386 .elseif \i == 1
    387        add             v20.8h,  v20.8h,  v26.8h      // sum_alt[3]
    388 .else
    389        ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
    390        ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
    391        add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
    392        add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
    393 .endif
    394 .if \i == 0
    395        mov             v0.16b,  v26.16b              // sum_diag[0]
    396        dir_load_step1  v26, \bpc
    397        mov             v2.16b,  v27.16b              // sum_diag[1]
    398        dir_load_step2  v26, \bpc
    399        mov             v6.16b,  v28.16b              // sum_alt[0]
    400        dir_load_step3  v26, \bpc
    401        mov             v16.16b, v29.16b              // sum_alt[1]
    402 .else
    403        ext             v22.16b, v30.16b, v26.16b, #(16-2*\i)
    404        ext             v23.16b, v26.16b, v30.16b, #(16-2*\i)
    405        ext             v24.16b, v30.16b, v27.16b, #(16-2*\i)
    406        ext             v25.16b, v27.16b, v30.16b, #(16-2*\i)
    407 .if \i != 7 // Nothing to load for the final row
    408        dir_load_step1  v26, \bpc // Start setting up the next row early.
    409 .endif
    410        add             v0.8h,   v0.8h,   v22.8h      // sum_diag[0]
    411        add             v1.8h,   v1.8h,   v23.8h      // sum_diag[0]
    412        add             v2.8h,   v2.8h,   v24.8h      // sum_diag[1]
    413        add             v3.8h,   v3.8h,   v25.8h      // sum_diag[1]
    414 .if \i != 7
    415        dir_load_step2  v26, \bpc
    416 .endif
    417        ext             v22.16b, v30.16b, v28.16b, #(16-2*\i)
    418        ext             v23.16b, v28.16b, v30.16b, #(16-2*\i)
    419        ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
    420        ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
    421 .if \i != 7
    422        dir_load_step3  v26, \bpc
    423 .endif
    424        add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
    425        add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
    426        add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
    427        add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
    428 .endif
    429 .endr
    430 
    431        movi            v31.4s,  #105
    432 
    433        smull           v26.4s,  v4.4h,   v4.4h       // sum_hv[0]*sum_hv[0]
    434        smlal2          v26.4s,  v4.8h,   v4.8h
    435        smull           v27.4s,  v5.4h,   v5.4h       // sum_hv[1]*sum_hv[1]
    436        smlal2          v27.4s,  v5.8h,   v5.8h
    437        mul             v26.4s,  v26.4s,  v31.4s      // cost[2] *= 105
    438        mul             v27.4s,  v27.4s,  v31.4s      // cost[6] *= 105
    439        addv            s4,  v26.4s                   // cost[2]
    440        addv            s5,  v27.4s                   // cost[6]
    441 
    442        rev64           v1.8h,   v1.8h
    443        rev64           v3.8h,   v3.8h
    444        ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
    445        ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]
    446 
    447        str             s4,  [sp, #2*4]               // cost[2]
    448        str             s5,  [sp, #6*4]               // cost[6]
    449 
    450        movrel          x4,  div_table
    451        ld1             {v31.8h}, [x4]
    452 
    453        smull           v22.4s,  v0.4h,   v0.4h       // sum_diag[0]*sum_diag[0]
    454        smull2          v23.4s,  v0.8h,   v0.8h
    455        smlal           v22.4s,  v1.4h,   v1.4h
    456        smlal2          v23.4s,  v1.8h,   v1.8h
    457        smull           v24.4s,  v2.4h,   v2.4h       // sum_diag[1]*sum_diag[1]
    458        smull2          v25.4s,  v2.8h,   v2.8h
    459        smlal           v24.4s,  v3.4h,   v3.4h
    460        smlal2          v25.4s,  v3.8h,   v3.8h
    461        uxtl            v30.4s,  v31.4h               // div_table
    462        uxtl2           v31.4s,  v31.8h
    463        mul             v22.4s,  v22.4s,  v30.4s      // cost[0]
    464        mla             v22.4s,  v23.4s,  v31.4s      // cost[0]
    465        mul             v24.4s,  v24.4s,  v30.4s      // cost[4]
    466        mla             v24.4s,  v25.4s,  v31.4s      // cost[4]
    467        addv            s0,  v22.4s                   // cost[0]
    468        addv            s2,  v24.4s                   // cost[4]
    469 
    470        movrel          x5,  alt_fact
    471        ld1             {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
    472 
    473        str             s0,  [sp, #0*4]               // cost[0]
    474        str             s2,  [sp, #4*4]               // cost[4]
    475 
    476        uxtl            v29.4s,  v29.4h               // div_table[2*m+1] + 105
    477        uxtl            v30.4s,  v30.4h
    478        uxtl            v31.4s,  v31.4h
    479 
    480        cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
    481        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
    482        str             s6,  [sp, #1*4]               // cost[1]
    483        str             s16, [sp, #3*4]               // cost[3]
    484 
    485        mov             w0,  #0                       // best_dir
    486        mov             w1,  v0.s[0]                  // best_cost
    487        mov             w3,  #1                       // n
    488 
    489        str             s18, [sp, #5*4]               // cost[5]
    490        str             s20, [sp, #7*4]               // cost[7]
    491 
    492        mov             w4,  v6.s[0]
    493 
    494        find_best       v6,  v4, v16
    495        find_best       v16, v2, v18
    496        find_best       v18, v5, v20
    497        find_best       v20
    498 
    499        eor             w3,  w0,  #4                  // best_dir ^4
    500        ldr             w4,  [sp, w3, uxtw #2]
    501        sub             w1,  w1,  w4                  // best_cost - cost[best_dir ^ 4]
    502        lsr             w1,  w1,  #10
    503        str             w1,  [x2]                     // *var
    504 
    505        add             sp,  sp,  #32
    506 .if \bpc == 16
    507        ldr             d8,  [sp], 0x10
    508 .endif
    509        ret
    510 endfunc
    511 .endm