tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef.S (19101B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 #include "cdef_tmpl.S"
     31 
     32 .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
               // Pad two rows of the uint16_t tmp buffer at x0, widening the
               // 8-bit source pixels to 16 bits.  Used for the top rows
               // (\ret==0, falls through to label 3) and for the bottom rows
               // (\ret==1, returns when done).
               //   \s1/\s2:  pointers to the two source rows
               //   \w:       block width in pixels (4 or 8)
               //   \stride:  tmp buffer stride, in 16-bit pixels
               //   \rn/\rw:  register prefixes for a \w-pixel narrow load /
               //             wide store (s/d for w==4, d/q for w==8)
               //   w7:       CdefEdgeFlags bitmask
               // Edge pixels that don't exist are filled with the 0x8000
               // sentinel the caller (padding_func) keeps in v31.
     33        tst             w7,  #1 // CDEF_HAVE_LEFT
     34        b.eq            2f
     35        // CDEF_HAVE_LEFT
               // Step back 2 pixels so the loads below include the left border.
     36        sub             \s1,  \s1,  #2
     37        sub             \s2,  \s2,  #2
     38        tst             w7,  #2 // CDEF_HAVE_RIGHT
     39        b.eq            1f
     40        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
               // Per row: \w+2 pixels from the shifted pointer plus 4 more
               // starting at offset \w; widen u8 -> u16 and store all of them.
     41        ldr             \rn\()0, [\s1]
     42        ldr             s1,      [\s1, #\w]
     43        ldr             \rn\()2, [\s2]
     44        ldr             s3,      [\s2, #\w]
     45        uxtl            v0.8h,   v0.8b
     46        uxtl            v1.8h,   v1.8b
     47        uxtl            v2.8h,   v2.8b
     48        uxtl            v3.8h,   v3.8b
     49        str             \rw\()0, [x0]
     50        str             d1,      [x0, #2*\w]
     51        add             x0,  x0,  #2*\stride
     52        str             \rw\()2, [x0]
     53        str             d3,      [x0, #2*\w]
     54 .if \ret
     55        ret
     56 .else
     57        add             x0,  x0,  #2*\stride
     58        b               3f
     59 .endif
     60 
     61 1:
     62        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
               // Only 2 extra pixels exist past offset \w (loaded as h1/h3);
               // the 2 missing right-border pixels get the s31 sentinel.
     63        ldr             \rn\()0, [\s1]
     64        ldr             h1,      [\s1, #\w]
     65        ldr             \rn\()2, [\s2]
     66        ldr             h3,      [\s2, #\w]
     67        uxtl            v0.8h,   v0.8b
     68        uxtl            v1.8h,   v1.8b
     69        uxtl            v2.8h,   v2.8b
     70        uxtl            v3.8h,   v3.8b
     71        str             \rw\()0, [x0]
     72        str             s1,      [x0, #2*\w]
     73        str             s31,     [x0, #2*\w+4] // right border = sentinel
     74        add             x0,  x0,  #2*\stride
     75        str             \rw\()2, [x0]
     76        str             s3,      [x0, #2*\w]
     77        str             s31,     [x0, #2*\w+4]
     78 .if \ret
     79        ret
     80 .else
     81        add             x0,  x0,  #2*\stride
     82        b               3f
     83 .endif
     84 
     85 2:
     86        // !CDEF_HAVE_LEFT
     87        tst             w7,  #2 // CDEF_HAVE_RIGHT
     88        b.eq            1f
     89        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
               // Left border = sentinel; row data stored (unaligned) at +4,
               // followed by the 2 available right-border pixels.
     90        ldr             \rn\()0, [\s1]
     91        ldr             h1,      [\s1, #\w]
     92        ldr             \rn\()2, [\s2]
     93        ldr             h3,      [\s2, #\w]
     94        uxtl            v0.8h,  v0.8b
     95        uxtl            v1.8h,  v1.8b
     96        uxtl            v2.8h,  v2.8b
     97        uxtl            v3.8h,  v3.8b
     98        str             s31, [x0] // left border = sentinel
     99        stur            \rw\()0, [x0, #4]
    100        str             s1,      [x0, #4+2*\w]
    101        add             x0,  x0,  #2*\stride
    102        str             s31, [x0]
    103        stur            \rw\()2, [x0, #4]
    104        str             s3,      [x0, #4+2*\w]
    105 .if \ret
    106        ret
    107 .else
    108        add             x0,  x0,  #2*\stride
    109        b               3f
    110 .endif
    111 
    112 1:
    113        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
               // Sentinel on both sides; only the \w block pixels are real.
    114        ldr             \rn\()0, [\s1]
    115        ldr             \rn\()1, [\s2]
    116        uxtl            v0.8h,  v0.8b
    117        uxtl            v1.8h,  v1.8b
    118        str             s31,     [x0]
    119        stur            \rw\()0, [x0, #4]
    120        str             s31,     [x0, #4+2*\w]
    121        add             x0,  x0,  #2*\stride
    122        str             s31,     [x0]
    123        stur            \rw\()1, [x0, #4]
    124        str             s31,     [x0, #4+2*\w]
    125 .if \ret
    126        ret
    127 .else
    128        add             x0,  x0,  #2*\stride
    129 .endif
    130 3:
    131 .endm
    132 
    133 .macro load_n_incr dst, src, incr, w
               // Load \w source pixels into \dst and post-increment \src by
               // \incr: one 32-bit lane for \w==4, a full 8 bytes otherwise.
    134 .if \w == 4
    135        ld1             {\dst\().s}[0], [\src], \incr
    136 .else
    137        ld1             {\dst\().8b},   [\src], \incr
    138 .endif
    139 .endm
    140 
    141 // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
    142 //                                    ptrdiff_t src_stride, const pixel (*left)[2],
    143 //                                    const pixel *const top,
    144 //                                    const pixel *const bottom, int h,
    145 //                                    enum CdefEdgeFlags edges);
    146 
    147 .macro padding_func w, stride, rn, rw
               // Emits cdef_padding\w_8bpc_neon (see the C prototype comment in
               // this file): expands a \w-wide, h-tall block of 8-bit pixels,
               // plus a 2-pixel border on every side, into a uint16_t tmp
               // buffer with stride \stride.  Border pixels missing per the
               // edge flags in w7 are set to the 0x8000 sentinel.
               //   \rn/\rw: narrow/wide register prefixes for \w-pixel
               //            loads/stores (s/d for w==4, d/q for w==8)
    148 function cdef_padding\w\()_8bpc_neon, export=1
    149        cmp             w7,  #0xf // fully edged
    150        b.eq            cdef_padding\w\()_edged_8bpc_neon
               // v30/v31 = 0x8000 in every 16-bit lane: the "invalid" sentinel.
    151        movi            v30.8h,  #0x80, lsl #8
    152        mov             v31.16b, v30.16b
               // Rewind x0 to the top-left corner of the bordered area
               // (2 rows up, 2 pixels left, 2 bytes per pixel).
    153        sub             x0,  x0,  #2*(2*\stride+2)
    154        tst             w7,  #4 // CDEF_HAVE_TOP
    155        b.ne            1f
    156        // !CDEF_HAVE_TOP
               // Fill both top border rows entirely with the sentinel.
    157        st1             {v30.8h, v31.8h}, [x0], #32
    158 .if \w == 8
    159        st1             {v30.8h, v31.8h}, [x0], #32
    160 .endif
    161        b               3f
    162 1:
    163        // CDEF_HAVE_TOP
               // x4 = top pointer, x9 = second top row (top + src_stride).
    164        add             x9,  x4,  x2
    165        pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0
    166 
    167        // Middle section
               // Loop over h rows (w6); x1 = src, x2 = src_stride,
               // x3 = left[2] array (2 bytes per row).
    168 3:
    169        tst             w7,  #1 // CDEF_HAVE_LEFT
    170        b.eq            2f
    171        // CDEF_HAVE_LEFT
    172        tst             w7,  #2 // CDEF_HAVE_RIGHT
    173        b.eq            1f
    174        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
    175 0:
    176        ld1             {v0.h}[0], [x3], #2 // left[y][0..1]
    177        ldr             h2,      [x1, #\w]  // 2 right-border pixels
    178        load_n_incr     v1,  x1,  x2,  \w
    179        subs            w6,  w6,  #1
    180        uxtl            v0.8h,  v0.8b
    181        uxtl            v1.8h,  v1.8b
    182        uxtl            v2.8h,  v2.8b
    183        str             s0,      [x0]
    184        stur            \rw\()1, [x0, #4]
    185        str             s2,      [x0, #4+2*\w]
    186        add             x0,  x0,  #2*\stride
    187        b.gt            0b
    188        b               3f
    189 1:
    190        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
    191        ld1             {v0.h}[0], [x3], #2 // left[y][0..1]
    192        load_n_incr     v1,  x1,  x2,  \w
    193        subs            w6,  w6,  #1
    194        uxtl            v0.8h,  v0.8b
    195        uxtl            v1.8h,  v1.8b
    196        str             s0,      [x0]
    197        stur            \rw\()1, [x0, #4]
    198        str             s31,     [x0, #4+2*\w] // right border = sentinel
    199        add             x0,  x0,  #2*\stride
    200        b.gt            1b
    201        b               3f
    202 2:
    203        tst             w7,  #2 // CDEF_HAVE_RIGHT
    204        b.eq            1f
    205        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
    206 0:
    207        ldr             h1,      [x1, #\w]  // 2 right-border pixels
    208        load_n_incr     v0,  x1,  x2,  \w
    209        subs            w6,  w6,  #1
    210        uxtl            v0.8h,  v0.8b
    211        uxtl            v1.8h,  v1.8b
    212        str             s31,     [x0]       // left border = sentinel
    213        stur            \rw\()0, [x0, #4]
    214        str             s1,      [x0, #4+2*\w]
    215        add             x0,  x0,  #2*\stride
    216        b.gt            0b
    217        b               3f
    218 1:
    219        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
    220        load_n_incr     v0,  x1,  x2,  \w
    221        subs            w6,  w6,  #1
    222        uxtl            v0.8h,  v0.8b
    223        str             s31,     [x0]       // sentinel on both sides
    224        stur            \rw\()0, [x0, #4]
    225        str             s31,     [x0, #4+2*\w]
    226        add             x0,  x0,  #2*\stride
    227        b.gt            1b
    228 
    229 3:
    230        tst             w7,  #8 // CDEF_HAVE_BOTTOM
    231        b.ne            1f
    232        // !CDEF_HAVE_BOTTOM
               // Fill both bottom border rows entirely with the sentinel.
    233        st1             {v30.8h, v31.8h}, [x0], #32
    234 .if \w == 8
    235        st1             {v30.8h, v31.8h}, [x0], #32
    236 .endif
    237        ret
    238 1:
    239        // CDEF_HAVE_BOTTOM
               // x5 = bottom pointer, x9 = second bottom row; ret=1 returns.
    240        add             x9,  x5,  x2
    241        pad_top_bottom  x5,  x9, \w, \stride, \rn, \rw, 1
    242 endfunc
    243 .endm
    244 
    245 padding_func 8, 16, d, q // w=8: 16-pixel tmp stride, d/q registers
    246 padding_func 4, 8,  s, d // w=4:  8-pixel tmp stride, s/d registers
    247 
    248 // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
    249 //                                    ptrdiff_t src_stride, const pixel (*left)[2],
    250 //                                    const pixel *const top,
    251 //                                    const pixel *const bottom, int h,
    252 //                                    enum CdefEdgeFlags edges);
    253 
    254 .macro padding_func_edged w, stride, reg
               // Emits cdef_padding\w_edged_8bpc_neon (see the C prototype
               // comment in this file): the fast path for fully-edged blocks
               // (all four CDEF_HAVE_* flags set), which keeps the tmp buffer
               // in 8 bits instead of widening to 16.
               //   \w:      block width (4 or 8)
               //   \stride: tmp buffer stride in bytes
               //   \reg:    register prefix for a \w-byte store (s or d)
    255 function cdef_padding\w\()_edged_8bpc_neon, export=1
               // Include the 2-pixel left border in top/bottom row loads,
               // and rewind x0 to the top-left of the bordered area.
    256        sub             x4,  x4,  #2
    257        sub             x5,  x5,  #2
    258        sub             x0,  x0,  #(2*\stride+2)
    259 
               // Copy the two top rows (x4, x4 + src_stride).
    260 .if \w == 4
               // 2+4+2 = 8 bytes per row; two rows fill 16 contiguous bytes.
    261        ldr             d0, [x4]
    262        ldr             d1, [x4, x2]
    263        st1             {v0.8b, v1.8b}, [x0], #16
    264 .else
               // 2+8+2 = 12 bytes per row, stored as an 8-byte + 4-byte pair.
    265        add             x9,  x4,  x2
    266        ldr             d0, [x4]
    267        ldr             s1, [x4, #8]
    268        ldr             d2, [x9]
    269        ldr             s3, [x9, #8]
    270        str             d0, [x0]
    271        str             s1, [x0, #8]
    272        str             d2, [x0, #\stride]
    273        str             s3, [x0, #\stride+8]
    274        add             x0,  x0,  #2*\stride
    275 .endif
    276 
               // Middle rows: left[y][0..1], \w block pixels, 2 right pixels.
    277 0:
    278        ld1             {v0.h}[0], [x3], #2
    279        ldr             h2,      [x1, #\w]
    280        load_n_incr     v1,  x1,  x2,  \w
    281        subs            w6,  w6,  #1
    282        str             h0,      [x0]
    283        stur            \reg\()1, [x0, #2]
    284        str             h2,      [x0, #2+\w]
    285        add             x0,  x0,  #\stride
    286        b.gt            0b
    287 
               // Copy the two bottom rows (x5, x5 + src_stride).
    288 .if \w == 4
    289        ldr             d0, [x5]
    290        ldr             d1, [x5, x2]
    291        st1             {v0.8b, v1.8b}, [x0], #16
    292 .else
    293        add             x9,  x5,  x2
    294        ldr             d0, [x5]
    295        ldr             s1, [x5, #8]
    296        ldr             d2, [x9]
    297        ldr             s3, [x9, #8]
    298        str             d0, [x0]
    299        str             s1, [x0, #8]
    300        str             d2, [x0, #\stride]
    301        str             s3, [x0, #\stride+8]
    302 .endif
    303        ret
    304 endfunc
    305 .endm
    306 
    307 padding_func_edged 8, 16, d // w=8: 16-byte tmp stride
    308 padding_func_edged 4, 8,  s // w=4:  8-byte tmp stride
    309 
               // NOTE(review): tables, filter and find_dir are not defined in
               // this file; presumably provided by the included cdef_tmpl.S —
               // verify there.
    310 tables
    311 
    312 filter 8, 8
    313 filter 4, 8
    314 
    315 find_dir 8
    316 
    317 .macro load_px_8 d1, d2, w
               // Load the tap pixel pair at x +/- off for every row of the
               // group being filtered.  w9 holds the signed byte offset "off"
               // into the 8-bit tmp buffer at x2 (stride 16 for \w==8,
               // 8 for \w==4).  \d1 collects the p0 (x + off) rows,
               // \d2 the p1 (x - off) rows, one row per 64/32-bit lane.
    318 .if \w == 8
    319        add             x6,  x2,  w9, sxtb          // x + off
    320        sub             x9,  x2,  w9, sxtb          // x - off
    321        ld1             {\d1\().d}[0], [x6]         // p0
    322        add             x6,  x6,  #16               // += stride
    323        ld1             {\d2\().d}[0], [x9]         // p1
    324        add             x9,  x9,  #16               // += stride
    325        ld1             {\d1\().d}[1], [x6]         // p0
    326        ld1             {\d2\().d}[1], [x9]         // p0
    327 .else
    328        add             x6,  x2,  w9, sxtb          // x + off
    329        sub             x9,  x2,  w9, sxtb          // x - off
    330        ld1             {\d1\().s}[0], [x6]         // p0
    331        add             x6,  x6,  #8                // += stride
    332        ld1             {\d2\().s}[0], [x9]         // p1
    333        add             x9,  x9,  #8                // += stride
    334        ld1             {\d1\().s}[1], [x6]         // p0
    335        add             x6,  x6,  #8                // += stride
    336        ld1             {\d2\().s}[1], [x9]         // p1
    337        add             x9,  x9,  #8                // += stride
    338        ld1             {\d1\().s}[2], [x6]         // p0
    339        add             x6,  x6,  #8                // += stride
    340        ld1             {\d2\().s}[2], [x9]         // p1
    341        add             x9,  x9,  #8                // += stride
    342        ld1             {\d1\().s}[3], [x6]         // p0
    343        ld1             {\d2\().s}[3], [x9]         // p1
    344 .endif
    345 .endm
    346 .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
               // Apply the CDEF constrain() to one tap pair and accumulate
               // taps[k] * constrain() into the split sums v1/v2.
               //   \s1/\s2:     the loaded p0/p1 pixel vectors
               //   \thresh_vec: per-lane strength threshold
               //   \shift:      per-lane negated shift (ushl by a negative
               //                amount performs the right shift)
               //   \tap:        GPR holding taps[k]
               //   \min:        nonzero to also track the running min (v3) /
               //                max (v4) of the tap pixels for clipping
               // v0 holds the centre pixels px.
    347 .if \min
    348        umin            v3.16b,  v3.16b,  \s1\().16b
    349        umax            v4.16b,  v4.16b,  \s1\().16b
    350        umin            v3.16b,  v3.16b,  \s2\().16b
    351        umax            v4.16b,  v4.16b,  \s2\().16b
    352 .endif
    353        uabd            v16.16b, v0.16b,  \s1\().16b  // abs(diff)
    354        uabd            v20.16b, v0.16b,  \s2\().16b  // abs(diff)
    355        ushl            v17.16b, v16.16b, \shift      // abs(diff) >> shift
    356        ushl            v21.16b, v20.16b, \shift      // abs(diff) >> shift
    357        uqsub           v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
    358        uqsub           v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
    359        cmhi            v18.16b, v0.16b,  \s1\().16b  // px > p0
    360        cmhi            v22.16b, v0.16b,  \s2\().16b  // px > p1
    361        umin            v17.16b, v17.16b, v16.16b     // imin(abs(diff), clip)
    362        umin            v21.16b, v21.16b, v20.16b     // imin(abs(diff), clip)
    363        dup             v19.16b, \tap                 // taps[k]
    364        neg             v16.16b, v17.16b              // -imin()
    365        neg             v20.16b, v21.16b              // -imin()
    366        bsl             v18.16b, v16.16b, v17.16b     // constrain() = apply_sign()
    367        bsl             v22.16b, v20.16b, v21.16b     // constrain() = apply_sign()
    368        mla             v1.16b,  v18.16b, v19.16b     // sum += taps[k] * constrain()
    369        mla             v2.16b,  v22.16b, v19.16b     // sum += taps[k] * constrain()
    370 .endm
    371 
    372 // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
    373 //                                   const uint8_t *tmp, int pri_strength,
    374 //                                   int sec_strength, int dir, int damping,
    375 //                                   int h);
    376 .macro filter_func_8 w, pri, sec, min, suffix
               // Emits cdef_filter\w\suffix_edged_8bpc_neon (see the C
               // prototype comment in this file): CDEF filter over the 8-bit
               // tmp buffer produced by the edged padding functions.
               //   \pri/\sec: enable the primary/secondary taps
               //   \min:      clip the result to the min/max of the taps
               // In:  x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
               //      w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
    377 function cdef_filter\w\suffix\()_edged_8bpc_neon
    378 .if \pri
               // pri_taps has two 2-entry rows; bit 0 of pri_strength selects
               // the row.
    379        movrel          x8,  pri_taps
    380        and             w9,  w3,  #1
    381        add             x8,  x8,  w9, uxtw #1
    382 .endif
    383        movrel          x9,  directions\w
    384        add             x5,  x9,  w5, uxtw #1
    385        movi            v30.8b,  #7
    386        dup             v28.8b,  w6                 // damping
    387 
    388 .if \pri
    389        dup             v25.16b, w3                 // threshold
    390 .endif
    391 .if \sec
    392        dup             v27.16b, w4                 // threshold
    393 .endif
               // Compute both shifts at once: lane 0 = pri, lane 1 = sec.
               // (For pri-only/sec-only variants the other lane holds garbage
               // from the uninitialized threshold register, but is never read.)
    394        trn1            v24.8b,  v25.8b, v27.8b
    395        clz             v24.8b,  v24.8b             // clz(threshold)
    396        sub             v24.8b,  v30.8b, v24.8b     // ulog2(threshold)
    397        uqsub           v24.8b,  v28.8b, v24.8b     // shift = imax(0, damping - ulog2(threshold))
    398        neg             v24.8b,  v24.8b             // -shift
    399 .if \sec
    400        dup             v26.16b, v24.b[1]
    401 .endif
    402 .if \pri
    403        dup             v24.16b, v24.b[0]
    404 .endif
    405 
               // Outer loop: one iteration filters 2 rows (w==8) or 4 rows
               // (w==4) packed into the lanes of v0.
    406 1:
    407 .if \w == 8
    408        add             x12, x2,  #16
    409        ld1             {v0.d}[0], [x2]             // px
    410        ld1             {v0.d}[1], [x12]            // px
    411 .else
    412        add             x12, x2,  #1*8
    413        add             x13, x2,  #2*8
    414        add             x14, x2,  #3*8
    415        ld1             {v0.s}[0], [x2]             // px
    416        ld1             {v0.s}[1], [x12]            // px
    417        ld1             {v0.s}[2], [x13]            // px
    418        ld1             {v0.s}[3], [x14]            // px
    419 .endif
    420 
    421        // We need 9-bits or two 8-bit accumulators to fit the sum.
    422        // Max |sum| is at most 15*2*6 (pri) + 4*4*3 (sec) = 228.
    423        // Start sum at -1 instead of 0 to help handle rounding later.
    424        movi            v1.16b, #255                // sum
    425        movi            v2.16b, #0                  // sum
    426 .if \min
    427        mov             v3.16b, v0.16b              // min
    428        mov             v4.16b, v0.16b              // max
    429 .endif
    430 
    431        // Instead of loading sec_taps 2, 1 from memory, just set it
    432        // to 2 initially and decrease for the second round.
    433        // This is also used as loop counter.
    434        mov             w11, #2                     // sec_taps[0]
    435 
               // Inner loop: two rounds over the direction offsets at x5.
    436 2:
    437 .if \pri
    438        ldrb            w9,  [x5]                   // off1
    439 
    440        load_px_8       v5,  v6, \w
    441 .endif
    442 
    443 .if \sec
    444        add             x5,  x5,  #4                // +2*2
    445        ldrb            w9,  [x5]                   // off2
               // v28 (damping dup) is dead by now; reuse it as a load target.
    446        load_px_8       v28, v29, \w
    447 .endif
    448 
    449 .if \pri
    450        ldrb            w10, [x8]                   // *pri_taps
    451 
    452        handle_pixel_8  v5,  v6,  v25.16b, v24.16b, w10, \min
    453 .endif
    454 
    455 .if \sec
    456        add             x5,  x5,  #8                // +2*4
    457        ldrb            w9,  [x5]                   // off3
    458        load_px_8       v5,  v6,  \w
    459 
    460        handle_pixel_8  v28, v29, v27.16b, v26.16b, w11, \min
    461 
    462        handle_pixel_8  v5,  v6,  v27.16b, v26.16b, w11, \min
    463 
    464        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
    465 .else
    466        add             x5,  x5,  #1                // x5 += 1
    467 .endif
    468        subs            w11, w11, #1                // sec_tap-- (value)
    469 .if \pri
    470        add             x8,  x8,  #1                // pri_taps++ (pointer)
    471 .endif
    472        b.ne            2b
    473 
    474        // Perform halving adds since the value won't fit otherwise.
    475        // To handle the offset for negative values, use both halving w/ and w/o rounding.
    476        srhadd          v5.16b,  v1.16b,  v2.16b    // sum >> 1
    477        shadd           v6.16b,  v1.16b,  v2.16b    // (sum - 1) >> 1
    478        cmlt            v1.16b,  v5.16b,  #0        // sum < 0
    479        bsl             v1.16b,  v6.16b,  v5.16b    // (sum - (sum < 0)) >> 1
    480 
    481        srshr           v1.16b,  v1.16b,  #3        // (8 + sum - (sum < 0)) >> 4
    482 
    483        usqadd          v0.16b,  v1.16b             // px + (8 + sum ...) >> 4
    484 .if \min
    485        umin            v0.16b,  v0.16b,  v4.16b
    486        umax            v0.16b,  v0.16b,  v3.16b    // iclip(px + .., min, max)
    487 .endif
    488 .if \w == 8
    489        st1             {v0.d}[0], [x0], x1
    490        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
    491        subs            w7,  w7,  #2                // h -= 2
    492        st1             {v0.d}[1], [x0], x1
    493 .else
    494        st1             {v0.s}[0], [x0], x1
    495        add             x2,  x2,  #4*8              // tmp += 4*tmp_stride
    496        st1             {v0.s}[1], [x0], x1
    497        subs            w7,  w7,  #4                // h -= 4
    498        st1             {v0.s}[2], [x0], x1
    499        st1             {v0.s}[3], [x0], x1
    500 .endif
    501 
    502        // Reset pri_taps and directions back to the original point
    503        sub             x5,  x5,  #2
    504 .if \pri
    505        sub             x8,  x8,  #2
    506 .endif
    507 
    508        b.gt            1b
    509        ret
    510 endfunc
    511 .endm
    512 
    513 .macro filter_8 w
               // Instantiate the three edged-filter variants for width \w:
               // primary-only, secondary-only, and combined (with min/max clip).
    514 filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
    515 filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
    516 filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
    517 .endm
    518 
    519 filter_8 8 // 8-pixel wide edged filters
    520 filter_8 4 // 4-pixel wide edged filters