/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
// In the comments, let RefTable denote the original, reference table.
//
// Memory layout (referenced by offset below):
//   +0:  RangeMins  (32 bytes)
//   +32: DiffMasks  (32 bytes)
//   +64: RefLo      (16 bytes)
const x_by_x_tables
        // RangeMins
        //
        // Min(RefTable[i*8:i*8+8])
        // First two values are zeroed.
        //
        // Lookup using RangeMins[(x >> 3)]
        .byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2
        .byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0

        // DiffMasks
        //
        // This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
        // in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
        // RefTable changes at that particular index.
        // Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
        // the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
        //
        // Lookup using DiffMasks[(x >> 3)]
        .byte 0x00, 0x00, 0xD4, 0x44
        .byte 0x42, 0x04, 0x00, 0x00
        .byte 0x00, 0x80, 0x00, 0x00
        .byte 0x04, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x40, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x02
        // Binary form:
        // 0b00000000, 0b00000000, 0b11010100, 0b01000100
        // 0b01000010, 0b00000100, 0b00000000, 0b00000000
        // 0b00000000, 0b10000000, 0b00000000, 0b00000000
        // 0b00000100, 0b00000000, 0b00000000, 0b00000000
        // 0b00000000, 0b00000000, 0b00000000, 0b00000000
        // 0b00000000, 0b01000000, 0b00000000, 0b00000000
        // 0b00000000, 0b00000000, 0b00000000, 0b00000000
        // 0b00000000, 0b00000000, 0b00000000, 0b00000010

        // RefLo
        //
        // RefTable[0:16]
        // i.e. First 16 elements of the original table.
        // Add to the sum obtained in the rest of the other lut logic to include the first 16 bytes of RefTable.
        //
        // Lookup using RefLo[x] (tbl will replace x > 15 with 0)
        .byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16

        // Pseudo assembly
        //
        // hi_bits = x >> 3
        // tbl ref, {RefLo}, x
        // tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
        // tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits
        // lo_bits = x & 0x7
        // diffs = diffs << lo_bits
        // ref = ref + min
        // integral = popcnt(diffs)
        // ref = ref + integral
        // return ref
endconst

// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
//
// Sums three rows of sumsq/sum, then computes the SGR "x" coefficient via
// the x_by_x_tables LUT above, storing x to BB and the derived
// srshr(x * BB[i] * 455, 12) value to AA. Processes 16 pixels per loop
// iteration (subs w4, #16), over w+2 elements in total.
//
// Register roles in the loop:
//   v24-v25 = RangeMins, v26-v27 = DiffMasks, v23 = RefLo, v22 = 0x7 mask
//   v28 = strength, v30 = one_by_x (455), v31 = n (9)
//   v6 = -bitdepth_min_8 (for b), v7 = -2*bitdepth_min_8 (for a)
// Callee-saved d8-d15 are spilled/restored around the kernel (AAPCS64).
function sgr_box3_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2                    // process w+2 elements
        clz             w9,  w6                         // bitdepth_max
        dup             v28.4s, w5                      // strength

        ldp             x5,  x6,  [x0]                  // sumsq[0], sumsq[1]
        ldr             x0,  [x0, #16]                  // sumsq[2]
        ldp             x7,  x8,  [x1]                  // sum[0], sum[1]
        ldr             x1,  [x1, #16]                  // sum[2]

        movi            v31.4s, #9                      // n

        sub             w9,  w9,  #24                   // -bitdepth_min_8
        movrel          x12, x_by_x_tables
        mov             w13, #455                       // one_by_x
        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
        movi            v22.16b, #0x7
        ldr             q23, [x12, #64]                 // RefLo
        dup             v6.8h,  w9                      // -bitdepth_min_8
        saddl           v7.4s,  v6.4h, v6.4h            // -2*bitdepth_min_8
        dup             v30.4s, w13                     // one_by_x

        // Prime the software pipeline: first batch of all three rows.
        ld1             {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64
        ld1             {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
        ld1             {v20.8h, v21.8h}, [x8], #32
        ld1             {v0.8h, v1.8h}, [x7], #32
1:
        ld1             {v2.8h, v3.8h}, [x1], #32
        // a = sumsq[0] + sumsq[1] + sumsq[2]
        add             v8.4s,  v8.4s,  v12.4s
        add             v9.4s,  v9.4s,  v13.4s
        add             v10.4s, v10.4s, v14.4s
        add             v11.4s, v11.4s, v15.4s
        // b = sum[0] + sum[1] + sum[2]
        add             v0.8h,  v0.8h,  v20.8h
        add             v1.8h,  v1.8h,  v21.8h

        add             v16.4s, v16.4s, v8.4s
        add             v17.4s, v17.4s, v9.4s
        add             v18.4s, v18.4s, v10.4s
        add             v19.4s, v19.4s, v11.4s
        add             v4.8h,  v2.8h,  v0.8h
        add             v5.8h,  v3.8h,  v1.8h

        // Scale down to 8-bit range before forming a*n - b*b.
        srshl           v16.4s, v16.4s, v7.4s
        srshl           v17.4s, v17.4s, v7.4s
        srshl           v18.4s, v18.4s, v7.4s
        srshl           v19.4s, v19.4s, v7.4s
        srshl           v9.8h,  v4.8h,  v6.8h
        srshl           v13.8h, v5.8h,  v6.8h
        mul             v16.4s, v16.4s, v31.4s          // a * n
        mul             v17.4s, v17.4s, v31.4s          // a * n
        mul             v18.4s, v18.4s, v31.4s          // a * n
        mul             v19.4s, v19.4s, v31.4s          // a * n
        umull           v8.4s,  v9.4h,  v9.4h           // b * b
        umull2          v9.4s,  v9.8h,  v9.8h           // b * b
        umull           v12.4s, v13.4h, v13.4h          // b * b
        umull2          v13.4s, v13.8h, v13.8h          // b * b
        uqsub           v16.4s, v16.4s, v8.4s           // imax(a * n - b * b, 0)
        uqsub           v17.4s, v17.4s, v9.4s           // imax(a * n - b * b, 0)
        uqsub           v18.4s, v18.4s, v12.4s          // imax(a * n - b * b, 0)
        uqsub           v19.4s, v19.4s, v13.4s          // imax(a * n - b * b, 0)
        mul             v16.4s, v16.4s, v28.4s          // p * s
        mul             v17.4s, v17.4s, v28.4s          // p * s
        mul             v18.4s, v18.4s, v28.4s          // p * s
        mul             v19.4s, v19.4s, v28.4s          // p * s
        uqshrn          v16.4h, v16.4s, #16
        uqshrn2         v16.8h, v17.4s, #16
        uqshrn          v18.4h, v18.4s, #16
        uqshrn2         v18.8h, v19.4s, #16
        uqrshrn         v1.8b,  v16.8h, #4              // imin(z, 255)
        uqrshrn2        v1.16b, v18.8h, #4              // imin(z, 255)

        // Preload next iteration's rows while the LUT lookup proceeds.
        ld1             {v16.4s, v17.4s}, [x0], #32
        subs            w4,  w4,  #16

        // LUT: x = RefTable[z], reconstructed from the compressed tables
        // (see pseudo assembly above the endconst).
        ushr            v0.16b, v1.16b, #3              // hi_bits = z >> 3
        ld1             {v8.4s, v9.4s}, [x5], #32
        // NOTE: v24..v25 hold RangeMins and v26..v27 hold DiffMasks (memory
        // order of the const block); earlier revisions had these two comments
        // swapped.
        tbl             v2.16b, {v26.16b, v27.16b}, v0.16b // DiffMasks[z >> 3]
        tbl             v0.16b, {v24.16b, v25.16b}, v0.16b // RangeMins[z >> 3]
        tbl             v3.16b, {v23.16b}, v1.16b       // RefLo[z] (0 for z > 15)
        and             v1.16b, v1.16b, v22.16b         // lo_bits = z & 0x7
        ld1             {v12.4s, v13.4s}, [x6], #32
        ushl            v1.16b, v2.16b, v1.16b          // diffs << lo_bits
        ld1             {v20.8h, v21.8h}, [x8], #32
        add             v3.16b, v3.16b, v0.16b          // ref + min
        cnt             v1.16b, v1.16b                  // integral = popcnt(diffs)
        ld1             {v18.4s, v19.4s}, [x0], #32
        add             v3.16b, v3.16b, v1.16b          // ref + min + integral
        ld1             {v10.4s, v11.4s}, [x5], #32
        uxtl            v0.8h,  v3.8b                   // x
        uxtl2           v1.8h,  v3.16b                  // x

        ld1             {v14.4s, v15.4s}, [x6], #32

        umull           v2.4s,  v0.4h,  v4.4h           // x * BB[i]
        umull2          v3.4s,  v0.8h,  v4.8h           // x * BB[i]
        umull           v4.4s,  v1.4h,  v5.4h           // x * BB[i]
        umull2          v5.4s,  v1.8h,  v5.8h           // x * BB[i]
        mul             v2.4s,  v2.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v3.4s,  v3.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v4.4s,  v4.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v5.4s,  v5.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        st1             {v0.8h, v1.8h}, [x3], #32       // store x to BB
        ld1             {v0.8h, v1.8h}, [x7], #32
        srshr           v2.4s,  v2.4s,  #12             // AA[i]
        srshr           v3.4s,  v3.4s,  #12             // AA[i]
        srshr           v4.4s,  v4.4s,  #12             // AA[i]
        srshr           v5.4s,  v5.4s,  #12             // AA[i]

        st1             {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
//
// Same structure as sgr_box3_vert_neon, but sums five rows (n = 25,
// one_by_x = 164) and processes 8 pixels per loop iteration.
function sgr_box5_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]

        add             w4,  w4,  #2                    // process w+2 elements
        clz             w15, w6                         // bitdepth_max
        dup             v28.4s, w5                      // strength

        ldp             x5,  x6,  [x0]                  // sumsq[0], sumsq[1]
        ldp             x7,  x8,  [x0, #16]             // sumsq[2], sumsq[3]
        ldr             x0,  [x0, #32]                  // sumsq[4]
        ldp             x9,  x10, [x1]                  // sum[0], sum[1]
        ldp             x11, x12, [x1, #16]             // sum[2], sum[3]
        ldr             x1,  [x1, #32]                  // sum[4]

        movi            v31.4s, #25                     // n

        sub             w15, w15, #24                   // -bitdepth_min_8
        movrel          x13, x_by_x_tables
        movi            v30.4s, #164                    // one_by_x
        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
        dup             v6.8h,  w15                     // -bitdepth_min_8
        movi            v19.8b, #0x7
        ldr             q18, [x13, #64]                 // RefLo
        saddl           v7.4s,  v6.4h, v6.4h            // -2*bitdepth_min_8

        // Prime the software pipeline: first batch of all five rows.
        ld1             {v8.4s, v9.4s},   [x5],  #32
        ld1             {v10.4s, v11.4s}, [x6],  #32
        ld1             {v12.4s, v13.4s}, [x7],  #32
        ld1             {v16.4s, v17.4s}, [x8],  #32
        ld1             {v20.8h}, [x9],  #16
        ld1             {v21.8h}, [x10], #16
        ld1             {v22.8h}, [x11], #16
        ld1             {v23.8h}, [x12], #16
        ld1             {v0.4s, v1.4s},   [x0],  #32
        ld1             {v2.8h}, [x1],  #16

1:
        // a = sumsq[0] + sumsq[1] + sumsq[2] + sumsq[3] + sumsq[4]
        add             v8.4s,  v8.4s,  v10.4s
        add             v9.4s,  v9.4s,  v11.4s
        add             v12.4s, v12.4s, v16.4s
        add             v13.4s, v13.4s, v17.4s

        // b = sum[0] + sum[1] + sum[2] + sum[3] + sum[4]
        add             v20.8h, v20.8h, v21.8h
        add             v22.8h, v22.8h, v23.8h

        add             v0.4s,  v0.4s,  v8.4s
        add             v1.4s,  v1.4s,  v9.4s
        add             v2.8h,  v2.8h,  v20.8h

        add             v0.4s,  v0.4s,  v12.4s
        add             v1.4s,  v1.4s,  v13.4s
        add             v2.8h,  v2.8h,  v22.8h

        subs            w4,  w4,  #8

        // Scale down to 8-bit range before forming a*n - b*b.
        srshl           v0.4s,  v0.4s,  v7.4s
        srshl           v1.4s,  v1.4s,  v7.4s
        srshl           v4.8h,  v2.8h,  v6.8h
        mul             v0.4s,  v0.4s,  v31.4s          // a * n
        mul             v1.4s,  v1.4s,  v31.4s          // a * n
        umull           v3.4s,  v4.4h,  v4.4h           // b * b
        umull2          v4.4s,  v4.8h,  v4.8h           // b * b
        uqsub           v0.4s,  v0.4s,  v3.4s           // imax(a * n - b * b, 0)
        uqsub           v1.4s,  v1.4s,  v4.4s           // imax(a * n - b * b, 0)
        mul             v0.4s,  v0.4s,  v28.4s          // p * s
        mul             v1.4s,  v1.4s,  v28.4s          // p * s
        ld1             {v8.4s, v9.4s}, [x5], #32
        uqshrn          v0.4h,  v0.4s,  #16
        uqshrn2         v0.8h,  v1.4s,  #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,  v0.8h,  #4              // imin(z, 255)

        ld1             {v12.4s, v13.4s}, [x7], #32

        // LUT: x = RefTable[z] (see pseudo assembly above the endconst).
        ushr            v1.8b,  v0.8b,  #3              // hi_bits = z >> 3
        ld1             {v16.4s, v17.4s}, [x8], #32
        // NOTE: v24..v25 hold RangeMins and v26..v27 hold DiffMasks (memory
        // order of the const block); earlier revisions had these two comments
        // swapped.
        tbl             v5.8b,  {v26.16b, v27.16b}, v1.8b // DiffMasks[z >> 3]
        tbl             v1.8b,  {v24.16b, v25.16b}, v1.8b // RangeMins[z >> 3]
        tbl             v4.8b,  {v18.16b}, v0.8b        // RefLo[z] (0 for z > 15)
        and             v0.8b,  v0.8b,  v19.8b          // lo_bits = z & 0x7
        ld1             {v20.8h}, [x9],  #16
        ushl            v5.8b,  v5.8b,  v0.8b           // diffs << lo_bits
        add             v4.8b,  v4.8b,  v1.8b           // ref + min
        ld1             {v21.8h}, [x10], #16
        cnt             v5.8b,  v5.8b                   // integral = popcnt(diffs)
        ld1             {v22.8h}, [x11], #16
        add             v5.8b,  v4.8b,  v5.8b           // ref + min + integral
        ld1             {v23.8h}, [x12], #16
        uxtl            v5.8h,  v5.8b                   // x

        ld1             {v0.4s, v1.4s}, [x0], #32
        umull           v3.4s,  v5.4h,  v2.4h           // x * BB[i]
        umull2          v4.4s,  v5.8h,  v2.8h           // x * BB[i]
        mul             v3.4s,  v3.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v4.4s,  v4.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,  v3.4s,  #12             // AA[i]
        srshr           v4.4s,  v4.4s,  #12             // AA[i]
        ld1             {v2.8h}, [x1],  #16

        st1             {v3.4s, v4.4s}, [x2], #32
        st1             {v5.8h}, [x3], #16              // store x to BB
        b.gt            1b

        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc