tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration_common.S (8383B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 // void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
     32 //                                int32_t *sumsq_out, int16_t *sum_out,
     33 //                                const int w);
        //
        // Adds three rows element by element: the int32 rows sumsq[0], sumsq[1],
        // sumsq[2] are summed into sumsq_out, and the int16 rows sum[0], sum[1],
        // sum[2] are summed into sum_out.
        // In:  r0 = sumsq, r1 = sum, r2 = sumsq_out, r3 = sum_out, stack = w.
        // Eight elements are handled per iteration and the loop runs until w + 2
        // elements are covered, so reads and writes extend to w + 2 rounded up to
        // a multiple of 8.
        // NOTE(review): callers presumably pad the rows accordingly - confirm.
     34 function sgr_box3_row_v_neon, export=1
     35        push            {r4-r9,lr}              // 7 registers = 28 bytes
     36        ldr             r4,  [sp, #28]          // r4 = w (first stack argument, above the 28 saved bytes)
     37        ldrd            r6,  r7,  [r0]          // r6 = sumsq[0], r7 = sumsq[1]
     38        ldr             r0,       [r0, #8]      // r0 = sumsq[2] (replaces the pointer array)
     39        add             r4,  r4,  #2            // loop counter = w + 2
     40        ldrd            r8,  r9,  [r1]          // r8 = sum[0], r9 = sum[1]
     41        ldr             r1,       [r1, #8]      // r1 = sum[2] (replaces the pointer array)
     42 
     43 1:                                             // one iteration = 8 output elements
     44        vld1.32         {q8,  q9},  [r6]!       // 8 x int32 from sumsq[0]
     45        vld1.32         {q10, q11}, [r7]!       // 8 x int32 from sumsq[1]
     46        vld1.16         {q14},      [r8]!       // 8 x int16 from sum[0]
     47        vld1.16         {q15},      [r9]!       // 8 x int16 from sum[1]
     48        subs            r4,  r4,  #8            // sets the flags tested by bgt below
     49 
     50        vadd.i32        q8, q8, q10             // sumsq[0] + sumsq[1]
     51        vadd.i32        q9, q9, q11
     52 
     53        vld1.32         {q12, q13}, [r0]!       // 8 x int32 from sumsq[2]
     54 
     55        vadd.i16        q14, q14, q15           // sum[0] + sum[1]
     56 
     57        vld1.16         {q15},      [r1]!       // 8 x int16 from sum[2] (q15 reused)
     58        vadd.i32        q8,  q8,  q12           // + sumsq[2]
     59        vadd.i32        q9,  q9,  q13
     60        vadd.i16        q14, q14, q15           // + sum[2]
     61 
     62        vst1.32         {q8,  q9},  [r2]!       // write 8 x int32 to sumsq_out
     63        vst1.16         {q14},      [r3]!       // write 8 x int16 to sum_out
     64 
     65        bgt             1b                      // loop while the remaining count is > 0
     66        pop             {r4-r9,pc}              // restore registers and return
     67 endfunc
     68 
     69 // void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
     70 //                                int32_t *sumsq_out, int16_t *sum_out,
     71 //                                const int w);
        //
        // Adds five rows element by element: the int32 rows sumsq[0..4] are summed
        // into sumsq_out, and the int16 rows sum[0..4] are summed into sum_out.
        // In:  r0 = sumsq, r1 = sum, r2 = sumsq_out, r3 = sum_out, stack = w.
        // Eight elements are handled per iteration and the loop runs until w + 2
        // elements are covered, so reads and writes extend to w + 2 rounded up to
        // a multiple of 8.
        // NOTE(review): callers presumably pad the rows accordingly - confirm.
     72 function sgr_box5_row_v_neon, export=1
     73        push            {r4-r11,lr}             // 9 registers = 36 bytes
     74        ldr             lr,  [sp, #36]          // lr = w (first stack argument); lr doubles as the loop counter
     75 
     76        ldrd            r4,  r5,  [r0]          // r4 = sumsq[0], r5 = sumsq[1]
     77        ldrd            r6,  r7,  [r0, #8]      // r6 = sumsq[2], r7 = sumsq[3]
     78        ldr             r0,       [r0, #16]     // r0 = sumsq[4] (replaces the pointer array)
     79        add             lr,  lr,  #2            // loop counter = w + 2
     80        ldrd            r8,  r9,  [r1]          // r8 = sum[0], r9 = sum[1]
     81        ldrd            r10, r11, [r1, #8]      // r10 = sum[2], r11 = sum[3]
     82        ldr             r1,       [r1, #16]     // r1 = sum[4] (replaces the pointer array)
     83 
     84 1:                                             // one iteration = 8 output elements
     85        vld1.32         {q8,  q9},  [r4]!       // 8 x int32 from sumsq[0]
     86        vld1.32         {q10, q11}, [r5]!       // 8 x int32 from sumsq[1]
     87        vld1.32         {q12, q13}, [r6]!       // 8 x int32 from sumsq[2]
     88        vld1.32         {q14, q15}, [r7]!       // 8 x int32 from sumsq[3]
     89        vld1.16         {q0},       [r8]!       // 8 x int16 from sum[0]
     90        vld1.16         {q1},       [r9]!       // 8 x int16 from sum[1]
     91        vld1.16         {q2},       [r10]!      // 8 x int16 from sum[2]
     92        vld1.16         {q3},       [r11]!      // 8 x int16 from sum[3]
     93        subs            lr,  lr,  #8            // sets the flags tested by bgt below
     94 
     95        vadd.i32        q8,  q8,  q10           // sumsq[0] + sumsq[1]
     96        vadd.i32        q9,  q9,  q11
     97        vadd.i32        q12, q12, q14           // sumsq[2] + sumsq[3]
     98        vadd.i32        q13, q13, q15
     99 
    100        vld1.32         {q14, q15}, [r0]!       // 8 x int32 from sumsq[4] (q14/q15 reused)
    101 
    102        vadd.i16        q0,  q0,  q1            // sum[0] + sum[1]
    103        vadd.i16        q2,  q2,  q3            // sum[2] + sum[3]
    104 
    105        vld1.16         {q3},       [r1]!       // 8 x int16 from sum[4] (q3 reused)
    106        vadd.i32        q8,  q8,  q12           // sumsq rows 0..3
    107        vadd.i32        q9,  q9,  q13
    108        vadd.i16        q0,  q0,  q2            // sum rows 0..3
    109 
    110        vadd.i32        q8,  q8,  q14           // + sumsq[4]
    111        vadd.i32        q9,  q9,  q15
    112        vadd.i16        q0,  q0,  q3            // + sum[4]
    113 
    114        vst1.32         {q8,  q9},  [r2]!       // write 8 x int32 to sumsq_out
    115        vst1.16         {q0},       [r3]!       // write 8 x int16 to sum_out
    116 
    117        bgt             1b                      // loop while the remaining count is > 0
    118        pop             {r4-r11,pc}             // restore registers and return
    119 endfunc
    120 
    121 // void dav1d_sgr_calc_row_ab1_neon(int32_t *a, int16_t *b,
    122 //                                  const int w, const int strength,
    123 //                                  const int bitdepth_max);
    124 // void dav1d_sgr_calc_row_ab2_neon(int32_t *a, int16_t *b,
    125 //                                  const int w, const int strength,
    126 //                                  const int bitdepth_max);
        //
        // sgr_calc_row_ab1_neon is the n = 9 entry point. It saves the registers
        // used by the shared body, sets r5 = 455, r6 = clz(bitdepth_max) and
        // q15 = 9 in every 32-bit lane, then branches to sgr_calc_ab_neon, which
        // does the work, restores the saved registers and returns to the caller.
        // r0-r3 (a, b, w, strength) are passed through untouched.
    127 function sgr_calc_row_ab1_neon, export=1
    128        push            {r4-r7,lr}     // 5 registers = 20 bytes
    129        vpush           {q4-q7}        // + 64 bytes; d8-d15 are used by sgr_calc_ab_neon
    130        ldr             r4,  [sp, #84] // r4 = bitdepth_max (first stack argument, above the 84 saved bytes)
    131        clz             r6,  r4        // r6 = clz(bitdepth_max); turned into -bitdepth_min_8 by the shared body
    132        vmov.i32        q15, #9        // n
    133        movw            r5,  #455      // one_by_x for n = 9 (about 4096 / 9)
    134        b               sgr_calc_ab_neon // shared body; it pops the registers saved above
    135 endfunc
    136 
    137 function sgr_calc_row_ab2_neon, export=1
        // n = 25 entry point. It saves the registers used by the shared body and
        // sets r5 = 164, r6 = clz(bitdepth_max) and q15 = 25 in every 32-bit lane.
        // r0-r3 (a, b, w, strength) are passed through untouched.
    138        push            {r4-r7,lr}     // 5 registers = 20 bytes
    139        vpush           {q4-q7}        // + 64 bytes; d8-d15 are used by sgr_calc_ab_neon
    140        ldr             r4,  [sp, #84] // r4 = bitdepth_max (first stack argument, above the 84 saved bytes)
    141        clz             r6,  r4        // r6 = clz(bitdepth_max); turned into -bitdepth_min_8 by the shared body
    142        vmov.i32        q15, #25       // n
    143        mov             r5,  #164      // one_by_x for n = 25 (about 4096 / 25)
        // No branch or return here: execution continues straight into
        // sgr_calc_ab_neon, so that function must stay immediately after this one.
        // NOTE(review): relies on endfunc/function emitting nothing executable
        // in between - confirm against the macro definitions in asm.S.
    144 endfunc
    145 
    146 function sgr_calc_ab_neon
        // Shared body of sgr_calc_row_ab1_neon and sgr_calc_row_ab2_neon. It is
        // entered with r4-r7, lr and q4-q7 already pushed by the entry point and
        // pops them itself before returning.
        // In:  r0 = a (int32), r1 = b (int16), r2 = w, r3 = strength,
        //      r5 = one_by_x, r6 = clz(bitdepth_max), q15 = n in every 32-bit lane.
        // Both arrays are rewritten in place, 8 elements per iteration, until
        // w + 2 elements are covered (i.e. w + 2 rounded up to a multiple of 8).
        // The :128 qualifiers assert that a and b are 16-byte aligned.
        //
        // How x = sgr_x_by_x[z] is obtained: only the first 48 table bytes are
        // loaded, each biased by -5. vtbl/vtbx look up z < 48; for larger z both
        // lookups miss and leave 0. Adding 5 back and adding the -1 results of
        // the compares against 55, 72, 101, 169 and 254 then yields the table
        // value for z < 48 and 5 minus the number of thresholds exceeded otherwise.
    147        movrel          r12, X(sgr_x_by_x)     // r12 = address of the lookup table
    148        sub             r6,  r6,  #24  // clz(bitdepth_max) - 24 = -bitdepth_min_8 (0 when bitdepth_max is 255)
    149        vld1.8          {q8, q9}, [r12, :128]! // table bytes 0..31
    150        add             r7,  r6,  r6   // -2*bitdepth_min_8
    151        vmov.i8         q11, #5                // bias removed from the table bytes; d22 adds it back in the loop
    152        vmov.i8         d10, #55       // idx of last 5
    153        vld1.8          {q10},    [r12, :128]  // table bytes 32..47
    154        vmov.i8         d11, #72       // idx of last 4
    155        vmov.i8         d12, #101      // idx of last 3
    156        vmov.i8         d13, #169      // idx of last 2
    157        vmov.i8         d14, #254      // idx of last 1
    158        vmov.i8         d15, #32       // elements consumed in first vtbl
    159        add             r2,  r2,  #2   // w += 2
    160        vdup.32         q12, r3                // strength in every 32-bit lane
    161        vsub.i8         q8,  q8,  q11          // bias the loaded table bytes by -5
    162        vsub.i8         q9,  q9,  q11
    163        vsub.i8         q10, q10, q11
    164        vdup.32         q13, r7        // -2*bitdepth_min_8
    165 1:                                            // one iteration = 8 elements
    166        vld1.32         {q0, q1}, [r0, :128] // a (no writeback; r0 advances at the store below)
    167        vld1.16         {q2},     [r1, :128] // b (no writeback; r1 advances at the store below)
    168        vdup.16         q14, r6        // -bitdepth_min_8 (rebuilt each iteration: q14 is reused for one_by_x below)
    169        subs            r2,  r2,  #8           // sets the flags tested by bgt at the end of the loop
    170        vrshl.s32       q0,  q0,  q13          // negative shift count = rounding right shift of a by 2*bitdepth_min_8
    171        vrshl.s32       q1,  q1,  q13
    172        vrshl.s16       q4,  q2,  q14          // q4 = b rounded-shifted right by bitdepth_min_8; q2 keeps the original b
    173        vmul.i32        q0,  q0,  q15  // a * n
    174        vmul.i32        q1,  q1,  q15  // a * n
    175        vmull.u16       q3,  d8,  d8   // b * b
    176        vmull.u16       q4,  d9,  d9   // b * b
    177        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
    178        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
    179        vmul.i32        q0,  q0,  q12  // p * s
    180        vmul.i32        q1,  q1,  q12  // p * s
    181        vqshrn.u32      d0,  q0,  #16          // saturating narrow to 16 bits, >> 16
    182        vqshrn.u32      d1,  q1,  #16
    183        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
    184 
    185        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
    186        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
    187        vtbl.8          d1,  {q8, q9}, d0      // lookup for z in 0..31; 0 for larger z
    188        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
    189        vsub.i8         d9,  d0,  d15  // indices for vtbx
    190        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
    191        vadd.i8         d2,  d2,  d3
    192        vtbx.8          d1,  {q10}, d9         // lookup for z in 32..47; other lanes keep the vtbl result
    193        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
    194        vadd.i8         d6,  d6,  d7
    195        vadd.i8         d8,  d8,  d22          // d22 = low half of q11 = 5: adds the bias back
    196        vadd.i8         d2,  d2,  d6
    197        vadd.i8         d1,  d1,  d8
    198        vadd.i8         d1,  d1,  d2
    199        vmovl.u8        q0,  d1        // x
    200 
    201        vdup.32         q14, r5        // one_by_x
    202 
    203        vmull.u16       q1,  d0,  d4   // x * BB[i] (d4/d5 = b as loaded, before the bitdepth shift)
    204        vmull.u16       q2,  d1,  d5   // x * BB[i]
    205        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
    206        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
    207        vrshr.s32       q1,  q1,  #12  // AA[i]
    208        vrshr.s32       q2,  q2,  #12  // AA[i]
    209 
    210        vst1.32         {q1, q2}, [r0, :128]!  // a[] <- (x * b * one_by_x) rounded >> 12
    211        vst1.16         {q0},     [r1, :128]!  // b[] <- x, widened to 16 bits
    212        bgt             1b                     // loop while the remaining count is > 0
    213 
    214        vpop            {q4-q7}                // undo the vpush done by the entry point
    215        pop             {r4-r7,pc}             // undo the entry point's push and return to its caller
    216 endfunc