looprestoration_common.S (8383B)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
//                                int32_t *sumsq_out, int16_t *sum_out,
//                                const int w);
//
// Vertical pass of a 3-row box sum for self-guided restoration:
// adds three rows of 32-bit squared sums and three rows of 16-bit sums,
// column-wise, writing one output row of each.
//
// In:    r0 = int32_t **sumsq  (array of 3 row pointers, read via [r0], [r0+8])
//        r1 = int16_t **sum    (array of 3 row pointers)
//        r2 = int32_t *sumsq_out
//        r3 = int16_t *sum_out
//        [sp, #28] = w  (after the 7-register push below, 7*4 = 28 bytes)
// Clobb: r0-r9, q8-q15, flags. r4-r9 saved/restored via push/pop.
function sgr_box3_row_v_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]          // r4 = w
        ldrd            r6,  r7,  [r0]          // r6/r7 = sumsq rows 0/1
        ldr             r0,  [r0, #8]           // r0   = sumsq row 2
        add             r4,  r4,  #2            // process w+2 columns (edge padding)
        ldrd            r8,  r9,  [r1]          // r8/r9 = sum rows 0/1
        ldr             r1,  [r1, #8]           // r1   = sum row 2

1:      // Main loop: 8 columns per iteration (2x 4x32-bit, 1x 8x16-bit).
        vld1.32         {q8,  q9},  [r6]!       // sumsq row 0
        vld1.32         {q10, q11}, [r7]!       // sumsq row 1
        vld1.16         {q14},      [r8]!       // sum row 0
        vld1.16         {q15},      [r9]!       // sum row 1
        subs            r4,  r4,  #8            // columns remaining

        vadd.i32        q8,  q8,  q10           // row0 + row1 (sumsq)
        vadd.i32        q9,  q9,  q11

        vld1.32         {q12, q13}, [r0]!       // sumsq row 2 (load interleaved for scheduling)

        vadd.i16        q14, q14, q15           // row0 + row1 (sum)

        vld1.16         {q15},      [r1]!       // sum row 2
        vadd.i32        q8,  q8,  q12           // + row2 (sumsq)
        vadd.i32        q9,  q9,  q13
        vadd.i16        q14, q14, q15           // + row2 (sum)

        vst1.32         {q8,  q9},  [r2]!       // store sumsq_out
        vst1.16         {q14},      [r3]!       // store sum_out

        bgt             1b
        pop             {r4-r9,pc}
endfunc

// void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
//                                int32_t *sumsq_out, int16_t *sum_out,
//                                const int w);
//
// Same as the box3 variant above, but summing five rows instead of three.
//
// In:    r0 = int32_t **sumsq  (array of 5 row pointers)
//        r1 = int16_t **sum    (array of 5 row pointers)
//        r2 = int32_t *sumsq_out
//        r3 = int16_t *sum_out
//        [sp, #36] = w  (after the 9-register push below, 9*4 = 36 bytes)
// Clobb: r0-r11, lr, q0-q3, q8-q15, flags. r4-r11 saved/restored.
function sgr_box5_row_v_neon, export=1
        push            {r4-r11,lr}
        ldr             lr,  [sp, #36]          // lr = w (loop counter)

        ldrd            r4,  r5,  [r0]          // r4/r5  = sumsq rows 0/1
        ldrd            r6,  r7,  [r0, #8]      // r6/r7  = sumsq rows 2/3
        ldr             r0,  [r0, #16]          // r0     = sumsq row 4
        add             lr,  lr,  #2            // process w+2 columns (edge padding)
        ldrd            r8,  r9,  [r1]          // r8/r9  = sum rows 0/1
        ldrd            r10, r11, [r1, #8]      // r10/r11 = sum rows 2/3
        ldr             r1,  [r1, #16]          // r1     = sum row 4

1:      // Main loop: 8 columns per iteration; pairwise tree of additions.
        vld1.32         {q8,  q9},  [r4]!       // sumsq row 0
        vld1.32         {q10, q11}, [r5]!       // sumsq row 1
        vld1.32         {q12, q13}, [r6]!       // sumsq row 2
        vld1.32         {q14, q15}, [r7]!       // sumsq row 3
        vld1.16         {q0},       [r8]!       // sum row 0
        vld1.16         {q1},       [r9]!       // sum row 1
        vld1.16         {q2},       [r10]!      // sum row 2
        vld1.16         {q3},       [r11]!      // sum row 3
        subs            lr,  lr,  #8            // columns remaining

        vadd.i32        q8,  q8,  q10           // row0 + row1 (sumsq)
        vadd.i32        q9,  q9,  q11
        vadd.i32        q12, q12, q14           // row2 + row3 (sumsq)
        vadd.i32        q13, q13, q15

        vld1.32         {q14, q15}, [r0]!       // sumsq row 4 (interleaved load)

        vadd.i16        q0,  q0,  q1            // row0 + row1 (sum)
        vadd.i16        q2,  q2,  q3            // row2 + row3 (sum)

        vld1.16         {q3},       [r1]!       // sum row 4
        vadd.i32        q8,  q8,  q12           // (0+1) + (2+3) (sumsq)
        vadd.i32        q9,  q9,  q13
        vadd.i16        q0,  q0,  q2            // (0+1) + (2+3) (sum)

        vadd.i32        q8,  q8,  q14           // + row4 (sumsq)
        vadd.i32        q9,  q9,  q15
        vadd.i16        q0,  q0,  q3            // + row4 (sum)

        vst1.32         {q8,  q9},  [r2]!       // store sumsq_out
        vst1.16         {q0},       [r3]!       // store sum_out

        bgt             1b
        pop             {r4-r11,pc}
endfunc

// void dav1d_sgr_calc_row_ab1_neon(int32_t *a, int16_t *b,
//                                  const int w, const int strength,
//                                  const int bitdepth_max);
// void dav1d_sgr_calc_row_ab2_neon(int32_t *a, int16_t *b,
//                                  const int w, const int strength,
//                                  const int bitdepth_max);
//
// Entry stubs for the shared sgr_calc_ab_neon body below. Each sets up the
// variant-specific constants:
//   q15 = n   (box area: 9 for the 3x3 box, 25 for the 5x5 box)
//   r5  = sgr_one_by_x reciprocal constant (455 for n=9, 164 for n=25)
//   r6  = clz(bitdepth_max); the shared body subtracts 24 so that e.g.
//         clz(255)=24 -> 0 and clz(1023)=22 -> -2, i.e. -(bitdepth - 8).
// bitdepth_max is at [sp, #84]: 5*4 bytes of pushed GPRs + 4*16 bytes of
// pushed q registers = 84.
function sgr_calc_row_ab1_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        ldr             r4,  [sp, #84]          // r4 = bitdepth_max
        clz             r6,  r4
        vmov.i32        q15, #9                 // n
        movw            r5,  #455
        b               sgr_calc_ab_neon
endfunc

function sgr_calc_row_ab2_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        ldr             r4,  [sp, #84]          // r4 = bitdepth_max
        clz             r6,  r4
        vmov.i32        q15, #25                // n
        mov             r5,  #164
        // NOTE: no branch here; execution intentionally falls through into
        // sgr_calc_ab_neon immediately below.
endfunc

// Shared body for the two entry points above. Computes, per column:
//   p = imax(a*n - b*b, 0); z = (p*s) >> shifts; x = sgr_x_by_x[z];
//   AA = (x * BB * one_by_x) >> 12
// where the sgr_x_by_x lookup is done with vtbl/vtbx (48-entry byte table)
// plus comparison-based corrections for the table's long constant tails
// (runs of equal values 5,4,3,2,1 past indices 55/72/101/169/254).
//
// In:    r0 = int32_t *a (read/modified in place)
//        r1 = int16_t *b (read/modified in place)
//        r2 = w, r3 = strength
//        r5 = one_by_x, r6 = clz(bitdepth_max), q15 = n (set by callers)
// Stack: expects the callers' push {r4-r7,lr} / vpush {q4-q7} frame;
//        pops both on return.
function sgr_calc_ab_neon
        movrel          r12, X(sgr_x_by_x)
        sub             r6,  r6,  #24           // -bitdepth_min_8
        vld1.8          {q8, q9}, [r12, :128]!  // sgr_x_by_x[0..31]
        add             r7,  r6,  r6            // -2*bitdepth_min_8
        vmov.i8         q11, #5                 // bias subtracted from table (re-added via d22 below)
        vmov.i8         d10, #55                // idx of last 5
        vld1.8          {q10},    [r12, :128]   // sgr_x_by_x[32..47]
        vmov.i8         d11, #72                // idx of last 4
        vmov.i8         d12, #101               // idx of last 3
        vmov.i8         d13, #169               // idx of last 2
        vmov.i8         d14, #254               // idx of last 1
        vmov.i8         d15, #32                // elements consumed in first vtbl
        add             r2,  r2,  #2            // w += 2
        vdup.32         q12, r3                 // strength s
        vsub.i8         q8,  q8,  q11           // pre-subtract 5 from table entries so the
        vsub.i8         q9,  q9,  q11           // vcgt corrections below can add it back
        vsub.i8         q10, q10, q11
        vdup.32         q13, r7                 // -2*bitdepth_min_8
1:      // 8 columns per iteration.
        vld1.32         {q0, q1}, [r0, :128]    // a
        vld1.16         {q2},     [r1, :128]    // b
        vdup.16         q14, r6                 // -bitdepth_min_8
        subs            r2,  r2,  #8
        vrshl.s32       q0,  q0,  q13           // a >>= 2*bitdepth_min_8 (rounding)
        vrshl.s32       q1,  q1,  q13
        vrshl.s16       q4,  q2,  q14           // b >>= bitdepth_min_8 (rounding)
        vmul.i32        q0,  q0,  q15           // a * n
        vmul.i32        q1,  q1,  q15           // a * n
        vmull.u16       q3,  d8,  d8            // b * b
        vmull.u16       q4,  d9,  d9            // b * b
        vqsub.u32       q0,  q0,  q3            // imax(a * n - b * b, 0)
        vqsub.u32       q1,  q1,  q4            // imax(a * n - b * b, 0)
        vmul.i32        q0,  q0,  q12           // p * s
        vmul.i32        q1,  q1,  q12           // p * s
        vqshrn.u32      d0,  q0,  #16
        vqshrn.u32      d1,  q1,  #16
        vqrshrn.u16     d0,  q0,  #4            // imin(z, 255)

        // Table lookup x = sgr_x_by_x[z]: vtbl covers indices 0..31,
        // vtbx patches indices 32..47; the vcgt/vadd chain reconstructs
        // the constant tail (values 5..1 for large z) removed above.
        vcgt.u8         d2,  d0,  d10           // = -1 if sgr_x_by_x[d0] < 5
        vcgt.u8         d3,  d0,  d11           // = -1 if sgr_x_by_x[d0] < 4
        vtbl.8          d1,  {q8, q9}, d0
        vcgt.u8         d6,  d0,  d12           // = -1 if sgr_x_by_x[d0] < 3
        vsub.i8         d9,  d0,  d15           // indices for vtbx
        vcgt.u8         d7,  d0,  d13           // = -1 if sgr_x_by_x[d0] < 2
        vadd.i8         d2,  d2,  d3
        vtbx.8          d1,  {q10}, d9
        vcgt.u8         d8,  d0,  d14           // = -1 if sgr_x_by_x[d0] < 1
        vadd.i8         d6,  d6,  d7
        vadd.i8         d8,  d8,  d22           // +5 (undo the table bias)
        vadd.i8         d2,  d2,  d6
        vadd.i8         d1,  d1,  d8
        vadd.i8         d1,  d1,  d2
        vmovl.u8        q0,  d1                 // x

        vdup.32         q14, r5                 // one_by_x

        vmull.u16       q1,  d0,  d4            // x * BB[i]
        vmull.u16       q2,  d1,  d5            // x * BB[i]
        vmul.i32        q1,  q1,  q14           // x * BB[i] * sgr_one_by_x
        vmul.i32        q2,  q2,  q14           // x * BB[i] * sgr_one_by_x
        vrshr.s32       q1,  q1,  #12           // AA[i]
        vrshr.s32       q2,  q2,  #12           // AA[i]

        vst1.32         {q1, q2}, [r0, :128]!   // write AA back over a
        vst1.16         {q0},     [r1, :128]!   // write x back over b
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r7,pc}
endfunc