/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Series of LUTs for efficiently computing sgr's 1 - x/(x+1) table.
// In the comments, let RefTable denote the original, reference table.
//
// Memory layout (referenced by offset below):
//   +0:  RangeMins  (32 bytes)
//   +32: DiffMasks  (32 bytes)
//   +64: RefLo      (16 bytes)
const x_by_x_tables
        // RangeMins
        //
        // Min(RefTable[i*8:i*8+8])
        // First two values are zeroed.
        //
        // Lookup using RangeMins[(x >> 3)]
        .byte 0, 0, 11, 8, 6, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 2
        .byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0

        // DiffMasks
        //
        // This contains a bit pattern, indicating at which index positions the value of RefTable changes. For each range
        // in the RangeMins table (covering 8 RefTable entries), we have one byte; each bit indicates whether the value of
        // RefTable changes at that particular index.
        // Using popcount, we can integrate the diff bit field. By shifting away bits in a byte, we can refine the range of
        // the integral. Finally, adding the integral to RangeMins[(x>>3)] reconstructs RefTable (for x > 15).
        //
        // Lookup using DiffMasks[(x >> 3)]
        .byte 0x00, 0x00, 0xD4, 0x44
        .byte 0x42, 0x04, 0x00, 0x00
        .byte 0x00, 0x80, 0x00, 0x00
        .byte 0x04, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x40, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x02
        // Binary form:
        // 0b00000000, 0b00000000, 0b11010100, 0b01000100
        // 0b01000010, 0b00000100, 0b00000000, 0b00000000
        // 0b00000000, 0b10000000, 0b00000000, 0b00000000
        // 0b00000100, 0b00000000, 0b00000000, 0b00000000
        // 0b00000000, 0b00000000, 0b00000000, 0b00000000
        // 0b00000000, 0b01000000, 0b00000000, 0b00000000
        // 0b00000000, 0b00000000, 0b00000000, 0b00000000
        // 0b00000000, 0b00000000, 0b00000000, 0b00000010

        // RefLo
        //
        // RefTable[0:16]
        // i.e. First 16 elements of the original table.
        // Add to the sum obtained in the rest of the other lut logic to include the first 16 bytes of RefTable.
        //
        // Lookup using RefLo[x] (tbl will replace x > 15 with 0)
        .byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16

        // Pseudo assembly
        //
        // hi_bits = x >> 3
        // tbl ref, {RefLo}, x
        // tbl diffs, {DiffMasks[0:16], DiffMasks[16:32]}, hi_bits
        // tbl min, {RangeMins[0:16], RangeMins[16:32]}, hi_bits
        // lo_bits = x & 0x7
        // diffs = diffs << lo_bits
        // ref = ref + min
        // integral = popcnt(diffs)
        // ref = ref + integral
        // return ref
endconst

// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
//
// Sums three rows of sumsq/sum, then computes the SGR "x" coefficient via
// the x_by_x_tables LUT above, storing x to BB and the derived
// srshr(x * BB[i] * 455, 12) value to AA. Processes 16 pixels per loop
// iteration (subs w4, #16), over w+2 elements in total.
//
// Register roles in the loop:
//   v24-v25 = RangeMins, v26-v27 = DiffMasks, v23 = RefLo, v22 = 0x7 mask
//   v28 = strength, v30 = one_by_x (455), v31 = n (9)
//   v6 = -bitdepth_min_8 (for b), v7 = -2*bitdepth_min_8 (for a)
// Callee-saved d8-d15 are spilled/restored around the kernel (AAPCS64).
function sgr_box3_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2                    // process w+2 elements
        clz             w9,  w6                         // bitdepth_max
        dup             v28.4s, w5                      // strength

        ldp             x5,  x6,  [x0]                  // sumsq[0], sumsq[1]
        ldr             x0,  [x0, #16]                  // sumsq[2]
        ldp             x7,  x8,  [x1]                  // sum[0], sum[1]
        ldr             x1,  [x1, #16]                  // sum[2]

        movi            v31.4s, #9                      // n

        sub             w9,  w9,  #24                   // -bitdepth_min_8
        movrel          x12, x_by_x_tables
        mov             w13, #455                       // one_by_x
        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x12] // RangeMins, DiffMasks
        movi            v22.16b, #0x7
        ldr             q23, [x12, #64]                 // RefLo
        dup             v6.8h,  w9                      // -bitdepth_min_8
        saddl           v7.4s,  v6.4h, v6.4h            // -2*bitdepth_min_8
        dup             v30.4s, w13                     // one_by_x

        // Prime the software pipeline: first batch of all three rows.
        ld1             {v8.4s, v9.4s, v10.4s, v11.4s}, [x5], #64
        ld1             {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
        ld1             {v20.8h, v21.8h}, [x8], #32
        ld1             {v0.8h, v1.8h}, [x7], #32
1:
        ld1             {v2.8h, v3.8h}, [x1], #32
        // a = sumsq[0] + sumsq[1] + sumsq[2]
        add             v8.4s,  v8.4s,  v12.4s
        add             v9.4s,  v9.4s,  v13.4s
        add             v10.4s, v10.4s, v14.4s
        add             v11.4s, v11.4s, v15.4s
        // b = sum[0] + sum[1] + sum[2]
        add             v0.8h,  v0.8h,  v20.8h
        add             v1.8h,  v1.8h,  v21.8h

        add             v16.4s, v16.4s, v8.4s
        add             v17.4s, v17.4s, v9.4s
        add             v18.4s, v18.4s, v10.4s
        add             v19.4s, v19.4s, v11.4s
        add             v4.8h,  v2.8h,  v0.8h
        add             v5.8h,  v3.8h,  v1.8h

        // Scale down to 8-bit range before forming a*n - b*b.
        srshl           v16.4s, v16.4s, v7.4s
        srshl           v17.4s, v17.4s, v7.4s
        srshl           v18.4s, v18.4s, v7.4s
        srshl           v19.4s, v19.4s, v7.4s
        srshl           v9.8h,  v4.8h,  v6.8h
        srshl           v13.8h, v5.8h,  v6.8h
        mul             v16.4s, v16.4s, v31.4s          // a * n
        mul             v17.4s, v17.4s, v31.4s          // a * n
        mul             v18.4s, v18.4s, v31.4s          // a * n
        mul             v19.4s, v19.4s, v31.4s          // a * n
        umull           v8.4s,  v9.4h,  v9.4h           // b * b
        umull2          v9.4s,  v9.8h,  v9.8h           // b * b
        umull           v12.4s, v13.4h, v13.4h          // b * b
        umull2          v13.4s, v13.8h, v13.8h          // b * b
        uqsub           v16.4s, v16.4s, v8.4s           // imax(a * n - b * b, 0)
        uqsub           v17.4s, v17.4s, v9.4s           // imax(a * n - b * b, 0)
        uqsub           v18.4s, v18.4s, v12.4s          // imax(a * n - b * b, 0)
        uqsub           v19.4s, v19.4s, v13.4s          // imax(a * n - b * b, 0)
        mul             v16.4s, v16.4s, v28.4s          // p * s
        mul             v17.4s, v17.4s, v28.4s          // p * s
        mul             v18.4s, v18.4s, v28.4s          // p * s
        mul             v19.4s, v19.4s, v28.4s          // p * s
        uqshrn          v16.4h, v16.4s, #16
        uqshrn2         v16.8h, v17.4s, #16
        uqshrn          v18.4h, v18.4s, #16
        uqshrn2         v18.8h, v19.4s, #16
        uqrshrn         v1.8b,  v16.8h, #4              // imin(z, 255)
        uqrshrn2        v1.16b, v18.8h, #4              // imin(z, 255)

        // Preload next iteration's rows while the LUT lookup proceeds.
        ld1             {v16.4s, v17.4s}, [x0], #32
        subs            w4,  w4,  #16

        // LUT: x = RefTable[z], reconstructed from the compressed tables
        // (see pseudo assembly above the endconst).
        ushr            v0.16b, v1.16b, #3              // hi_bits = z >> 3
        ld1             {v8.4s, v9.4s}, [x5], #32
        // NOTE: v24..v25 hold RangeMins and v26..v27 hold DiffMasks (memory
        // order of the const block); earlier revisions had these two comments
        // swapped.
        tbl             v2.16b, {v26.16b, v27.16b}, v0.16b // DiffMasks[z >> 3]
        tbl             v0.16b, {v24.16b, v25.16b}, v0.16b // RangeMins[z >> 3]
        tbl             v3.16b, {v23.16b}, v1.16b       // RefLo[z] (0 for z > 15)
        and             v1.16b, v1.16b, v22.16b         // lo_bits = z & 0x7
        ld1             {v12.4s, v13.4s}, [x6], #32
        ushl            v1.16b, v2.16b, v1.16b          // diffs << lo_bits
        ld1             {v20.8h, v21.8h}, [x8], #32
        add             v3.16b, v3.16b, v0.16b          // ref + min
        cnt             v1.16b, v1.16b                  // integral = popcnt(diffs)
        ld1             {v18.4s, v19.4s}, [x0], #32
        add             v3.16b, v3.16b, v1.16b          // ref + min + integral
        ld1             {v10.4s, v11.4s}, [x5], #32
        uxtl            v0.8h,  v3.8b                   // x
        uxtl2           v1.8h,  v3.16b                  // x

        ld1             {v14.4s, v15.4s}, [x6], #32

        umull           v2.4s,  v0.4h,  v4.4h           // x * BB[i]
        umull2          v3.4s,  v0.8h,  v4.8h           // x * BB[i]
        umull           v4.4s,  v1.4h,  v5.4h           // x * BB[i]
        umull2          v5.4s,  v1.8h,  v5.8h           // x * BB[i]
        mul             v2.4s,  v2.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v3.4s,  v3.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v4.4s,  v4.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v5.4s,  v5.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        st1             {v0.8h, v1.8h}, [x3], #32       // store x to BB
        ld1             {v0.8h, v1.8h}, [x7], #32
        srshr           v2.4s,  v2.4s,  #12             // AA[i]
        srshr           v3.4s,  v3.4s,  #12             // AA[i]
        srshr           v4.4s,  v4.4s,  #12             // AA[i]
        srshr           v5.4s,  v5.4s,  #12             // AA[i]

        st1             {v2.4s, v3.4s, v4.4s, v5.4s}, [x2], #64
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
//
// Same structure as sgr_box3_vert_neon, but sums five rows (n = 25,
// one_by_x = 164) and processes 8 pixels per loop iteration.
function sgr_box5_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]

        add             w4,  w4,  #2                    // process w+2 elements
        clz             w15, w6                         // bitdepth_max
        dup             v28.4s, w5                      // strength

        ldp             x5,  x6,  [x0]                  // sumsq[0], sumsq[1]
        ldp             x7,  x8,  [x0, #16]             // sumsq[2], sumsq[3]
        ldr             x0,  [x0, #32]                  // sumsq[4]
        ldp             x9,  x10, [x1]                  // sum[0], sum[1]
        ldp             x11, x12, [x1, #16]             // sum[2], sum[3]
        ldr             x1,  [x1, #32]                  // sum[4]

        movi            v31.4s, #25                     // n

        sub             w15, w15, #24                   // -bitdepth_min_8
        movrel          x13, x_by_x_tables
        movi            v30.4s, #164                    // one_by_x
        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x13] // RangeMins, DiffMasks
        dup             v6.8h,  w15                     // -bitdepth_min_8
        movi            v19.8b, #0x7
        ldr             q18, [x13, #64]                 // RefLo
        saddl           v7.4s,  v6.4h, v6.4h            // -2*bitdepth_min_8

        // Prime the software pipeline: first batch of all five rows.
        ld1             {v8.4s, v9.4s},   [x5],  #32
        ld1             {v10.4s, v11.4s}, [x6],  #32
        ld1             {v12.4s, v13.4s}, [x7],  #32
        ld1             {v16.4s, v17.4s}, [x8],  #32
        ld1             {v20.8h}, [x9],  #16
        ld1             {v21.8h}, [x10], #16
        ld1             {v22.8h}, [x11], #16
        ld1             {v23.8h}, [x12], #16
        ld1             {v0.4s, v1.4s},   [x0],  #32
        ld1             {v2.8h}, [x1],  #16

1:
        // a = sumsq[0] + sumsq[1] + sumsq[2] + sumsq[3] + sumsq[4]
        add             v8.4s,  v8.4s,  v10.4s
        add             v9.4s,  v9.4s,  v11.4s
        add             v12.4s, v12.4s, v16.4s
        add             v13.4s, v13.4s, v17.4s

        // b = sum[0] + sum[1] + sum[2] + sum[3] + sum[4]
        add             v20.8h, v20.8h, v21.8h
        add             v22.8h, v22.8h, v23.8h

        add             v0.4s,  v0.4s,  v8.4s
        add             v1.4s,  v1.4s,  v9.4s
        add             v2.8h,  v2.8h,  v20.8h

        add             v0.4s,  v0.4s,  v12.4s
        add             v1.4s,  v1.4s,  v13.4s
        add             v2.8h,  v2.8h,  v22.8h

        subs            w4,  w4,  #8

        // Scale down to 8-bit range before forming a*n - b*b.
        srshl           v0.4s,  v0.4s,  v7.4s
        srshl           v1.4s,  v1.4s,  v7.4s
        srshl           v4.8h,  v2.8h,  v6.8h
        mul             v0.4s,  v0.4s,  v31.4s          // a * n
        mul             v1.4s,  v1.4s,  v31.4s          // a * n
        umull           v3.4s,  v4.4h,  v4.4h           // b * b
        umull2          v4.4s,  v4.8h,  v4.8h           // b * b
        uqsub           v0.4s,  v0.4s,  v3.4s           // imax(a * n - b * b, 0)
        uqsub           v1.4s,  v1.4s,  v4.4s           // imax(a * n - b * b, 0)
        mul             v0.4s,  v0.4s,  v28.4s          // p * s
        mul             v1.4s,  v1.4s,  v28.4s          // p * s
        ld1             {v8.4s, v9.4s}, [x5], #32
        uqshrn          v0.4h,  v0.4s,  #16
        uqshrn2         v0.8h,  v1.4s,  #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,  v0.8h,  #4              // imin(z, 255)

        ld1             {v12.4s, v13.4s}, [x7], #32

        // LUT: x = RefTable[z] (see pseudo assembly above the endconst).
        ushr            v1.8b,  v0.8b,  #3              // hi_bits = z >> 3
        ld1             {v16.4s, v17.4s}, [x8], #32
        // NOTE: v24..v25 hold RangeMins and v26..v27 hold DiffMasks (memory
        // order of the const block); earlier revisions had these two comments
        // swapped.
        tbl             v5.8b,  {v26.16b, v27.16b}, v1.8b // DiffMasks[z >> 3]
        tbl             v1.8b,  {v24.16b, v25.16b}, v1.8b // RangeMins[z >> 3]
        tbl             v4.8b,  {v18.16b}, v0.8b        // RefLo[z] (0 for z > 15)
        and             v0.8b,  v0.8b,  v19.8b          // lo_bits = z & 0x7
        ld1             {v20.8h}, [x9],  #16
        ushl            v5.8b,  v5.8b,  v0.8b           // diffs << lo_bits
        add             v4.8b,  v4.8b,  v1.8b           // ref + min
        ld1             {v21.8h}, [x10], #16
        cnt             v5.8b,  v5.8b                   // integral = popcnt(diffs)
        ld1             {v22.8h}, [x11], #16
        add             v5.8b,  v4.8b,  v5.8b           // ref + min + integral
        ld1             {v23.8h}, [x12], #16
        uxtl            v5.8h,  v5.8b                   // x

        ld1             {v0.4s, v1.4s}, [x0], #32
        umull           v3.4s,  v5.4h,  v2.4h           // x * BB[i]
        umull2          v4.4s,  v5.8h,  v2.8h           // x * BB[i]
        mul             v3.4s,  v3.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        mul             v4.4s,  v4.4s,  v30.4s          // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,  v3.4s,  #12             // AA[i]
        srshr           v4.4s,  v4.4s,  #12             // AA[i]
        ld1             {v2.8h}, [x1],  #16

        st1             {v3.4s, v4.4s}, [x2], #32
        st1             {v5.8h}, [x3], #16              // store x to BB
        b.gt            1b

        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc