cdef16.S (8007B)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"

// pad_top_bot_16: copy two rows of 16-bit pixels, together with a 2-pixel
// border on each side, into the padded buffer at r0.
//
// Macro arguments:
//   s1, s2  registers pointing at the first and the second source row
//   w       row width in pixels (2 bytes per pixel)
//   stride  row stride of the destination buffer, in pixels
//   r1, r2  NEON registers that carry one row each:
//             r1 = d0/q0
//             r2 = d2/q1
//   align   alignment qualifier, in bits, used on the row loads and stores
//   ret     nonzero: every path ends with pop {r4-r8,pc};
//           zero: r0 is advanced past the second row and execution
//           continues right after the macro
//
// Register contract:
//   r0      destination; points at the first pixel of the first row written.
//           The borders go to the 4 bytes before it and the 4 bytes after
//           the w pixels.
//   r7      edge flags; bit 0 (#1) = CDEF_HAVE_LEFT,
//           bit 1 (#2) = CDEF_HAVE_RIGHT
//   s12     fill word stored into a border that has no source pixels.
//           padding_func_16 sets q3, which s12 is part of, to 0x8000 in
//           every 16-bit lane before it uses this macro.
//   s8-s11  scratch for the border words read from the source rows
.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret
        tst             r7,  #1 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Both borders are read from the source: the 4 bytes in front of
        // each row and the 4 bytes behind its w pixels.
        vldr            s8,  [\s1, #-4]
        vld1.16         {\r1}, [\s1, :\align]
        vldr            s9,  [\s1, #2*\w]
        vldr            s10, [\s2, #-4]
        vld1.16         {\r2}, [\s2, :\align]
        vldr            s11, [\s2, #2*\w]
        vstr            s8,  [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]
        // Step to the second destination row.
        add             r0,  r0,  #2*\stride
        vstr            s10, [r0, #-4]
        vst1.16         {\r2}, [r0, :\align]
        vstr            s11, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Left border from the source rows, right border from the fill
        // word in s12.
        vldr            s8,  [\s1, #-4]
        vld1.16         {\r1}, [\s1, :\align]
        vldr            s9,  [\s2, #-4]
        vld1.16         {\r2}, [\s2, :\align]
        vstr            s8,  [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s9,  [r0, #-4]
        vst1.16         {\r2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Left border from the fill word in s12, right border from the
        // source rows.
        vld1.16         {\r1}, [\s1, :\align]
        vldr            s8,  [\s1, #2*\w]
        vld1.16         {\r2}, [\s2, :\align]
        vldr            s9,  [\s2, #2*\w]
        vstr            s12, [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s8,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\r2}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only the w pixels are copied; both borders get the fill word.
        vld1.16         {\r1}, [\s1, :\align]
        vld1.16         {\r2}, [\s2, :\align]
        vstr            s12, [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\r2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
.endif
        // Join point for the ret=0 paths above; the last path falls through.
3:
.endm

// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
//                                     ptrdiff_t src_stride, const pixel (*left)[2],
//                                     const pixel *const top,
//                                     const pixel *const bottom, int h,
//                                     enum CdefEdgeFlags edges);
//
// padding_func_16 emits one such function for a block that is w pixels wide.
// The function writes rows -2 .. h+1 of tmp: two rows above the block, the
// h block rows and two rows below it, each with a 2-pixel border on the left
// and on the right. Parts that the edge flags mark as missing are filled
// with 0x8000 instead of being read.
// NOTE(review): 0x8000 is presumably the value the filter code treats as an
// unavailable pixel; confirm against cdef_tmpl.S.
//
// Macro arguments:
//   w       block width in pixels
//   stride  row stride of tmp, in pixels
//   r1, r2  NEON registers used to move one row of w pixels:
//             r1 = d0/q0
//             r2 = d2/q1
//   align   alignment qualifier, in bits, for the row loads and stores
//
// Register use inside the function:
//   r0   tmp, write cursor
//   r1   src, read cursor
//   r2   src_stride; added directly to byte addresses
//   r3   left, read cursor (4 bytes per row)
//   r4   top
//   r5   bottom
//   r6   h, remaining rows of the middle section
//   r7   edges; #1 = CDEF_HAVE_LEFT, #2 = CDEF_HAVE_RIGHT,
//        #4 = CDEF_HAVE_TOP, #8 = CDEF_HAVE_BOTTOM
//   r8   address of the second top or bottom row
//   r12  write cursor of the fill stores
//   q3   0x8000 in every 16-bit lane; s12 is its lowest word
//   q2   same pattern during the fill stores, scratch (s8-s11) otherwise
//   d2   scratch for the border words of the middle section (s4, s5)
.macro padding_func_16 w, stride, r1, r2, align
function cdef_padding\w\()_16bpc_neon, export=1
        push            {r4-r8,lr}
        // The six pushed registers occupy 24 bytes, so the stack arguments
        // start at sp+24 in prototype order: top, bottom, h, edges.
        ldrd            r4,  r5,  [sp, #24]
        ldrd            r6,  r7,  [sp, #32]
        vmov.i16        q3,  #0x8000
        tst             r7,  #4 // CDEF_HAVE_TOP
        bne             1f
        // !CDEF_HAVE_TOP
        // Fill starting two rows above tmp and 2 pixels left of the row
        // start: 32 bytes, or 64 bytes when w == 8. r0 is left untouched.
        sub             r12, r0,  #2*(2*\stride+2)
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        // The nearest following 3: is the one at the end of the
        // pad_top_bot_16 expansion, which sits directly in front of the
        // middle section.
        b               3f
1:
        // CDEF_HAVE_TOP
        // r8 = second top row. r0 is moved up by two rows; pad_top_bot_16
        // with ret=0 advances it by two rows again, back to tmp.
        add             r8,  r4,  r2
        sub             r0,  r0,  #2*(2*\stride)
        pad_top_bot_16  r4,  r8,  \w, \stride, \r1, \r2, \align, 0

        // Middle section
        // One of four row loops is selected by the LEFT/RIGHT flags. Every
        // loop tests its counter at the bottom, so at least one row is
        // processed. The flags set by subs are still valid at bgt because
        // the stores and the plain add in between do not change them.
3:
        tst             r7,  #1 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        // Load the 2 left pixels of this row into all lanes of d2 and step
        // r3 by 4 bytes; then replace s5 with the 2 pixels that follow the
        // row in src.
        vld1.32         {d2[]}, [r3, :32]!
        vldr            s5,  [r1, #2*\w]
        // Load the row and advance src by src_stride.
        vld1.16         {\r1}, [r1, :\align], r2
        subs            r6,  r6,  #1
        vstr            s4,  [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s5,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Left border from the left array, right border from s12.
        vld1.32         {d2[]}, [r3, :32]!
        vld1.16         {\r1}, [r1, :\align], r2
        subs            r6,  r6,  #1
        vstr            s4,  [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b
        b               3f
2:
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        // Left border from s12, right border from the 2 pixels that follow
        // the row in src. The left array is not read.
        vldr            s4,  [r1, #2*\w]
        vld1.16         {\r1}, [r1, :\align], r2
        subs            r6,  r6,  #1
        vstr            s12, [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s4,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only the row itself is copied; both borders get the fill word.
        vld1.16         {\r1}, [r1, :\align], r2
        subs            r6,  r6,  #1
        vstr            s12, [r0, #-4]
        vst1.16         {\r1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b

        // Bottom section; r0 now points at the row after the last one
        // written by the middle section.
3:
        tst             r7,  #8 // CDEF_HAVE_BOTTOM
        bne             1f
        // !CDEF_HAVE_BOTTOM
        // Fill starting 2 pixels left of r0: 32 bytes, or 64 bytes when
        // w == 8. q2 is set again because the top padding may have used
        // s8-s11 as scratch.
        sub             r12, r0,  #4
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        pop             {r4-r8,pc}
1:
        // CDEF_HAVE_BOTTOM
        // r8 = second bottom row. With ret=1 the macro returns from the
        // function on every path, so nothing follows it.
        add             r8,  r5,  r2
        pad_top_bot_16  r5,  r8,  \w, \stride, \r1, \r2, \align, 1
endfunc
.endm

// Instantiate the padding function for 8 pixel wide blocks (tmp stride 16,
// q registers, 128-bit alignment) and for 4 pixel wide blocks (tmp stride 8,
// d registers, 64-bit alignment).
padding_func_16 8, 16, q0, q1, 128
padding_func_16 4, 8,  d0, d2, 64

// tables, filter and find_dir are not defined in this file.
// NOTE(review): presumably they are macros from cdef_tmpl.S; confirm there
// what each invocation emits.
tables

filter 8, 16
filter 4, 16

find_dir 16