fdctdsp_neon.S (17761B)
/*
 * Armv8 Neon optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 * All Rights Reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 * claim that you wrote the original software. If you use this software
 * in a product, an acknowledgment in the product documentation would be
 * appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 * misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

/* Left disabled: PASS1_BITS below is 1 unless this is defined (then 2). */
// #define EIGHT_BIT_SAMPLES

/*
 * Fixed-point multipliers for the slow-but-accurate integer forward DCT.
 * Each value is the named real constant scaled by 2^13 (CONST_BITS below)
 * and rounded, so every entry fits in a signed 16-bit lane.
 */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/*
 * Sixteen halfwords, fetched with a single two-register ld1 into v0/v1.
 * The lane order must stay in sync with the XFIX_* defines further down.
 * Constants that the C reference negates are stored already negated.
 */
const jsimd_fdct_islow_neon_consts, align=4
        .short           F_0_298, -F_0_390,  F_0_541,  F_0_765  /* v0.h[0..3] */
        .short          -F_0_899,  F_1_175,  F_1_501, -F_1_847  /* v0.h[4..7] */
        .short          -F_1_961,  F_2_053, -F_2_562,  F_3_072  /* v1.h[0..3] */
        .short           0, 0, 0, 0                             /* v1.h[4..7], padding */
endconst

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/*****************************************************************************/

#define CONST_BITS 13
#ifdef EIGHT_BIT_SAMPLES
#define PASS1_BITS 2
#else
#define PASS1_BITS 1   /* lose a little precision to avoid overflow */
#endif

/* Rounding right shifts applied to the 32-bit products in pass 1 / pass 2. */
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)

/* Lane of v0/v1 holding each multiplier (P = stored positive, N = negated). */
#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

/*
 * fdct_islow_pass dc_op, descale
 *
 * One 1-D pass of the jfdctint.c butterfly, applied lane-wise to eight
 * vectors of eight 16-bit values (so eight 1-D transforms run in parallel).
 *
 *   dc_op    instruction applied with #PASS1_BITS to outputs 0 and 4
 *            (shl for pass 1, srshr for pass 2)
 *   descale  rounding right shift used when narrowing the 32-bit products
 *
 *   In:      v16..v23 = dataptr[0]..dataptr[7]
 *            v0, v1   = multiplier table (XFIX_* lanes)
 *   Out:     v16..v23 = transformed dataptr[0]..dataptr[7]
 *   Scratch: v2-v7, v24-v31
 *
 * Products are kept as 32-bit values split into a low half (lanes 0-3, from
 * smull/smlal) and a high half (lanes 4-7, from smull2/smlal2); rshrn/rshrn2
 * narrow the two halves back into one 8 x 16-bit register.
 */
.macro fdct_islow_pass dc_op, descale
        /* Input butterflies */
        add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7] */
        sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7] */
        add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6] */
        sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6] */
        add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5] */
        sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5] */
        add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4] */
        sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4] */

        /* Even part: outputs 0, 4, 2, 6 */
        add             v4.8h,  v24.8h, v27.8h  /* tmp10 = tmp0 + tmp3 */
        sub             v5.8h,  v24.8h, v27.8h  /* tmp13 = tmp0 - tmp3 */
        add             v6.8h,  v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2 */
        sub             v7.8h,  v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2 */

        add             v16.8h, v4.8h,  v6.8h   /* tmp10 + tmp11 */
        sub             v20.8h, v4.8h,  v6.8h   /* tmp10 - tmp11 */

        add             v18.8h, v7.8h,  v5.8h   /* tmp12 + tmp13 */

        \dc_op          v16.8h, v16.8h, #PASS1_BITS     /* dataptr[0] */
        \dc_op          v20.8h, v20.8h, #PASS1_BITS     /* dataptr[4] */

        /* z1 = (tmp12 + tmp13) * 0.541; keep one copy per output */
        smull2          v24.4s, v18.8h, XFIX_P_0_541    /* z1, high half */
        smull           v18.4s, v18.4h, XFIX_P_0_541    /* z1, low half */
        mov             v22.16b, v18.16b
        mov             v25.16b, v24.16b

        smlal           v18.4s, v5.4h,  XFIX_P_0_765    /* z1 + tmp13 * 0.765, low */
        smlal2          v24.4s, v5.8h,  XFIX_P_0_765    /* z1 + tmp13 * 0.765, high */
        smlal           v22.4s, v7.4h,  XFIX_N_1_847    /* z1 - tmp12 * 1.847, low */
        smlal2          v25.4s, v7.8h,  XFIX_N_1_847    /* z1 - tmp12 * 1.847, high */

        rshrn           v18.4h, v18.4s, #\descale
        rshrn           v22.4h, v22.4s, #\descale
        rshrn2          v18.8h, v24.4s, #\descale       /* dataptr[2] */
        rshrn2          v22.8h, v25.4s, #\descale       /* dataptr[6] */

        /* Odd part: outputs 7, 5, 3, 1 */
        add             v2.8h,  v28.8h, v31.8h  /* z1 = tmp4 + tmp7 */
        add             v3.8h,  v29.8h, v30.8h  /* z2 = tmp5 + tmp6 */
        add             v6.8h,  v28.8h, v30.8h  /* z3 = tmp4 + tmp6 */
        add             v7.8h,  v29.8h, v31.8h  /* z4 = tmp5 + tmp7 */

        /* z5 = (z3 + z4) * 1.175, built as z3 * c + z4 * c */
        smull           v4.4s,  v6.4h,  XFIX_P_1_175    /* z5, low half */
        smull2          v5.4s,  v6.8h,  XFIX_P_1_175    /* z5, high half */
        smlal           v4.4s,  v7.4h,  XFIX_P_1_175
        smlal2          v5.4s,  v7.8h,  XFIX_P_1_175

        /* High halves first: v28-v31 are still needed as 16-bit inputs */
        smull2          v24.4s, v28.8h, XFIX_P_0_298
        smull2          v25.4s, v29.8h, XFIX_P_2_053
        smull2          v26.4s, v30.8h, XFIX_P_3_072
        smull2          v27.4s, v31.8h, XFIX_P_1_501
        smull           v23.4s, v28.4h, XFIX_P_0_298    /* tmp4 *= 0.298631336 */
        smull           v21.4s, v29.4h, XFIX_P_2_053    /* tmp5 *= 2.053119869 */
        smull           v19.4s, v30.4h, XFIX_P_3_072    /* tmp6 *= 3.072711026 */
        smull           v17.4s, v31.4h, XFIX_P_1_501    /* tmp7 *= 1.501321110 */

        /* From here v28-v31 hold the high halves of z1..z4 */
        smull2          v28.4s, v2.8h,  XFIX_N_0_899
        smull2          v29.4s, v3.8h,  XFIX_N_2_562
        smull2          v30.4s, v6.8h,  XFIX_N_1_961
        smull2          v31.4s, v7.8h,  XFIX_N_0_390
        smull           v2.4s,  v2.4h,  XFIX_N_0_899    /* z1 *= -0.899976223 */
        smull           v3.4s,  v3.4h,  XFIX_N_2_562    /* z2 *= -2.562915447 */
        smull           v6.4s,  v6.4h,  XFIX_N_1_961    /* z3 *= -1.961570560 */
        smull           v7.4s,  v7.4h,  XFIX_N_0_390    /* z4 *= -0.390180644 */

        add             v6.4s,  v6.4s,  v4.4s   /* z3 += z5 */
        add             v30.4s, v30.4s, v5.4s
        add             v7.4s,  v7.4s,  v4.4s   /* z4 += z5 */
        add             v31.4s, v31.4s, v5.4s

        add             v23.4s, v23.4s, v2.4s   /* tmp4 += z1 */
        add             v24.4s, v24.4s, v28.4s
        add             v21.4s, v21.4s, v3.4s   /* tmp5 += z2 */
        add             v25.4s, v25.4s, v29.4s
        add             v19.4s, v19.4s, v6.4s   /* tmp6 += z3 */
        add             v26.4s, v26.4s, v30.4s
        add             v17.4s, v17.4s, v7.4s   /* tmp7 += z4 */
        add             v27.4s, v27.4s, v31.4s

        add             v23.4s, v23.4s, v6.4s   /* tmp4 += z3 */
        add             v24.4s, v24.4s, v30.4s
        add             v21.4s, v21.4s, v7.4s   /* tmp5 += z4 */
        add             v25.4s, v25.4s, v31.4s
        add             v19.4s, v19.4s, v3.4s   /* tmp6 += z2 */
        add             v26.4s, v26.4s, v29.4s
        add             v17.4s, v17.4s, v2.4s   /* tmp7 += z1 */
        add             v27.4s, v27.4s, v28.4s

        rshrn           v23.4h, v23.4s, #\descale
        rshrn           v21.4h, v21.4s, #\descale
        rshrn           v19.4h, v19.4s, #\descale
        rshrn           v17.4h, v17.4s, #\descale
        rshrn2          v23.8h, v24.4s, #\descale       /* dataptr[7] = tmp4 + z1 + z3 */
        rshrn2          v21.8h, v25.4s, #\descale       /* dataptr[5] = tmp5 + z2 + z4 */
        rshrn2          v19.8h, v26.4s, #\descale       /* dataptr[3] = tmp6 + z2 + z3 */
        rshrn2          v17.8h, v27.4s, #\descale       /* dataptr[1] = tmp7 + z1 + z4 */
.endm

/*
 * ff_fdct_neon
 *
 * Slower but more accurate integer forward DCT on one 8x8 block of 16-bit
 * values, transformed in place.  The arithmetic follows the IJG jfdctint.c
 * (the libjpeg-turbo routine jsimd_fdct_islow_neon); see jfdctint.c for the
 * derivation.
 *
 * C prototype: presumably void ff_fdct_neon(int16_t *block) -- confirm
 * against the C-side declaration, which is not part of this file.
 *
 *   In:      x0 = pointer to 64 contiguous 16-bit values (128 bytes)
 *   Out:     the same 128 bytes are overwritten with the result
 *   Clobber: x0 (left pointing 64 bytes past the block start), x9,
 *            v0-v7, v16-v31
 *   Stack:   none.  Only v0-v7 and v16-v31 are touched, which AAPCS64
 *            treats as caller-saved, so nothing is spilled.
 */
function ff_fdct_neon, export=1

        DATA            .req x0
        CONSTS          .req x9

        /* Multiplier table -> v0 (lanes 0-7) and v1 (lanes 0-3 used) */
        movrel          CONSTS, jsimd_fdct_islow_neon_consts
        ld1             {v0.8h, v1.8h}, [CONSTS]

        /*
         * Load the block, eight 16-bit values per register:
         *   v16 = elements 0-7, v17 = elements 8-15, ... v23 = elements 56-63
         * (i.e. one row per register for a row-major 8x8 block).
         * The post-increment is undone so DATA is valid for the final store.
         */
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
        sub             DATA, DATA, #64

        /*
         * Pass 1.  transpose_8x8H comes from neon.S; v31 and v2 are handed
         * to it as its two trailing operands (presumably scratch -- both are
         * dead at this point).  Outputs 0/4 are scaled up by PASS1_BITS.
         */
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
        fdct_islow_pass shl, DESCALE_P1

        /*
         * Pass 2, on the transposed pass-1 result.  Outputs 0/4 are scaled
         * back down by PASS1_BITS with rounding, and the products are
         * descaled by CONST_BITS + PASS1_BITS.
         */
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v31, v2
        fdct_islow_pass srshr, DESCALE_P2

        /* Write the result back over the input block */
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

        ret

        .unreq          DATA
        .unreq          CONSTS
endfunc

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072