/* jsimd_neon.S (98482B) */
1 /* 2 * Armv8 Neon optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). 5 * All Rights Reserved. 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 7 * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> 9 * Copyright (C) 2014-2016, 2020, D. R. Commander. All Rights Reserved. 10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. 11 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. 12 * 13 * This software is provided 'as-is', without any express or implied 14 * warranty. In no event will the authors be held liable for any damages 15 * arising from the use of this software. 16 * 17 * Permission is granted to anyone to use this software for any purpose, 18 * including commercial applications, and to alter it and redistribute it 19 * freely, subject to the following restrictions: 20 * 21 * 1. The origin of this software must not be misrepresented; you must not 22 * claim that you wrote the original software. If you use this software 23 * in a product, an acknowledgment in the product documentation would be 24 * appreciated but is not required. 25 * 2. Altered source versions must be plainly marked as such, and must not be 26 * misrepresented as being the original software. 27 * 3. This notice may not be removed or altered from any source distribution. 
28 */ 29 30 #if defined(__linux__) && defined(__ELF__) 31 .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */ 32 #endif 33 34 #if defined(__APPLE__) 35 .section __DATA, __const 36 #elif defined(_WIN32) 37 .section .rdata 38 #else 39 .section .rodata, "a", %progbits 40 #endif 41 42 /* Constants for jsimd_idct_islow_neon() */ 43 44 #define F_0_298 2446 /* FIX(0.298631336) */ 45 #define F_0_390 3196 /* FIX(0.390180644) */ 46 #define F_0_541 4433 /* FIX(0.541196100) */ 47 #define F_0_765 6270 /* FIX(0.765366865) */ 48 #define F_0_899 7373 /* FIX(0.899976223) */ 49 #define F_1_175 9633 /* FIX(1.175875602) */ 50 #define F_1_501 12299 /* FIX(1.501321110) */ 51 #define F_1_847 15137 /* FIX(1.847759065) */ 52 #define F_1_961 16069 /* FIX(1.961570560) */ 53 #define F_2_053 16819 /* FIX(2.053119869) */ 54 #define F_2_562 20995 /* FIX(2.562915447) */ 55 #define F_3_072 25172 /* FIX(3.072711026) */ 56 57 .balign 16 58 Ljsimd_idct_islow_neon_consts: 59 .short F_0_298 60 .short -F_0_390 61 .short F_0_541 62 .short F_0_765 63 .short - F_0_899 64 .short F_1_175 65 .short F_1_501 66 .short - F_1_847 67 .short - F_1_961 68 .short F_2_053 69 .short - F_2_562 70 .short F_3_072 71 .short 0 /* padding */ 72 .short 0 73 .short 0 74 .short 0 75 76 #undef F_0_298 77 #undef F_0_390 78 #undef F_0_541 79 #undef F_0_765 80 #undef F_0_899 81 #undef F_1_175 82 #undef F_1_501 83 #undef F_1_847 84 #undef F_1_961 85 #undef F_2_053 86 #undef F_2_562 87 #undef F_3_072 88 89 /* Constants for jsimd_ycc_*_neon() */ 90 91 .balign 16 92 Ljsimd_ycc_rgb_neon_consts: 93 .short 0, 0, 0, 0 94 .short 22971, -11277, -23401, 29033 95 .short -128, -128, -128, -128 96 .short -128, -128, -128, -128 97 98 /* Constants for jsimd_*_ycc_neon() */ 99 100 .balign 16 101 Ljsimd_rgb_ycc_neon_consts: 102 .short 19595, 38470, 7471, 11059 103 .short 21709, 32768, 27439, 5329 104 .short 32767, 128, 32767, 128 105 .short 32767, 128, 32767, 128 106 107 /* Constants for jsimd_fdct_islow_neon() */ 108 109 
#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/* Same fixed-point table layout as the islow IDCT constants above:
 * 12 coefficients (negated entries pre-negated) plus padding to 16. */
.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0  /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Constants for jsimd_huff_encode_one_block_neon() */

/* Byte-shuffle index tables (255 = "no source byte" lane).  The L<n>
 * annotations are the original authors' notes about which rows of the
 * coefficient block each shuffle covers. */
.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
  .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
  .byte 0,    1,    2,    3,   16,   17,   32,   33, \
        18,   19,   4,    5,    6,    7,   20,   21   /* L0 => L3 : 4 lines OK */
  .byte 34,  35,   48,   49,  255,  255,   50,   51, \
        36,  37,   22,   23,    8,    9,   10,   11   /* L0 => L3 : 4 lines OK */
  .byte 8,    9,   22,   23,   36,   37,   50,   51, \
        255, 255, 255,  255,  255,  255,   52,   53   /* L1 => L4 : 4 lines OK */
  .byte 54,  55,   40,   41,   26,   27,   12,   13, \
        14,  15,   28,   29,   42,   43,   56,   57   /* L0 => L3 : 4 lines OK */
  .byte 6,    7,   20,   21,   34,   35,   48,   49, \
        50,  51,   36,   37,   22,   23,    8,    9   /* L4 => L7 : 4 lines OK */
  .byte 42,  43,   28,   29,   14,   15,   30,   31, \
        44,  45,   58,   59,  255,  255,  255,  255   /* L1 => L4 : 4 lines OK */
  .byte 255, 255, 255,  255,   56,   57,   42,   43, \
        28,  29,   14,   15,   30,   31,   44,   45   /* L3 => L6 : 4 lines OK */
  .byte 26,  27,   40,   41,   42,   43,   28,   29, \
        14,  15,   30,   31,   44,   45,   46,   47   /* L5 => L7 : 3 lines OK */
  .byte 255, 255, 255,  255,    0,    1,  255,  255, \
        255, 255, 255,  255,  255,  255,  255,  255   /* L4 : 1 lines OK */
  .byte 255, 255, 255,  255,  255,  255,  255,  255, \
        0,    1,   16,   17,    2,    3,  255,  255   /* L5 => L6 : 2 lines OK */
  .byte 255, 255, 255,  255,  255,  255,  255,  255, \
        255, 255, 255,  255,    8,    9,   22,   23   /* L5 => L6 : 2 lines OK */
  .byte 4,    5,    6,    7,  255,  255,  255,  255, \
        255, 255, 255,  255,  255,  255,  255,  255   /* L7 : 1 line OK */

.text


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Get symbol location (PC-relative: adrp + low-12-bit add; Mach-O uses
 * @PAGE/@PAGEOFF relocations instead of :lo12:) */
.macro get_symbol_loc reg, symbol
#ifdef __APPLE__
    adrp \reg, \symbol@PAGE
    add \reg, \reg, \symbol@PAGEOFF
#else
    adrp \reg, \symbol
    add \reg, \reg, :lo12:\symbol
#endif
.endm

/* 8x8 halfword transpose: 16-bit, then 32-bit, then 64-bit interleave
 * passes using trn1/trn2, with t0-t3 as scratch. */
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1 \t0\().8h, \l0\().8h, \l1\().8h
    trn1 \t1\().8h, \l2\().8h, \l3\().8h
    trn1 \t2\().8h, \l4\().8h, \l5\().8h
    trn1 \t3\().8h, \l6\().8h, \l7\().8h
    trn2 \l1\().8h, \l0\().8h, \l1\().8h
    trn2 \l3\().8h, \l2\().8h, \l3\().8h
    trn2 \l5\().8h, \l4\().8h, \l5\().8h
    trn2 \l7\().8h, \l6\().8h, \l7\().8h

    trn1 \l4\().4s, \t2\().4s, \t3\().4s
    trn2 \t3\().4s, \t2\().4s, \t3\().4s
    trn1 \t2\().4s, \t0\().4s, \t1\().4s
    trn2 \l2\().4s, \t0\().4s, \t1\().4s
    trn1 \t0\().4s, \l1\().4s, \l3\().4s
trn2 \l3\().4s, \l1\().4s, \l3\().4s 233 trn2 \t1\().4s, \l5\().4s, \l7\().4s 234 trn1 \l5\().4s, \l5\().4s, \l7\().4s 235 236 trn2 \l6\().2d, \l2\().2d, \t3\().2d 237 trn1 \l0\().2d, \t2\().2d, \l4\().2d 238 trn1 \l1\().2d, \t0\().2d, \l5\().2d 239 trn2 \l7\().2d, \l3\().2d, \t1\().2d 240 trn1 \l2\().2d, \l2\().2d, \t3\().2d 241 trn2 \l4\().2d, \t2\().2d, \l4\().2d 242 trn1 \l3\().2d, \l3\().2d, \t1\().2d 243 trn2 \l5\().2d, \t0\().2d, \l5\().2d 244 .endm 245 246 247 #define CENTERJSAMPLE 128 248 249 /*****************************************************************************/ 250 251 /* 252 * Perform dequantization and inverse DCT on one block of coefficients. 253 * 254 * GLOBAL(void) 255 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block, 256 * JSAMPARRAY output_buf, JDIMENSION output_col) 257 */ 258 259 #define CONST_BITS 13 260 #define PASS1_BITS 2 261 262 #define XFIX_P_0_298 v0.h[0] 263 #define XFIX_N_0_390 v0.h[1] 264 #define XFIX_P_0_541 v0.h[2] 265 #define XFIX_P_0_765 v0.h[3] 266 #define XFIX_N_0_899 v0.h[4] 267 #define XFIX_P_1_175 v0.h[5] 268 #define XFIX_P_1_501 v0.h[6] 269 #define XFIX_N_1_847 v0.h[7] 270 #define XFIX_N_1_961 v1.h[0] 271 #define XFIX_P_2_053 v1.h[1] 272 #define XFIX_N_2_562 v1.h[2] 273 #define XFIX_P_3_072 v1.h[3] 274 275 asm_function jsimd_idct_islow_neon 276 DCT_TABLE .req x0 277 COEF_BLOCK .req x1 278 OUTPUT_BUF .req x2 279 OUTPUT_COL .req x3 280 TMP1 .req x0 281 TMP2 .req x1 282 TMP3 .req x9 283 TMP4 .req x10 284 TMP5 .req x11 285 TMP6 .req x12 286 TMP7 .req x13 287 TMP8 .req x14 288 289 /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't 290 guarantee that the upper (unused) 32 bits of x3 are valid. This 291 instruction ensures that those bits are set to zero. 
*/ 292 uxtw x3, w3 293 294 sub sp, sp, #64 295 get_symbol_loc x15, Ljsimd_idct_islow_neon_consts 296 mov x10, sp 297 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32 298 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32 299 ld1 {v0.8h, v1.8h}, [x15] 300 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64 301 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64 302 ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64 303 ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64 304 305 cmeq v16.8h, v3.8h, #0 306 cmeq v26.8h, v4.8h, #0 307 cmeq v27.8h, v5.8h, #0 308 cmeq v28.8h, v6.8h, #0 309 cmeq v29.8h, v7.8h, #0 310 cmeq v30.8h, v8.8h, #0 311 cmeq v31.8h, v9.8h, #0 312 313 and v10.16b, v16.16b, v26.16b 314 and v11.16b, v27.16b, v28.16b 315 and v12.16b, v29.16b, v30.16b 316 and v13.16b, v31.16b, v10.16b 317 and v14.16b, v11.16b, v12.16b 318 mul v2.8h, v2.8h, v18.8h 319 and v15.16b, v13.16b, v14.16b 320 shl v10.8h, v2.8h, #(PASS1_BITS) 321 sqxtn v16.8b, v15.8h 322 mov TMP1, v16.d[0] 323 mvn TMP2, TMP1 324 325 cbnz TMP2, 2f 326 /* case all AC coeffs are zeros */ 327 dup v2.2d, v10.d[0] 328 dup v6.2d, v10.d[1] 329 mov v3.16b, v2.16b 330 mov v7.16b, v6.16b 331 mov v4.16b, v2.16b 332 mov v8.16b, v6.16b 333 mov v5.16b, v2.16b 334 mov v9.16b, v6.16b 335 1: 336 /* for this transpose, we should organise data like this: 337 * 00, 01, 02, 03, 40, 41, 42, 43 338 * 10, 11, 12, 13, 50, 51, 52, 53 339 * 20, 21, 22, 23, 60, 61, 62, 63 340 * 30, 31, 32, 33, 70, 71, 72, 73 341 * 04, 05, 06, 07, 44, 45, 46, 47 342 * 14, 15, 16, 17, 54, 55, 56, 57 343 * 24, 25, 26, 27, 64, 65, 66, 67 344 * 34, 35, 36, 37, 74, 75, 76, 77 345 */ 346 trn1 v28.8h, v2.8h, v3.8h 347 trn1 v29.8h, v4.8h, v5.8h 348 trn1 v30.8h, v6.8h, v7.8h 349 trn1 v31.8h, v8.8h, v9.8h 350 trn2 v16.8h, v2.8h, v3.8h 351 trn2 v17.8h, v4.8h, v5.8h 352 trn2 v18.8h, v6.8h, v7.8h 353 trn2 v19.8h, v8.8h, v9.8h 354 trn1 v2.4s, v28.4s, v29.4s 355 trn1 v6.4s, v30.4s, v31.4s 356 trn1 v3.4s, v16.4s, v17.4s 357 trn1 v7.4s, v18.4s, v19.4s 358 
trn2 v4.4s, v28.4s, v29.4s 359 trn2 v8.4s, v30.4s, v31.4s 360 trn2 v5.4s, v16.4s, v17.4s 361 trn2 v9.4s, v18.4s, v19.4s 362 /* Even part: reverse the even part of the forward DCT. */ 363 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ 364 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 365 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ 366 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 367 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ 368 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ 369 mov v21.16b, v19.16b /* tmp3 = z1 */ 370 mov v20.16b, v18.16b /* tmp3 = z1 */ 371 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ 372 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ 373 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ 374 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ 375 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ 376 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ 377 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ 378 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ 379 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ 380 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ 381 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ 382 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ 383 sub v31.4s, v23.4s, v21.4s /* tmp13h 
tmp13 = tmp0 - tmp3; */ 384 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ 385 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ 386 387 /* Odd part per figure 8; the matrix is unitary and hence its 388 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 389 */ 390 391 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 392 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 393 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 394 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 395 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ 396 397 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 398 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 399 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 400 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 401 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 402 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 403 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 404 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 405 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 406 407 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 408 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 409 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 410 smull 
v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 411 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 412 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 413 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 414 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 415 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 416 417 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ 418 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 419 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ 420 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 421 422 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ 423 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ 424 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ 425 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ 426 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ 427 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ 428 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ 429 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ 430 431 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ 432 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ 433 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ 434 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ 435 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ 436 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ 437 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ 438 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ 439 440 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 441 442 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ 443 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ 444 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ 445 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ 446 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ 447 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ 448 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ 449 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ 450 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ 451 add v27.4s, v30.4s, v13.4s 
/* tmp12 + tmp1 */ 452 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ 453 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ 454 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ 455 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ 456 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ 457 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ 458 459 shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ 460 shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ 461 shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ 462 shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ 463 shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ 464 shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ 465 shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ 466 shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */ 467 shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */ 468 shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */ 469 shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */ 470 shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */ 471 shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */ 472 shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */ 473 shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */ 474 shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, 
CONST_BITS+PASS1_BITS+3) */ 475 movi v0.16b, #(CENTERJSAMPLE) 476 /* Prepare pointers (dual-issue with Neon instructions) */ 477 ldp TMP1, TMP2, [OUTPUT_BUF], 16 478 sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 479 ldp TMP3, TMP4, [OUTPUT_BUF], 16 480 sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 481 add TMP1, TMP1, OUTPUT_COL 482 sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 483 add TMP2, TMP2, OUTPUT_COL 484 sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 485 add TMP3, TMP3, OUTPUT_COL 486 sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 487 add TMP4, TMP4, OUTPUT_COL 488 sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 489 ldp TMP5, TMP6, [OUTPUT_BUF], 16 490 sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 491 ldp TMP7, TMP8, [OUTPUT_BUF], 16 492 sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16) 493 add TMP5, TMP5, OUTPUT_COL 494 add v16.16b, v28.16b, v0.16b 495 add TMP6, TMP6, OUTPUT_COL 496 add v18.16b, v29.16b, v0.16b 497 add TMP7, TMP7, OUTPUT_COL 498 add v20.16b, v30.16b, v0.16b 499 add TMP8, TMP8, OUTPUT_COL 500 add v22.16b, v31.16b, v0.16b 501 502 /* Transpose the final 8-bit samples */ 503 trn1 v28.16b, v16.16b, v18.16b 504 trn1 v30.16b, v20.16b, v22.16b 505 trn2 v29.16b, v16.16b, v18.16b 506 trn2 v31.16b, v20.16b, v22.16b 507 508 trn1 v16.8h, v28.8h, v30.8h 509 trn2 v18.8h, v28.8h, v30.8h 510 trn1 v20.8h, v29.8h, v31.8h 511 trn2 v22.8h, v29.8h, v31.8h 512 513 uzp1 v28.4s, v16.4s, v18.4s 514 uzp2 v30.4s, v16.4s, v18.4s 515 uzp1 v29.4s, v20.4s, v22.4s 516 uzp2 v31.4s, v20.4s, v22.4s 517 518 /* Store results to the output buffer */ 519 st1 {v28.d}[0], [TMP1] 520 st1 {v29.d}[0], [TMP2] 521 st1 {v28.d}[1], [TMP3] 522 st1 {v29.d}[1], [TMP4] 523 st1 {v30.d}[0], [TMP5] 524 st1 {v31.d}[0], [TMP6] 525 st1 {v30.d}[1], [TMP7] 526 st1 {v31.d}[1], [TMP8] 527 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32 528 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32 529 blr x30 530 
531 .balign 16 532 2: 533 mul v3.8h, v3.8h, v19.8h 534 mul v4.8h, v4.8h, v20.8h 535 mul v5.8h, v5.8h, v21.8h 536 add TMP4, xzr, TMP2, LSL #32 537 mul v6.8h, v6.8h, v22.8h 538 mul v7.8h, v7.8h, v23.8h 539 adds TMP3, xzr, TMP2, LSR #32 540 mul v8.8h, v8.8h, v24.8h 541 mul v9.8h, v9.8h, v25.8h 542 b.ne 3f 543 /* Right AC coef is zero */ 544 dup v15.2d, v10.d[1] 545 /* Even part: reverse the even part of the forward DCT. */ 546 add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ 547 add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 548 sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 549 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ 550 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ 551 mov v20.16b, v18.16b /* tmp3 = z1 */ 552 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ 553 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ 554 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ 555 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ 556 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ 557 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ 558 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ 559 560 /* Odd part per figure 8; the matrix is unitary and hence its 561 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
562 */ 563 564 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 565 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 566 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 567 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 568 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ 569 570 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 571 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 572 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 573 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 574 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 575 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 576 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 577 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 578 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 579 580 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 581 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 582 583 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ 584 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ 585 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ 586 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ 587 588 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ 589 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ 590 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ 591 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ 592 593 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 594 595 
add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ 596 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ 597 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ 598 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ 599 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ 600 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ 601 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ 602 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ 603 604 rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 605 rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 606 rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 607 rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 608 rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 609 rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 610 rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ 611 rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ 612 mov v6.16b, v15.16b 613 mov v7.16b, v15.16b 614 mov v8.16b, v15.16b 615 mov v9.16b, v15.16b 616 b 1b 617 618 .balign 16 619 3: 620 cbnz TMP4, 4f 621 /* Left AC coef is zero */ 622 dup v14.2d, v10.d[0] 623 /* Even part: reverse the even part of the forward DCT. 
*/ 624 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ 625 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 626 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ 627 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 628 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ 629 mov v21.16b, v19.16b /* tmp3 = z1 */ 630 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ 631 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ 632 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ 633 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ 634 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ 635 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ 636 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ 637 638 /* Odd part per figure 8; the matrix is unitary and hence its 639 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
640 */ 641 642 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 643 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 644 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 645 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 646 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ 647 648 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 649 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 650 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 651 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 652 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 653 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 654 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 655 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 656 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 657 658 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ 659 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 660 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ 661 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 662 663 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ 664 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ 665 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ 666 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ 667 668 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ 669 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ 670 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ 671 add v15.4s, v15.4s, 
v23.4s /* tmp2 += z3 */ 672 673 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 674 675 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ 676 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ 677 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ 678 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ 679 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ 680 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ 681 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ 682 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ 683 684 mov v2.16b, v14.16b 685 mov v3.16b, v14.16b 686 mov v4.16b, v14.16b 687 mov v5.16b, v14.16b 688 rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 689 rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 690 rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 691 rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 692 rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 693 rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 694 rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ 695 rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ 696 b 1b 697 698 .balign 16 699 4: 700 /* "No" AC coef is zero */ 701 /* Even part: reverse the even part of the forward DCT. 
*/ 702 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */ 703 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 704 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ 705 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */ 706 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */ 707 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ 708 mov v21.16b, v19.16b /* tmp3 = z1 */ 709 mov v20.16b, v18.16b /* tmp3 = z1 */ 710 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ 711 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */ 712 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ 713 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ 714 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */ 715 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */ 716 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */ 717 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */ 718 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */ 719 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */ 720 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */ 721 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */ 722 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */ 723 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */ 724 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */ 725 726 /* Odd part per figure 
8; the matrix is unitary and hence its 727 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 728 */ 729 730 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 731 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 732 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 733 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 734 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */ 735 736 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 737 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 738 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 739 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 740 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 741 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 742 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 743 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 744 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 745 746 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 747 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 748 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 749 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 750 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 751 smull v22.4s, 
v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 752 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 753 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 754 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 755 756 add v23.4s, v23.4s, v27.4s /* z3 += z5 */ 757 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 758 add v25.4s, v25.4s, v27.4s /* z4 += z5 */ 759 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 760 761 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */ 762 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ 763 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */ 764 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ 765 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */ 766 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ 767 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */ 768 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ 769 770 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */ 771 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ 772 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */ 773 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ 774 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */ 775 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ 776 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */ 777 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ 778 779 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 780 781 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ 782 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */ 783 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ 784 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */ 785 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ 786 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */ 787 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ 788 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */ 789 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ 790 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */ 791 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ 792 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */ 793 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ 794 add 
v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */ 795 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ 796 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */ 797 798 rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 799 rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 800 rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 801 rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 802 rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 803 rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 804 rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 805 rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 806 rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 807 rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 808 rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ 809 rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ 810 rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 811 rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 812 rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS) /* 
wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

    /* Release the symbolic register names used by jsimd_idct_islow_neon(). */
    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */

/* Load \size samples from each of the Y, U (Cb), and V (Cr) row pointers into
 * v0, v4, and v5 respectively, post-incrementing the pointers.  \size == 8
 * fills whole 8-byte registers (with prefetch of the next cache lines); the
 * partial sizes 4, 2, and 1 fill successive byte lanes (0-3, then 4-5, then
 * 6), so that a leftover row of up to 7 pixels can be assembled into one
 * register set by invoking this macro up to three times.
 */
.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [U], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm

/* Store \size converted pixels to [RGB], post-incrementing the pointer.
 * For 24 bpp, the three channel registers v10/v11/v12 are interleaved --
 * either with st3, or byte-by-byte with st1 when \fast_st3 == 0 (a fallback
 * for cores where st3 is slow; NOTE(review): presumably -- confirm against
 * the project's dispatch logic, which is outside this file).  For 32 bpp,
 * v13 supplies the fourth channel via st4.  For 16 bpp (handled in the
 * continuation of this macro below), v25 already holds packed RGB565
 * halfwords.  Partial sizes use the same lane positions as do_load above.
 */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3             {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1             {v10.b}[0], [RGB], #1
        st1             {v11.b}[0], [RGB], #1
        st1             {v12.b}[0], [RGB], #1

        st1             {v10.b}[1], [RGB], #1
        st1             {v11.b}[1], [RGB], #1
        st1             {v12.b}[1], [RGB], #1

        st1             {v10.b}[2], [RGB], #1
        st1             {v11.b}[2], [RGB], #1
        st1             {v12.b}[2], [RGB], #1

        st1             {v10.b}[3], [RGB], #1
        st1             {v11.b}[3], [RGB], #1
        st1             {v12.b}[3], [RGB], #1

        st1             {v10.b}[4], [RGB], #1
        st1             {v11.b}[4], [RGB], #1
        st1             {v12.b}[4], [RGB], #1

        st1             {v10.b}[5], [RGB], #1
        st1             {v11.b}[5], [RGB], #1
        st1             {v12.b}[5], [RGB], #1

        st1             {v10.b}[6], [RGB], #1
        st1             {v11.b}[6], [RGB], #1
        st1             {v12.b}[6], [RGB], #1

        st1             {v10.b}[7], [RGB], #1
        st1             {v11.b}[7], [RGB], #1
        st1             {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3             {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3             {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3             {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3             {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3             {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3             {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3             {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4             {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4             {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4             {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4             {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4             {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4             {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp
== 16
    /* RGB565: v25 already holds the packed 16-bit pixels (see stage 2). */
    .if \size == 8
      st1             {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1             {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1             {v25.h}[4], [RGB], 2
      st1             {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1             {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

/* Instantiate one YCbCr -> RGB row-conversion function for the pixel format
 * described by the arguments: \colorid names the function, \bpp selects the
 * store form (24/32/16), \r_offs/\g_offs/\b_offs select which of v10..v13
 * receives each channel (v1\r_offs\defsize expands to e.g. v10.8b), and
 * \fast_st3 selects between the st3 and byte-wise 24-bpp store paths.
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen Cb/Cr (biased by the -128 constants in v2) and form the
 * 32-bit chroma products using the coefficients preloaded into v1.h[0..3]
 * (22971, -11277, -23401, 29033 -- see Ljsimd_ycc_rgb_neon_consts).
 */
.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Stage 2: descale the chroma terms with rounding, add the luma bytes (v0),
 * and either saturate-narrow each channel to bytes (24/32 bpp) or pack the
 * three channels into RGB565 halfwords in v25 (16 bpp).
 */
.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm

/* Software-pipelined inner-loop body: stage 2 of the previous 8 pixels is
 * interleaved with the loads and stage 1 of the next 8, hiding load latency.
 * The instruction order is deliberate scheduling -- do not reorder.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Non-pipelined conversion of one (possibly partial) batch of pixels. */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    /* Argument registers (per AAPCS64: args in x0-x7). */
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts

    /* Save Neon registers (v8-v15 are callee-saved in their low 64 bits,
     * which is all this function clobbers). */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f                      /* fewer than 8 pixels in the row */
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:  /* Steady state: pipelined 8-pixel iterations. */
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:  /* Drain the pipeline: finish and store the last full batch. */
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:  /* Assemble the 1-7 leftover pixels from the low bits of N. */
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 *
jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */

/* Store \size result bytes from v20 (Y), v21 (Cb), and v22 (Cr) to the three
 * plane pointers, post-incrementing them.  Partial sizes 4/2/1 store from
 * successive byte lanes (0-3, 4-5, 6), mirroring the lane layout used by
 * do_load below.
 */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm

/* Load \size pixels from [RGB] and de-interleave the channels into
 * v10/v11/v12 (and v13 for 32 bpp), post-incrementing the pointer.  For
 * 24 bpp, \fast_ld3 == 0 selects a byte-by-byte ld1 fallback in place of
 * ld3 (for cores where ld3 is slow; NOTE(review): presumably -- confirm
 * against the project's dispatch logic, which is outside this file).
 */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3             {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1             {v10.b}[0], [RGB], #1
        ld1             {v11.b}[0], [RGB], #1
        ld1             {v12.b}[0], [RGB], #1

        ld1             {v10.b}[1], [RGB], #1
        ld1             {v11.b}[1], [RGB], #1
        ld1             {v12.b}[1], [RGB], #1

        ld1             {v10.b}[2], [RGB], #1
        ld1             {v11.b}[2], [RGB], #1
        ld1             {v12.b}[2], [RGB], #1

        ld1             {v10.b}[3], [RGB], #1
        ld1             {v11.b}[3], [RGB], #1
        ld1             {v12.b}[3], [RGB], #1

        ld1             {v10.b}[4], [RGB], #1
        ld1             {v11.b}[4], [RGB], #1
        ld1             {v12.b}[4], [RGB], #1

        ld1             {v10.b}[5], [RGB], #1
        ld1             {v11.b}[5], [RGB], #1
        ld1             {v12.b}[5], [RGB], #1

        ld1             {v10.b}[6], [RGB], #1
        ld1             {v11.b}[6], [RGB], #1
        ld1             {v12.b}[6], [RGB], #1

        ld1             {v10.b}[7], [RGB], #1
        ld1             {v11.b}[7], [RGB], #1
        ld1             {v12.b}[7], [RGB], #1
      .endif
      prfm            pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3             {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3             {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3             {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3             {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3             {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3             {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3             {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm            pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4             {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4             {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4             {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4             {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4             {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4             {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

/* Instantiate one RGB -> YCbCr row-conversion function.  \colorid names the
 * function, \bpp selects the load form (24/32), \r_offs/\g_offs/\b_offs pick
 * which of v10..v13 holds each channel, and \fast_ld3 selects between the
 * ld3 and byte-wise 24-bpp load paths.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen R/G/B to 16 bits and accumulate the 32-bit fixed-point
 * channel products using the coefficients preloaded into v0.h[0..7]
 * (19595, 38470, 7471, 11059, 21709, 32768, 27439, 5329 -- see
 * Ljsimd_rgb_ycc_neon_consts).  The rev64 of v1 seeds the Cb/Cr
 * accumulators with the rounding/bias constants from that table.
 */
.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

/* Stage 2: descale the accumulators by 16 bits (rounding for Y, truncating
 * for Cb/Cr) and narrow the results to bytes in v20/v21/v22.
 */
.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h          /* v20 = y */
    xtn             v21.8b, v22.8h          /* v21 = u */
    xtn             v22.8b, v24.8h          /* v22 = v */
.endm

/* Non-pipelined conversion of one (possibly partial) batch of pixels. */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 * AArch64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    /* Argument registers (per AAPCS64: args in x0-x7). */
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants to d0, d1, d2, d3 */
    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
    ld1             {v0.8h, v1.8h}, [x13]

    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save Neon registers (v8-v15 are callee-saved in their low 64 bits,
     * which is all this function clobbers). */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f                      /* fewer than 8 pixels in the row */
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:  /* Steady state: pipelined 8-pixel iterations. */
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:  /* Drain the pipeline: finish and store the last full batch. */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:  /* Assemble the 1-7 leftover pixels from the low bits of N. */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon
extxrgb, 32, 1, 2, 3, 1 1522 1523 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 1524 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 1525 1526 .purgem do_load 1527 .purgem do_store 1528 1529 1530 /*****************************************************************************/ 1531 1532 /* 1533 * jsimd_fdct_islow_neon 1534 * 1535 * This file contains a slower but more accurate integer implementation of the 1536 * forward DCT (Discrete Cosine Transform). The following code is based 1537 * directly on the IJG''s original jfdctint.c; see the jfdctint.c for 1538 * more details. 1539 * 1540 * TODO: can be combined with 'jsimd_convsamp_neon' to get 1541 * rid of a bunch of VLD1.16 instructions 1542 */ 1543 1544 #define CONST_BITS 13 1545 #define PASS1_BITS 2 1546 1547 #define DESCALE_P1 (CONST_BITS - PASS1_BITS) 1548 #define DESCALE_P2 (CONST_BITS + PASS1_BITS) 1549 1550 #define XFIX_P_0_298 v0.h[0] 1551 #define XFIX_N_0_390 v0.h[1] 1552 #define XFIX_P_0_541 v0.h[2] 1553 #define XFIX_P_0_765 v0.h[3] 1554 #define XFIX_N_0_899 v0.h[4] 1555 #define XFIX_P_1_175 v0.h[5] 1556 #define XFIX_P_1_501 v0.h[6] 1557 #define XFIX_N_1_847 v0.h[7] 1558 #define XFIX_N_1_961 v1.h[0] 1559 #define XFIX_P_2_053 v1.h[1] 1560 #define XFIX_N_2_562 v1.h[2] 1561 #define XFIX_P_3_072 v1.h[3] 1562 1563 asm_function jsimd_fdct_islow_neon 1564 1565 DATA .req x0 1566 TMP .req x9 1567 1568 /* Load constants */ 1569 get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts 1570 ld1 {v0.8h, v1.8h}, [TMP] 1571 1572 /* Save Neon registers */ 1573 sub sp, sp, #64 1574 mov x10, sp 1575 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 1576 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 1577 1578 /* Load all DATA into Neon registers with the following allocation: 1579 * 0 1 2 3 | 4 5 6 7 1580 * ---------+-------- 1581 * 0 | d16 | d17 | v16.8h 1582 * 1 | d18 | d19 | v17.8h 1583 * 2 | d20 | d21 | v18.8h 1584 * 3 | d22 | d23 | v19.8h 1585 * 4 | d24 | d25 | v20.8h 1586 * 5 | d26 | d27 | v21.8h 1587 * 
6 | d28 | d29 | v22.8h 1588 * 7 | d30 | d31 | v23.8h 1589 */ 1590 1591 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 1592 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] 1593 sub DATA, DATA, #64 1594 1595 /* Transpose */ 1596 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 1597 /* 1-D FDCT */ 1598 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ 1599 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ 1600 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ 1601 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ 1602 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ 1603 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ 1604 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ 1605 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ 1606 1607 /* even part */ 1608 1609 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ 1610 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ 1611 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ 1612 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ 1613 1614 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ 1615 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ 1616 1617 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ 1618 1619 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ 1620 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ 1621 1622 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 1623 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 1624 mov v22.16b, v18.16b 1625 mov v25.16b, v24.16b 1626 1627 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 1628 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 1629 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, 
XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    /* Descale the even-part products and narrow 32 -> 16 bits with rounding,
     * packing the low/high halves back into one 8x16-bit vector each. */
    rshrn           v18.4h, v18.4s, #DESCALE_P1
    rshrn           v22.4h, v22.4s, #DESCALE_P1
    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    /* Widening multiplies: "2" forms consume the high 8x16 halves, the plain
     * forms the low halves, giving 32-bit products for all 8 lanes. */
    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose so that pass 2 operates on columns of the pass-1 result. */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT, second pass (same butterfly structure as pass 1, but the
     * final descale uses DESCALE_P2 and dataptr[0]/[4] use PASS1_BITS). */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    /* Duplicate z1 so both dataptr[2] and dataptr[6] can accumulate onto it. */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore Neon registers (callee-saved d8-d15, low 64 bits per AAPCS64) */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    /* ret, not "br x30": ret keeps the return-address predictor in sync and,
     * unlike an indirect br, is permitted under Branch Target Identification
     * (BTI), where br targets must be BTI landing pads. */
    ret

    .unreq          DATA
    .unreq          TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 * Register usage (visible in the code below):
 *   x0 = state (bit buffer saved/restored at [x0, #0x10] / [x0, #0x18])
 *   x1 = buffer (output byte pointer, aliased as BUFFER)
 *   x2 = block (DCT coefficients; DC coefficient loaded from [x2])
 *   w3 = last_dc_val (subtracted from the DC coefficient)
 *   x4 = dctbl, x5 = actbl (ehufco at offset 0, ehufsi at offset 0x400)
 *   x6/x7 = Huffman bit buffer and bit count (PUT_BUFFER / PUT_BITS)
 */

BUFFER          .req x1
PUT_BUFFER      .req x6
PUT_BITS        .req x7
PUT_BITSw       .req w7

/* Emit the next byte from the top of PUT_BUFFER at bit position PUT_BITS.
 * If the byte is 0xFF, a 0x00 stuffing byte is appended (JPEG marker
 * escaping).  Clobbers x19.  BUFFER is pre-incremented, matching the
 * "BUFFER = buffer - 1" adjustment made at function entry. */
.macro emit_byte
    sub             PUT_BITS, PUT_BITS, #0x8
    lsr             x19, PUT_BUFFER, PUT_BITS
    uxtb            w19, w19
    strb            w19, [BUFFER, #1]!
    cmp             w19, #0xff
    b.ne            14f
    strb            wzr, [BUFFER, #1]!      /* 0xFF -> emit 0x00 stuffing byte */
14:
.endm

/* Append SIZE bits of CODE to the bit buffer:
 * PUT_BUFFER = (PUT_BUFFER << SIZE) | CODE;  PUT_BITS += SIZE. */
.macro put_bits CODE, SIZE
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
    add             PUT_BITS, PUT_BITS, \SIZE
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
.endm

/* Flush 4 bytes if at least 32 bits are pending, leaving headroom for the
 * next put_bits.  (Local labels 31/47 are chosen to avoid clashing with the
 * 1-6 labels used in the function body.) */
.macro checkbuf31
    cmp             PUT_BITS, #0x20
    b.lt            31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm

/* Flush 6 bytes if at least 48 bits are pending. */
.macro checkbuf47
    cmp             PUT_BITS, #0x30
    b.lt            47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm

/* Generate the Huffman encoder.  fast_tbl == 1 uses tbl/tbx lookups to
 * zigzag-reorder the whole 8x8 block with Neon; fast_tbl == 0 gathers the
 * coefficients one at a time in zigzag order with scalar address arithmetic. */
.macro generate_jsimd_huff_encode_one_block fast_tbl

.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub             sp, sp, 272             /* 16 (x19/x20) + 256 (t1/t2 buffers) */
    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
    /* Save Arm registers (x19/x20 are callee-saved and clobbered below) */
    stp             x19, x20, [sp]
    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
    ldr             PUT_BUFFER, [x0, #0x10]
    ldr             PUT_BITSw, [x0, #0x18]
    ldrsh           w12, [x2]               /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1             {v23.16b}, [x15], #16
    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub             w12, w12, w3            /* last_dc_val, not used afterwards */
    /* ZigZag 8x8: table lookups reorder v24-v31 into v0-v7; tbx patches in
     * the lanes whose sources fall outside each tbl's 4-register window. */
    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins             v0.h[0], w12            /* lane 0 = DC diff */
    tbx             v1.16b, {v28.16b}, v16.16b
    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx             v6.16b, {v31.16b}, v19.16b
.else
    /* Scalar zigzag gather: the add/ld1 pairs are interleaved so each load
     * address is computed well before its use. */
    add             x13, x2, #0x22
    sub             w12, w12, w3            /* last_dc_val, not used afterwards */
    ld1             {v23.16b}, [x15]
    add             x14, x2, #0x18
    add             x3, x2, #0x36
    ins             v0.h[0], w12
    add             x9, x2, #0x2
    ld1             {v1.h}[0], [x13]
    add             x15, x2, #0x30
    ld1             {v2.h}[0], [x14]
    add             x19, x2, #0x26
    ld1             {v3.h}[0], [x3]
    add             x20, x2, #0x28
    ld1             {v0.h}[1], [x9]
    add             x12, x2, #0x10
    ld1             {v1.h}[1], [x15]
    add             x13, x2, #0x40
    ld1             {v2.h}[1], [x19]
    add             x14, x2, #0x34
    ld1             {v3.h}[1], [x20]
    add             x3, x2, #0x1a
    ld1             {v0.h}[2], [x12]
    add             x9, x2, #0x20
    ld1             {v1.h}[2], [x13]
    add             x15, x2, #0x32
    ld1             {v2.h}[2], [x14]
    add             x19, x2, #0x42
    ld1             {v3.h}[2], [x3]
    add             x20, x2, #0xc
    ld1             {v0.h}[3], [x9]
    add             x12, x2, #0x12
    ld1             {v1.h}[3], [x15]
    add             x13, x2, #0x24
    ld1             {v2.h}[3], [x19]
    add             x14, x2, #0x50
    ld1             {v3.h}[3], [x20]
    add             x3, x2, #0xe
    ld1             {v0.h}[4], [x12]
    add             x9, x2, #0x4
    ld1             {v1.h}[4], [x13]
    add             x15, x2, #0x16
    ld1             {v2.h}[4], [x14]
    add             x19, x2, #0x60
    ld1             {v3.h}[4], [x3]
    add             x20, x2, #0x1c
    ld1             {v0.h}[5], [x9]
    add             x12, x2, #0x6
    ld1             {v1.h}[5], [x15]
    add             x13, x2, #0x8
    ld1             {v2.h}[5], [x19]
    add             x14, x2, #0x52
    ld1             {v3.h}[5], [x20]
    add             x3, x2, #0x2a
    ld1             {v0.h}[6], [x12]
    add             x9, x2, #0x14
    ld1             {v1.h}[6], [x13]
    add             x15, x2, #0xa
    ld1             {v2.h}[6], [x14]
    add             x19, x2, #0x44
    ld1             {v3.h}[6], [x3]
    add             x20, x2, #0x38
    ld1             {v0.h}[7], [x9]
    add             x12, x2, #0x46
    ld1             {v1.h}[7], [x15]
    add             x13, x2, #0x3a
    ld1             {v2.h}[7], [x19]
    add             x14, x2, #0x74
    ld1             {v3.h}[7], [x20]
    add             x3, x2, #0x6a
    ld1             {v4.h}[0], [x12]
    add             x9, x2, #0x54
    ld1             {v5.h}[0], [x13]
    add             x15, x2, #0x2c
    ld1             {v6.h}[0], [x14]
    add             x19, x2, #0x76
    ld1             {v7.h}[0], [x3]
    add             x20, x2, #0x78
    ld1             {v4.h}[1], [x9]
    add             x12, x2, #0x62
    ld1             {v5.h}[1], [x15]
    add             x13, x2, #0x1e
    ld1             {v6.h}[1], [x19]
    add             x14, x2, #0x68
    ld1             {v7.h}[1], [x20]
    add             x3, x2, #0x7a
    ld1             {v4.h}[2], [x12]
    add             x9, x2, #0x70
    ld1             {v5.h}[2], [x13]
    add             x15, x2, #0x2e
    ld1             {v6.h}[2], [x14]
    add             x19, x2, #0x5a
    ld1             {v7.h}[2], [x3]
    add             x20, x2, #0x6c
    ld1             {v4.h}[3], [x9]
    add             x12, x2, #0x72
    ld1             {v5.h}[3], [x15]
    add             x13, x2, #0x3c
    ld1             {v6.h}[3], [x19]
    add             x14, x2, #0x4c
    ld1             {v7.h}[3], [x20]
    add             x3, x2, #0x5e
    ld1             {v4.h}[4], [x12]
    add             x9, x2, #0x64
    ld1             {v5.h}[4], [x13]
    add             x15, x2, #0x4a
    ld1             {v6.h}[4], [x14]
    add             x19, x2, #0x3e
    ld1             {v7.h}[4], [x3]
    add             x20, x2, #0x6e
    ld1             {v4.h}[5], [x9]
    add             x12, x2, #0x56
    ld1             {v5.h}[5], [x15]
    add             x13, x2, #0x58
    ld1             {v6.h}[5], [x19]
    add             x14, x2, #0x4e
    ld1             {v7.h}[5], [x20]
    add             x3, x2, #0x7c
    ld1             {v4.h}[6], [x12]
    add             x9, x2, #0x48
    ld1             {v5.h}[6], [x13]
    add             x15, x2, #0x66
    ld1             {v6.h}[6], [x14]
    add             x19, x2, #0x5c
    ld1             {v7.h}[6], [x3]
    add             x20, x2, #0x7e
    ld1             {v4.h}[7], [x9]
    ld1             {v5.h}[7], [x15]
    ld1             {v6.h}[7], [x19]
    ld1             {v7.h}[7], [x20]
.endif
    /* v24-v31 = sign masks; abs + eor yields the one's-complement form of
     * negative coefficients required by the JPEG bit encoding. */
    cmlt            v24.8h, v0.8h, #0
    cmlt            v25.8h, v1.8h, #0
    cmlt            v26.8h, v2.8h, #0
    cmlt            v27.8h, v3.8h, #0
    cmlt            v28.8h, v4.8h, #0
    cmlt            v29.8h, v5.8h, #0
    cmlt            v30.8h, v6.8h, #0
    cmlt            v31.8h, v7.8h, #0
    abs             v0.8h, v0.8h
    abs             v1.8h, v1.8h
    abs             v2.8h, v2.8h
    abs             v3.8h, v3.8h
    abs             v4.8h, v4.8h
    abs             v5.8h, v5.8h
    abs             v6.8h, v6.8h
    abs             v7.8h, v7.8h
    eor             v24.16b, v24.16b, v0.16b
    eor             v25.16b, v25.16b, v1.16b
    eor             v26.16b, v26.16b, v2.16b
    eor             v27.16b, v27.16b, v3.16b
    eor             v28.16b, v28.16b, v4.16b
    eor             v29.16b, v29.16b, v5.16b
    eor             v30.16b, v30.16b, v6.16b
    eor             v31.16b, v31.16b, v7.16b
    /* Build a 64-bit nonzero-coefficient bitmap; scalar work on the DC
     * Huffman code is interleaved with the vector reduction. */
    cmeq            v16.8h, v0.8h, #0
    cmeq            v17.8h, v1.8h, #0
    cmeq            v18.8h, v2.8h, #0
    cmeq            v19.8h, v3.8h, #0
    cmeq            v20.8h, v4.8h, #0
    cmeq            v21.8h, v5.8h, #0
    cmeq            v22.8h, v6.8h, #0
    xtn             v16.8b, v16.8h
    xtn             v18.8b, v18.8h
    xtn             v20.8b, v20.8h
    xtn             v22.8b, v22.8h
    umov            w14, v0.h[0]
    xtn2            v16.16b, v17.8h
    umov            w13, v24.h[0]
    xtn2            v18.16b, v19.8h
    clz             w14, w14
    xtn2            v20.16b, v21.8h
    lsl             w13, w13, w14
    cmeq            v17.8h, v7.8h, #0
    sub             w12, w14, #32           /* w12 = nbits of the DC diff */
    xtn2            v22.16b, v17.8h
    lsr             w13, w13, w14
    and             v16.16b, v16.16b, v23.16b
    neg             w12, w12
    and             v18.16b, v18.16b, v23.16b
    add             x3, x4, #0x400          /* r1 = dctbl->ehufsi */
    and             v20.16b, v20.16b, v23.16b
    add             x15, sp, #0x90          /* x15 = t2 */
    and             v22.16b, v22.16b, v23.16b
    ldr             w10, [x4, x12, lsl #2]
    addp            v16.16b, v16.16b, v18.16b
    ldrb            w11, [x3, x12]
    addp            v20.16b, v20.16b, v22.16b
    checkbuf47
    addp            v16.16b, v16.16b, v20.16b
    put_bits        x10, x11                /* DC Huffman code */
    addp            v16.16b, v16.16b, v18.16b
    checkbuf47
    umov            x9, v16.D[0]
    put_bits        x13, x12                /* DC diff bits */
    cnt             v17.8b, v16.8b
    mvn             x9, x9
    addv            B18, v17.8b
    add             x4, x5, #0x400          /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]
    lsr             x9, x9, #0x1            /* clear AC coeff */
    ldr             w13, [x5, #0x3c0]       /* x13 = actbl->ehufco[0xf0] */
    rbit            x9, x9                  /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]        /* x14 = actbl->ehufsi[0xf0] */
    cmp             w12, #(64-8)            /* mostly-zero block? take fast path 4f */
    add             x11, sp, #16
    b.lt            4f
    cbz             x9, 6f                  /* no nonzero AC coeffs at all */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:  /* per-nonzero-coefficient loop: x2 = run length of zeros */
    clz             x2, x9
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]
2:  /* emit ZRL (0xF0) codes while the zero run is >= 16 */
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    clz             w20, w20
    ldrh            w3, [x15, #2]!
    sub             w11, w20, #32           /* w11 = nbits of the coefficient */
    lsl             w3, w3, w20
    neg             w11, w11
    lsr             w3, w3, w20
    add             x2, x11, x2, lsl #4     /* (run << 4) | nbits = symbol */
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10                /* AC Huffman code */
    put_bits        x3, x11                 /* coefficient bits */
    cbnz            x9, 1b
    b               6f
4:  /* fast path: precompute nbits/masked bits for all lanes with Neon */
    movi            v21.8h, #0x0010
    clz             v0.8h, v0.8h
    clz             v1.8h, v1.8h
    clz             v2.8h, v2.8h
    clz             v3.8h, v3.8h
    clz             v4.8h, v4.8h
    clz             v5.8h, v5.8h
    clz             v6.8h, v6.8h
    clz             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    neg             v0.8h, v0.8h
    neg             v1.8h, v1.8h
    neg             v2.8h, v2.8h
    neg             v3.8h, v3.8h
    neg             v4.8h, v4.8h
    neg             v5.8h, v5.8h
    neg             v6.8h, v6.8h
    neg             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    add             v0.8h, v21.8h, v0.8h    /* nbits = 16 - clz */
    add             v1.8h, v21.8h, v1.8h
    add             v2.8h, v21.8h, v2.8h
    add             v3.8h, v21.8h, v3.8h
    add             v4.8h, v21.8h, v4.8h
    add             v5.8h, v21.8h, v5.8h
    add             v6.8h, v21.8h, v6.8h
    add             v7.8h, v21.8h, v7.8h
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w11, [x15, #-126]
2:
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    ldrh            w3, [x15, #2]!
    add             x2, x11, x2, lsl #4
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
6:  /* emit EOB unless the last nonzero coefficient was index 63 */
    add             x13, sp, #0x10e
    cmp             x15, x13
    b.hs            1f
    ldr             w12, [x5]
    ldrb            w14, [x4]
    checkbuf47
    put_bits        x12, x14
1:
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16
    add             x0, BUFFER, #0x1
    add             sp, sp, 256             /* 16 (ldp post-index) + 256 = 272 allocated */
    /* ret, not "br x30": return-predictor friendly and BTI-compatible */
    ret

.endm

generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    .unreq          BUFFER
    .unreq          PUT_BUFFER
    .unreq          PUT_BITS
    .unreq          PUT_BITSw

    .purgem         emit_byte
    .purgem         put_bits
    .purgem         checkbuf31
    .purgem         checkbuf47