jcphuff-sse2.asm (17152B)
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
; Copyright (C) 2023, Aliaksiej Kandracienka.
; Copyright (C) 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; All four macros gather 16-bit coefficients from BLOCK using the int
; indices stored at LUT (the natural-scan-order table) and deposit them,
; word by word, into X0 (coefficients 0-7) and X1 (coefficients 8-15).
; N0/N1 are cleared here so the callers can use them as sign-mask
; accumulators.  T0/T1 are scratch index registers.

; Load a full group of 16 coefficients.
%macro LOAD16 0
    pxor    N0, N0
    pxor    N1, N1

    mov     T0d, INT [LUT + 0*SIZEOF_INT]
    mov     T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    pinsrw  X1, word [BLOCK + T1 * 2], 0

    mov     T0d, INT [LUT + 1*SIZEOF_INT]
    mov     T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1
    pinsrw  X1, word [BLOCK + T1 * 2], 1

    mov     T0d, INT [LUT + 2*SIZEOF_INT]
    mov     T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2
    pinsrw  X1, word [BLOCK + T1 * 2], 2

    mov     T0d, INT [LUT + 3*SIZEOF_INT]
    mov     T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3
    pinsrw  X1, word [BLOCK + T1 * 2], 3

    mov     T0d, INT [LUT + 4*SIZEOF_INT]
    mov     T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4
    pinsrw  X1, word [BLOCK + T1 * 2], 4

    mov     T0d, INT [LUT + 5*SIZEOF_INT]
    mov     T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5
    pinsrw  X1, word [BLOCK + T1 * 2], 5

    mov     T0d, INT [LUT + 6*SIZEOF_INT]
    mov     T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6
    pinsrw  X1, word [BLOCK + T1 * 2], 6

    mov     T0d, INT [LUT + 7*SIZEOF_INT]
    mov     T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
    pinsrw  X1, word [BLOCK + T1 * 2], 7
%endmacro

; Load 9-15 coefficients: a full X0 plus 1 + (LENEND - 1) words of X1,
; where LENEND = Sl & 7.  X1 is zeroed first so unloaded lanes stay 0.
; Each cmp/jl pair bails out to %%.ELOAD15 as soon as LENEND is exhausted.
%macro LOAD15 0
    pxor    N0, N0
    pxor    N1, N1
    pxor    X1, X1

    mov     T0d, INT [LUT + 0*SIZEOF_INT]
    mov     T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    pinsrw  X1, word [BLOCK + T1 * 2], 0

    mov     T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1

    mov     T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2

    mov     T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3

    mov     T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4

    mov     T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5

    mov     T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6

    mov     T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7

    cmp     LENEND, 2
    jl      %%.ELOAD15
    mov     T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 1

    cmp     LENEND, 3
    jl      %%.ELOAD15
    mov     T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 2

    cmp     LENEND, 4
    jl      %%.ELOAD15
    mov     T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 3

    cmp     LENEND, 5
    jl      %%.ELOAD15
    mov     T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 4

    cmp     LENEND, 6
    jl      %%.ELOAD15
    mov     T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 5

    cmp     LENEND, 7
    jl      %%.ELOAD15
    mov     T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; Load exactly 8 coefficients into X0.
%macro LOAD8 0
    pxor    N0, N0

    mov     T0d, INT [LUT + 0*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0

    mov     T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1

    mov     T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2

    mov     T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3

    mov     T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4

    mov     T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5

    mov     T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6

    mov     T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
%endmacro

; Load 1-7 coefficients (LENEND = Sl & 7) into X0, zero-padding the rest.
%macro LOAD7 0
    pxor    N0, N0
    pxor    X0, X0

    mov     T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 0

    cmp     LENEND, 2
    jl      %%.ELOAD7
    mov     T1d, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 1

    cmp     LENEND, 3
    jl      %%.ELOAD7
    mov     T1d, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 2

    cmp     LENEND, 4
    jl      %%.ELOAD7
    mov     T1d, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 3

    cmp     LENEND, 5
    jl      %%.ELOAD7
    mov     T1d, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 4

    cmp     LENEND, 6
    jl      %%.ELOAD7
    mov     T1d, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 5

    cmp     LENEND, 7
    jl      %%.ELOAD7
    mov     T1d, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; Build the 64-bit "zerobits" bitmap from the 64 words at VALUES and store
; its complement to [r15]:  bit k is set iff VALUES[k] != 0.
; pcmpeqw yields 0xFFFF per zero word; packsswb narrows the word masks to
; byte masks; pmovmskb collects one bit per byte (16 bits per pair of
; registers); the three shifted ORs splice the four 16-bit groups into one
; 64-bit word, and NOT flips "is zero" into "is nonzero".
; Clobbers xmm0-xmm7, rax, rcx, rdx, rsi.
%macro REDUCE0 0
    movdqa   xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa   xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa   xmm2, XMMWORD [VALUES + (16*2)]
    movdqa   xmm3, XMMWORD [VALUES + (24*2)]
    movdqa   xmm4, XMMWORD [VALUES + (32*2)]
    movdqa   xmm5, XMMWORD [VALUES + (40*2)]
    movdqa   xmm6, XMMWORD [VALUES + (48*2)]
    movdqa   xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw  xmm0, ZERO
    pcmpeqw  xmm1, ZERO
    pcmpeqw  xmm2, ZERO
    pcmpeqw  xmm3, ZERO
    pcmpeqw  xmm4, ZERO
    pcmpeqw  xmm5, ZERO
    pcmpeqw  xmm6, ZERO
    pcmpeqw  xmm7, ZERO

    packsswb xmm0, xmm1
    packsswb xmm2, xmm3
    packsswb xmm4, xmm5
    packsswb xmm6, xmm7

    pmovmskb eax, xmm0
    pmovmskb ecx, xmm2
    pmovmskb edx, xmm4
    pmovmskb esi, xmm6

    shl      rcx, 16
    shl      rdx, 32
    shl      rsi, 48

    or       rax, rcx
    or       rdx, rsi
    or       rax, rdx

    not      rax

    mov      MMWORD [r15], rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
;
; For each of the Sl coefficients (taken in natural-scan order via the LUT),
; this computes abs(coef) >> Al into values[k] and, for negative
; coefficients, the complemented form into values[k + DCTSIZE2] (see
; jcphuff.c for the scalar equivalent).  The remainder of values[] is
; zero-padded to DCTSIZE2 entries, and the nonzero-coefficient bitmap is
; written to *zerobits via REDUCE0.

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align   32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    ENDBR64
    push    rbp
    mov     rbp, rsp
    and     rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    sub     rsp, SIZEOF_XMMWORD
    movdqa  XMMWORD [rsp], ZERO          ; xmm9 is callee-saved on Win64; preserve it
    COLLECT_ARGS 6

    movd    AL, r13d                     ; AL = successive-approximation shift Al
    pxor    ZERO, ZERO
    mov     K, LEN
    mov     LENEND, LEN
    and     K, -16                       ; K = number of full 16-coef groups * 16
    and     LENEND, 7                    ; LENEND = Sl & 7 (tail within a group of 8)
    shr     K, 4
    jz      .ELOOP16
; Main loop: process Sl/16 full groups of 16 coefficients.
.BLOOP16:
    LOAD16
    pcmpgtw N0, X0                       ; N = 0xFFFF where coef < 0, else 0
    pcmpgtw N1, X1
    paddw   X0, N0                       ; (x + mask) ^ mask == abs(x)
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL                       ; abs(coef) >> Al
    psrlw   X1, AL
    pxor    N0, X0                       ; N = shifted value XOR sign mask
    pxor    N1, X1                       ;   (complement form for negatives)
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add     VALUES, 16*2
    add     LUT, 16*SIZEOF_INT
    dec     K
    jnz     .BLOOP16
    test    LEN, 15                      ; no partial group left?
    je      .PADDING
; Tail dispatch: 1-15 coefficients remain.
.ELOOP16:
    test    LEN, 8                       ; fewer than 8 left -> LOAD7 path
    jz      .TRY7
    test    LEN, 7                       ; exactly 8 left -> LOAD8 path
    jz      .TRY8

    ; 9-15 coefficients remain.
    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    pxor    N0, X0
    pxor    N1, X1
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add     VALUES, 16*2
    jmp     .PADDING
.TRY8:
    LOAD8
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    pxor    N0, X0
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add     VALUES, 8*2
    jmp     .PADDING
.TRY7:
    LOAD7
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    pxor    N0, X0
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add     VALUES, 8*2
; Zero-pad values[] out to DCTSIZE2 words.
; K = ceil(Sl/8) - DCTSIZE2/8, a non-positive count of 8-word chunks still
; needed; the loop increments it back up to zero.
.PADDING:
    mov     K, LEN
    add     K, 7
    and     K, -8
    shr     K, 3
    sub     K, DCTSIZE2/8
    jz      .EPADDING
    align   16
.ZEROLOOP:
    movdqa  XMMWORD [VALUES + 0], ZERO
    add     VALUES, 8*2
    inc     K
    jnz     .ZEROLOOP
.EPADDING:
    sub     VALUES, DCTSIZE2*2           ; rewind to the start of values[]

    REDUCE0                              ; *zerobits = nonzero-coefficient bitmap

    UNCOLLECT_ARGS 6
    movdqa  ZERO, XMMWORD [rsp]          ; restore saved xmm9
    mov     rsp, rbp
    pop     rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
;
; Computes abs(coef) >> Al into values[k] for the Sl coefficients, and
; accumulates two 64-bit bitmaps for the refinement pass:
;   bits[0] ("zerobits"): bit k set iff values[k] != 0 (written by REDUCE0)
;   bits[1] ("signbits"): complemented sign bits, one per coefficient
; Returns (in eax) the EOB position: the index just past the last
; coefficient whose shifted absolute value equals 1.

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align   32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    ENDBR64
    push    rbp
    mov     rbp, rsp
    and     rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    sub     rsp, SIZEOF_XMMWORD
    movdqa  XMMWORD [rsp], ZERO          ; xmm9 is callee-saved on Win64; preserve it
    COLLECT_ARGS 6

    xor     SIGN, SIGN                   ; sign-bit accumulator
    xor     EOB, EOB                     ; end-of-block position (returned)
    xor     KK, KK                       ; KK = running coefficient index k
    movd    AL, r13d                     ; AL = successive-approximation shift Al
    pxor    ZERO, ZERO
    pcmpeqw ONE, ONE                     ; ONE = 0x0001 in every word
    psrlw   ONE, 15
    mov     K, LEN
    mov     LENEND, LEN
    and     K, -16                       ; K = number of full 16-coef groups * 16
    and     LENEND, 7                    ; LENEND = Sl & 7 (tail within a group of 8)
    shr     K, 4
    jz      .ELOOPR16
; Main loop: process Sl/16 full groups of 16 coefficients.
.BLOOPR16:
    LOAD16
    pcmpgtw N0, X0                       ; N = 0xFFFF where coef < 0, else 0
    pcmpgtw N1, X1
    paddw   X0, N0                       ; (x + mask) ^ mask == abs(x)
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL                       ; abs(coef) >> Al
    psrlw   X1, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE                      ; mark words whose shifted value == 1
    pcmpeqw X1, ONE
    packsswb N0, N1                      ; word masks -> one byte per coefficient
    packsswb X0, X1
    pmovmskb T0d, N0                     ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                     ; idx = _mm_movemask_epi8(x1);
    shr     SIGN, 16                     ; make room for sizebits
    shl     T0, 48
    or      SIGN, T0                     ; splice this group's 16 sign bits into the top
    bsr     T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER16                 ; if (idx) {  (bsr sets ZF when the source is 0)
    mov     EOB, KK
    add     EOB, T1d                     ; EOB = k + idx;
.CONTINUER16:
    add     VALUES, 16*2
    add     LUT, 16*SIZEOF_INT
    add     KK, 16
    dec     K
    jnz     .BLOOPR16
    test    LEN, 15                      ; no partial group left?
    je      .PADDINGR
; Tail dispatch: 1-15 coefficients remain.
.ELOOPR16:
    test    LEN, 8                       ; fewer than 8 left -> LOAD7 path
    jz      .TRYR7
    test    LEN, 7                       ; exactly 8 left -> LOAD8 path
    jz      .TRYR8

    ; 9-15 coefficients remain.
    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0d, N0                     ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                     ; idx = _mm_movemask_epi8(x1);
    shr     SIGN, 16                     ; make room for sizebits
    shl     T0, 48
    or      SIGN, T0
    bsr     T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER15                 ; if (idx) {
    mov     EOB, KK
    add     EOB, T1d                     ; EOB = k + idx;
.CONTINUER15:
    add     VALUES, 16*2
    jmp     .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO                    ; only 8 coefficients: pad with zero bytes
    packsswb X0, ZERO
    pmovmskb T0d, N0                     ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                     ; idx = _mm_movemask_epi8(x1);
    shr     SIGN, 8                      ; make room for sizebits
    shl     T0, 56
    or      SIGN, T0
    bsr     T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER8                  ; if (idx) {
    mov     EOB, KK
    add     EOB, T1d                     ; EOB = k + idx;
.CONTINUER8:
    add     VALUES, 8*2
    jmp     .PADDINGR
.TRYR7:
    LOAD7

    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0d, N0                     ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                     ; idx = _mm_movemask_epi8(x1);
    shr     SIGN, 8                      ; make room for sizebits
    shl     T0, 56
    or      SIGN, T0
    bsr     T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz      .CONTINUER7                  ; if (idx) {
    mov     EOB, KK
    add     EOB, T1d                     ; EOB = k + idx;
.CONTINUER7:
    add     VALUES, 8*2
; Zero-pad values[] out to DCTSIZE2 words, shifting SIGN down in step so the
; sign bits land in their final positions.
; K = ceil(Sl/8) - DCTSIZE2/8, a non-positive count of 8-word chunks still
; needed; the loop increments it back up to zero.
.PADDINGR:
    mov     K, LEN
    add     K, 7
    and     K, -8
    shr     K, 3
    sub     K, DCTSIZE2/8
    jz      .EPADDINGR
    align   16
.ZEROLOOPR:
    movdqa  XMMWORD [VALUES + 0], ZERO
    shr     SIGN, 8
    add     VALUES, 8*2
    inc     K
    jnz     .ZEROLOOPR
.EPADDINGR:
    not     SIGN                         ; store complemented sign bits
    sub     VALUES, DCTSIZE2*2           ; rewind to the start of values[]
    mov     MMWORD [r15+SIZEOF_MMWORD], SIGN  ; bits[1] = signbits

    REDUCE0                              ; bits[0] = nonzero-coefficient bitmap

    mov     eax, EOB                     ; return EOB
    UNCOLLECT_ARGS 6
    movdqa  ZERO, XMMWORD [rsp]          ; restore saved xmm9
    mov     rsp, rbp
    pop     rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align   32