jcphuff-sse2.asm (18380B)
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; All of the macros below are written in terms of the symbolic names (X0, X1,
; N0, N1, T0, T1, LUT, BLOCK, LENEND, VALUES, ZERO, ZEROBITS) that each
; function %defines before expanding them.

; LOAD16: gather 16 coefficients.
;   X0 word i = BLOCK[LUT[i]], X1 word i = BLOCK[LUT[8 + i]]  (i = 0..7),
;   where BLOCK is indexed in 16-bit words and LUT in INTs.
;   N0 and N1 are zeroed.  X0/X1 are not cleared first because all eight
;   lanes of each are overwritten.  Clobbers T0 and T1.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    mov         T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    mov         T1, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

; LOAD15: gather 9 to 15 coefficients.
;   X0 receives all eight words BLOCK[LUT[0..7]].  X1 is cleared, word 0 is
;   always loaded from BLOCK[LUT[8]], and word j (j = 1..6) is loaded from
;   BLOCK[LUT[8 + j]] only while LENEND > j; the first failing comparison
;   skips the rest.  Word 7 of X1 is never loaded and stays zero.
;   N0 and N1 are zeroed.  Clobbers T0, T1 and the flags.
;   Both callers set LENEND to (length & 7) and only reach this macro when
;   that value is nonzero.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1                  ; unloaded tail lanes must read as 0

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    mov         T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; LOAD8: gather exactly 8 coefficients.
;   X0 word i = BLOCK[LUT[i]] (i = 0..7).  N0 is zeroed.  Clobbers T0.
%macro LOAD8 0
    pxor        N0, N0

    mov         T0, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT +  7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

; LOAD7: gather up to 7 coefficients.
;   X0 is cleared, word 0 is always loaded from BLOCK[LUT[0]], and word j
;   (j = 1..6) is loaded from BLOCK[LUT[j]] only while LENEND > j.  Word 7 is
;   never loaded and stays zero.  N0 is zeroed.  Clobbers T1 and the flags.
;   NOTE(review): word 0 is loaded unconditionally, so LUT[0] is read even
;   when LENEND is 0 -- presumably callers never pass a zero length; confirm.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0                  ; unloaded tail lanes must read as 0

    mov         T1, INT [LUT +  0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1, INT [LUT +  6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; REDUCE0: build the "nonzero" bitmap for the 64 words at VALUES.
;   Each of the eight 8-word vectors at VALUES is compared with zero, the
;   results are packed to one byte per word and converted to bit masks, and
;   the inverted masks (bit set = word is nonzero) are stored as two INTs at
;   the pointer held in ZEROBITS: words 0..31 in the first, 32..63 in the
;   second.
;   Requirements: VALUES must be 16-byte aligned (movdqa / aligned memory
;   operand), and ZERO must be xmm7 and hold zero on entry, because the last
;   vector is compared in place against xmm7.
;   Clobbers xmm0-xmm7, eax, ecx, edx, esi, edi and the flags, so it must be
;   the last user of BLOCK/VALUES/ZERO in a function.
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]  ; xmm7 is ZERO: compare in place

    packsswb    xmm0, xmm1              ; 0xFFFF/0x0000 words -> 0xFF/0x00 bytes
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0               ; bits  0..15: word == 0
    pmovmskb    ecx, xmm2               ; bits 16..31 (after shift)
    pmovmskb    edx, xmm4               ; bits 32..47
    pmovmskb    esi, xmm6               ; bits 48..63 (after shift)

    shl         ecx, 16
    shl         esi, 16

    or          eax, ecx
    or          edx, esi

    not         eax                     ; invert: bit set = word != 0
    not         edx

    mov         edi, ZEROBITS           ; VALUES (edi) is dead from here on

    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits
;
; For each of the Sl coefficients x = block[jpeg_natural_order_start[k]]:
;   values[k]            = abs(x) >> Al
;   values[k + DCTSIZE2] = (abs(x) >> Al) ^ (x < 0 ? -1 : 0)
; The remaining words of values[0 .. DCTSIZE2-1] are set to zero, and REDUCE0
; then writes the nonzero bitmap of values[0 .. DCTSIZE2-1] to zerobits[0..1].
;
; values is accessed with movdqa and must therefore be 16-byte aligned.
; ebx, ecx, esi, edi and ebp are saved and restored; eax, edx, xmm0-xmm4 and
; xmm7 are clobbered.
; NOTE(review): the zero-fill loop counts up from a negative vector count to
; zero, so it presumably relies on Sl <= DCTSIZE2 -- confirm against callers.

%define ZERO    xmm7                    ; all-zero vector (REDUCE0 relies on xmm7)
%define X0      xmm0                    ; coefficients / absolute values, low 8
%define X1      xmm1                    ; coefficients / absolute values, high 8
%define N0      xmm2                    ; sign masks, then second output, low 8
%define N1      xmm3                    ; sign masks, then second output, high 8
%define AL      xmm4                    ; shift count Al
%define K       eax                     ; loop counter (shares eax with LENEND)
%define LENEND  eax                     ; tail length, LEN & 7
%define LUT     ebx                     ; jpeg_natural_order_start cursor
%define T0      ecx                     ; scratch
%define T1      edx                     ; scratch
%define BLOCK   esi                     ; block
%define VALUES  edi                     ; values cursor
%define LEN     ebp                     ; Sl

%define ZEROBITS  INT [esp + 5 * 4]     ; local slot above the 5 pushed regs

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = esp after the push; args at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; remember the unaligned stack pointer
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 4                       ; room for ZEROBITS
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp                     ; LEN lives in ebp, so save the aligned ebp

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]
    pxor        ZERO, ZERO
    mov         K, LEN                  ; eax is no longer needed as the arg base
    and         K, -16
    shr         K, 4                    ; K = number of full 16-coefficient groups
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = (0 > x) ? 0xFFFF : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N = abs(x)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; abs(x) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N ^= shifted magnitude
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING                ; no partial group left
.ELOOP16:
    mov         LENEND, LEN
    and         LENEND, 7

    test        LEN, 8
    jz          .TRY7                   ; fewer than 8 left
    test        LEN, 7
    jz          .TRY8                   ; exactly 8 left

    LOAD15                              ; 9..15 left
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3                    ; vectors already written
    sub         K, DCTSIZE2/8           ; K = -(vectors still to clear)
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to the start of values[]

    REDUCE0

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- stack pointer saved in the prologue
    pop         ebp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LENEND
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef ZEROBITS

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *absvalues
; eax + 28 = size_t *bits
;
; For each of the Sl coefficients x = block[jpeg_natural_order_start[k]]:
;   absvalues[k] = abs(x) >> Al
; The remaining words of absvalues[0 .. DCTSIZE2-1] are set to zero.
; Outputs through bits (addressed as an array of INTs):
;   bits[0..1]  nonzero bitmap of absvalues[] (written by REDUCE0)
;   bits[2..3]  initialised to -1, then filled 16 bits at a time with the
;               inverted "x < 0" masks (bit set = coefficient not negative)
; Return value (eax): k of the last coefficient whose shifted magnitude
; equals 1, or 0 if there is none.
;
; absvalues is accessed with movdqa and must therefore be 16-byte aligned.
; ebx, ecx, esi, edi and ebp are saved and restored; edx, xmm0-xmm5 and xmm7
; are clobbered.
; NOTE(review): as in the "first" variant, the zero-fill loop presumably
; relies on Sl <= DCTSIZE2 -- confirm against callers.

%define ZERO    xmm7                    ; all-zero vector (REDUCE0 relies on xmm7)
%define ONE     xmm5                    ; eight words of 1
%define X0      xmm0                    ; coefficients / absolute values, low 8
%define X1      xmm1                    ; coefficients / absolute values, high 8
%define N0      xmm2                    ; sign masks, low 8
%define N1      xmm3                    ; sign masks, high 8
%define AL      xmm4                    ; shift count Al
%define K       eax                     ; loop counter (shares eax with LENEND)
%define LENEND  eax                     ; tail length
%define LUT     ebx                     ; jpeg_natural_order_start cursor
%define T0      ecx                     ; scratch
%define T0w     cx                      ; low 16 bits of T0
%define T1      edx                     ; scratch
%define BLOCK   esi                     ; block
%define VALUES  edi                     ; absvalues cursor
%define KK      ebp                     ; byte offset into the sign words (k / 8)

%define ZEROBITS  INT [esp + 5 * 4]     ; locals above the 5 pushed regs
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = esp after the push; args at eax + 8
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; remember the unaligned stack pointer
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 16                      ; room for ZEROBITS, EOB and LEN
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp                     ; KK lives in ebp, so save the aligned ebp

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; 0xFFFF >> 15 = 1 in every word
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]
    mov         T0, INT [eax + 28]
    mov         K, INT [eax + 16]       ; last use of eax as the arg base
    mov         INT [T0 + 2 * SIZEOF_INT], -1
    mov         INT [T0 + 3 * SIZEOF_INT], -1
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK
    shr         K, 4                    ; K = number of full 16-coefficient groups
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N = (0 > x) ? 0xFFFF : 0
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; (x + N) ^ N = abs(x)
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; abs(x) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; mark magnitudes equal to 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2                   ; one 16-bit sign word per group
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR               ; no partial group left
.ELOOPR16:
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7                  ; fewer than 8 left
    test        LENEND, 7
    jz          .TRYR8                  ; exactly 8 left

    and         LENEND, 7
    LOAD15                              ; 9..15 left
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; upper 8 mask bytes come from ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    and         LENEND, 7
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; upper 8 mask bytes come from ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3                    ; vectors already written
    sub         K, DCTSIZE2/8           ; K = -(vectors still to clear)
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to the start of absvalues[]

    REDUCE0

    mov         eax, EOB                ; return value

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- stack pointer saved in the prologue
    pop         ebp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef LUT
%undef T0
%undef T0w
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
%undef ZEROBITS

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32