tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctred-mmx.asm (27106B)


      1 ;
      2 ; jidctred.asm - reduced-size IDCT (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 ;
     13 ; This file contains inverse-DCT routines that produce reduced-size
     14 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
     15 ; The following code is based directly on the IJG's original jidctred.c;
     16 ; see the jidctred.c for more details.
     17 
     18 %include "jsimdext.inc"
     19 %include "jdct.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 
     23 %define CONST_BITS    13        ; fractional bits of the F_x_xxx constants
     24 %define PASS1_BITS    2         ; extra fractional bits kept in pass 1 output
     25
     26 %define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)      ; 4x4 pass 1 shift
     27 %define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)  ; 4x4 pass 2 shift
     28 %define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)      ; 2x2 pass 1 shift
     29 %define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)  ; 2x2 pass 2 shift
     30
     31 %if CONST_BITS == 13
     32 F_0_211 equ  1730  ; FIX(0.211164243)
     33 F_0_509 equ  4176  ; FIX(0.509795579)
     34 F_0_601 equ  4926  ; FIX(0.601344887)
     35 F_0_720 equ  5906  ; FIX(0.720959822)
     36 F_0_765 equ  6270  ; FIX(0.765366865)
     37 F_0_850 equ  6967  ; FIX(0.850430095)
     38 F_0_899 equ  7373  ; FIX(0.899976223)
     39 F_1_061 equ  8697  ; FIX(1.061594337)
     40 F_1_272 equ 10426  ; FIX(1.272758580)
     41 F_1_451 equ 11893  ; FIX(1.451774981)
     42 F_1_847 equ 15137  ; FIX(1.847759065)
     43 F_2_172 equ 17799  ; FIX(2.172734803)
     44 F_2_562 equ 20995  ; FIX(2.562915447)
     45 F_3_624 equ 29692  ; FIX(3.624509785)
     46 %else
     47 ; NASM cannot do compile-time floating-point arithmetic, so each constant is given pre-scaled by 2^30 and rounded down to CONST_BITS fractional bits.
     48 %define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))  ; x / 2^n, rounded to nearest
     49 F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
     50 F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
     51 F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
     52 F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
     53 F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
     54 F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
     55 F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
     56 F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
     57 F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
     58 F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
     59 F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
     60 F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
     61 F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
     62 F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
     63 %endif
     64 
     65 ; --------------------------------------------------------------------------
     66    SECTION     SEG_CONST
     67
     68    ALIGNZ      32
     69    GLOBAL_DATA(jconst_idct_red_mmx)
     70
     71 EXTN(jconst_idct_red_mmx):
     72
     73 PW_F184_MF076   times 2 dw  F_1_847, -F_0_765  ; 4x4 even part: (in2, in6) pairs
     74 PW_F256_F089    times 2 dw  F_2_562,  F_0_899  ; 4x4 odd part, tmp2: (in1, in3) pairs
     75 PW_F106_MF217   times 2 dw  F_1_061, -F_2_172  ; 4x4 odd part, tmp0: (in1, in3) pairs
     76 PW_MF060_MF050  times 2 dw -F_0_601, -F_0_509  ; 4x4 odd part, tmp2: (in5, in7) pairs
     77 PW_F145_MF021   times 2 dw  F_1_451, -F_0_211  ; 4x4 odd part, tmp0: (in5, in7) pairs
     78 PW_F362_MF127   times 2 dw  F_3_624, -F_1_272  ; 2x2 odd part: (in1, in3) pairs
     79 PW_F085_MF072   times 2 dw  F_0_850, -F_0_720  ; 2x2 odd part: (in5, in7) pairs
     80 PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4 - 1)  ; rounding bias added before psrad DESCALE_P1_4
     81 PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4 - 1)  ; rounding bias added before psrad DESCALE_P2_4
     82 PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)  ; rounding bias added before psrad DESCALE_P1_2
     83 PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)  ; rounding bias added before psrad DESCALE_P2_2
     84 PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE  ; added (paddb) to the packed signed output bytes
     85
     86    ALIGNZ      32
     87 
     88 ; --------------------------------------------------------------------------
     89    SECTION     SEG_TEXT
     90    BITS        32
     91 ;
     92 ; Perform dequantization and inverse DCT on one block of coefficients,
     93 ; producing a reduced-size 4x4 output block.
     94 ;
     95 ; GLOBAL(void)
     96 ; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
     97 ;                    JSAMPARRAY output_buf, JDIMENSION output_col)
     98 ; Coefficient row 4 is never read; emms is executed before returning.
     99
    100 %define dct_table(b)   (b) + 8          ; void *dct_table
    101 %define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
    102 %define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
    103 %define output_col(b)  (b) + 20         ; JDIMENSION output_col
    104
    105 %define original_ebp   ebp + 0          ; slot holding the pre-alignment frame base
    106 %define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
    107                                        ; mmword wk[WK_NUM]
    108 %define WK_NUM         2
    109 %define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
    110                                        ; JCOEF workspace[DCTSIZE2]
    111
    112    align       32
    113    GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
    114
    115 EXTN(jsimd_idct_4x4_mmx):
    116    push        ebp
    117    mov         eax, esp                    ; eax = frame base: address of the saved ebp, args at eax+8..
    118    sub         esp, byte 4                 ; room for the frame-base slot
    119    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    120    mov         [esp], eax                  ; [original_ebp] = frame base
    121    mov         ebp, esp                    ; ebp = aligned ebp
    122    lea         esp, [workspace]            ; reserve wk[] and workspace[] below ebp
    123    PUSHPIC     ebx
    124 ;   push        ecx                     ; need not be preserved
    125 ;   push        edx                     ; need not be preserved
    126    push        esi
    127    push        edi
    128
    129    GET_GOT     ebx                     ; get GOT address
    130
    131    ; ---- Pass 1: process columns from input, store into work array.
    132
    133 ;   mov         eax, [original_ebp]     ; (not needed: eax is still the frame base, assuming PUSHPIC/GET_GOT preserve eax)
    134    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    135    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    136    lea         edi, [workspace]                 ; JCOEF *wsptr
    137    mov         ecx, DCTSIZE/4                   ; ctr (4 columns per iteration)
    138    ALIGNX      16, 7
    139 .columnloop:
    140 %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
    141    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    142    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    143    jnz         short .columnDCT        ; cheap two-dword pre-check before the full MMX test
    144
    145    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    146    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    147    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    148    por         mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    149    por         mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    150    por         mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    151    por         mm0, mm1
    152    packsswb    mm0, mm0                ; fold the 4 words into the low dword; saturation keeps nonzero words nonzero
    153    movd        eax, mm0
    154    test        eax, eax
    155    jnz         short .columnDCT        ; some AC term in rows 1,2,3,5,6,7 is nonzero
    156
    157    ; -- AC terms all zero (row 4 is not tested because it is never used)
    158
    159    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    160    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    161
    162    psllw       mm0, PASS1_BITS         ; scale the dequantized DC terms by 2^PASS1_BITS
    163
    164    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
    165    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
    166    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
    167
    168    movq        mm1, mm0
    169    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
    170    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
    171    movq        mm3, mm2
    172    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
    173    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
    174
    175    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    176    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
    177    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
    178    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
    179    jmp         near .nextcolumn
    180    ALIGNX      16, 7
    181 %endif
    182 .columnDCT:
    183
    184    ; -- Odd part
    185
    186    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    187    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    188    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    189    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    190    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    191    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    192    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    193    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    194
    195    movq        mm4, mm0
    196    movq        mm5, mm0
    197    punpcklwd   mm4, mm1
    198    punpckhwd   mm5, mm1
    199    movq        mm0, mm4
    200    movq        mm1, mm5
    201    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L), in1/in3 part
    202    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H), in1/in3 part
    203    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L), in1/in3 part
    204    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H), in1/in3 part
    205
    206    movq        mm6, mm2
    207    movq        mm7, mm2
    208    punpcklwd   mm6, mm3
    209    punpckhwd   mm7, mm3
    210    movq        mm2, mm6
    211    movq        mm3, mm7
    212    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L), in5/in7 part
    213    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H), in5/in7 part
    214    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L), in5/in7 part
    215    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H), in5/in7 part
    216
    217    paddd       mm6, mm4                ; mm6=tmp2L
    218    paddd       mm7, mm5                ; mm7=tmp2H
    219    paddd       mm2, mm0                ; mm2=tmp0L
    220    paddd       mm3, mm1                ; mm3=tmp0H
    221
    222    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
    223    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
    224
    225    ; -- Even part
    226
    227    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    228    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    229    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    230    pmullw      mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    231    pmullw      mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    232    pmullw      mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    233
    234    pxor        mm1, mm1
    235    pxor        mm2, mm2
    236    punpcklwd   mm1, mm4                ; mm1=tmp0L
    237    punpckhwd   mm2, mm4                ; mm2=tmp0H
    238    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
    239    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
    240
    241    movq        mm3, mm5                ; mm5=in2=z2
    242    punpcklwd   mm5, mm0                ; mm0=in6=z3
    243    punpckhwd   mm3, mm0
    244    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
    245    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
    246
    247    movq        mm4, mm1
    248    movq        mm0, mm2
    249    paddd       mm1, mm5                ; mm1=tmp10L
    250    paddd       mm2, mm3                ; mm2=tmp10H
    251    psubd       mm4, mm5                ; mm4=tmp12L
    252    psubd       mm0, mm3                ; mm0=tmp12H
    253
    254    ; -- Final output stage
    255
    256    movq        mm5, mm1
    257    movq        mm3, mm2
    258    paddd       mm1, mm6                ; mm1=data0L
    259    paddd       mm2, mm7                ; mm2=data0H
    260    psubd       mm5, mm6                ; mm5=data3L
    261    psubd       mm3, mm7                ; mm3=data3H
    262
    263    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm6=[PD_DESCALE_P1_4]
    264
    265    paddd       mm1, mm6
    266    paddd       mm2, mm6
    267    psrad       mm1, DESCALE_P1_4
    268    psrad       mm2, DESCALE_P1_4
    269    paddd       mm5, mm6
    270    paddd       mm3, mm6
    271    psrad       mm5, DESCALE_P1_4
    272    psrad       mm3, DESCALE_P1_4
    273
    274    packssdw    mm1, mm2                ; mm1=data0=(00 01 02 03)
    275    packssdw    mm5, mm3                ; mm5=data3=(30 31 32 33)
    276
    277    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
    278    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
    279
    280    movq        mm2, mm4
    281    movq        mm3, mm0
    282    paddd       mm4, mm7                ; mm4=data1L
    283    paddd       mm0, mm6                ; mm0=data1H
    284    psubd       mm2, mm7                ; mm2=data2L
    285    psubd       mm3, mm6                ; mm3=data2H
    286
    287    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm7=[PD_DESCALE_P1_4]
    288
    289    paddd       mm4, mm7
    290    paddd       mm0, mm7
    291    psrad       mm4, DESCALE_P1_4
    292    psrad       mm0, DESCALE_P1_4
    293    paddd       mm2, mm7
    294    paddd       mm3, mm7
    295    psrad       mm2, DESCALE_P1_4
    296    psrad       mm3, DESCALE_P1_4
    297
    298    packssdw    mm4, mm0                ; mm4=data1=(10 11 12 13)
    299    packssdw    mm2, mm3                ; mm2=data2=(20 21 22 23)
    300
    301    movq        mm6, mm1                ; transpose coefficients(phase 1)
    302    punpcklwd   mm1, mm4                ; mm1=(00 10 01 11)
    303    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
    304    movq        mm7, mm2                ; transpose coefficients(phase 1)
    305    punpcklwd   mm2, mm5                ; mm2=(20 30 21 31)
    306    punpckhwd   mm7, mm5                ; mm7=(22 32 23 33)
    307
    308    movq        mm0, mm1                ; transpose coefficients(phase 2)
    309    punpckldq   mm1, mm2                ; mm1=(00 10 20 30)
    310    punpckhdq   mm0, mm2                ; mm0=(01 11 21 31)
    311    movq        mm3, mm6                ; transpose coefficients(phase 2)
    312    punpckldq   mm6, mm7                ; mm6=(02 12 22 32)
    313    punpckhdq   mm3, mm7                ; mm3=(03 13 23 33)
    314
    315    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
    316    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
    317    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
    318    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
    319
    320 .nextcolumn:
    321    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
    322    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
    323    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
    324    dec         ecx                                 ; ctr
    325    jnz         near .columnloop
    326
    327    ; ---- Pass 2: process rows from work array, store into output array.
    328
    329    mov         eax, [original_ebp]                ; reload the frame base (eax was clobbered in pass 1)
    330    lea         esi, [workspace]                   ; JCOEF *wsptr
    331    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    332    mov         eax, JDIMENSION [output_col(eax)]
    333
    334    ; -- Odd part
    335
    336    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    337    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    338    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    339    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    340
    341    movq        mm4, mm0
    342    movq        mm5, mm0
    343    punpcklwd   mm4, mm1
    344    punpckhwd   mm5, mm1
    345    movq        mm0, mm4
    346    movq        mm1, mm5
    347    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L), in1/in3 part
    348    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H), in1/in3 part
    349    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L), in1/in3 part
    350    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H), in1/in3 part
    351
    352    movq        mm6, mm2
    353    movq        mm7, mm2
    354    punpcklwd   mm6, mm3
    355    punpckhwd   mm7, mm3
    356    movq        mm2, mm6
    357    movq        mm3, mm7
    358    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L), in5/in7 part
    359    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H), in5/in7 part
    360    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L), in5/in7 part
    361    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H), in5/in7 part
    362
    363    paddd       mm6, mm4                ; mm6=tmp2L
    364    paddd       mm7, mm5                ; mm7=tmp2H
    365    paddd       mm2, mm0                ; mm2=tmp0L
    366    paddd       mm3, mm1                ; mm3=tmp0H
    367
    368    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
    369    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
    370
    371    ; -- Even part
    372
    373    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    374    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    375    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    376
    377    pxor        mm1, mm1
    378    pxor        mm2, mm2
    379    punpcklwd   mm1, mm4                ; mm1=tmp0L
    380    punpckhwd   mm2, mm4                ; mm2=tmp0H
    381    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
    382    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
    383
    384    movq        mm3, mm5                ; mm5=in2=z2
    385    punpcklwd   mm5, mm0                ; mm0=in6=z3
    386    punpckhwd   mm3, mm0
    387    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
    388    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
    389
    390    movq        mm4, mm1
    391    movq        mm0, mm2
    392    paddd       mm1, mm5                ; mm1=tmp10L
    393    paddd       mm2, mm3                ; mm2=tmp10H
    394    psubd       mm4, mm5                ; mm4=tmp12L
    395    psubd       mm0, mm3                ; mm0=tmp12H
    396
    397    ; -- Final output stage
    398
    399    movq        mm5, mm1
    400    movq        mm3, mm2
    401    paddd       mm1, mm6                ; mm1=data0L
    402    paddd       mm2, mm7                ; mm2=data0H
    403    psubd       mm5, mm6                ; mm5=data3L
    404    psubd       mm3, mm7                ; mm3=data3H
    405
    406    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm6=[PD_DESCALE_P2_4]
    407
    408    paddd       mm1, mm6
    409    paddd       mm2, mm6
    410    psrad       mm1, DESCALE_P2_4
    411    psrad       mm2, DESCALE_P2_4
    412    paddd       mm5, mm6
    413    paddd       mm3, mm6
    414    psrad       mm5, DESCALE_P2_4
    415    psrad       mm3, DESCALE_P2_4
    416
    417    packssdw    mm1, mm2                ; mm1=data0=(00 10 20 30)
    418    packssdw    mm5, mm3                ; mm5=data3=(03 13 23 33)
    419
    420    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
    421    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
    422
    423    movq        mm2, mm4
    424    movq        mm3, mm0
    425    paddd       mm4, mm7                ; mm4=data1L
    426    paddd       mm0, mm6                ; mm0=data1H
    427    psubd       mm2, mm7                ; mm2=data2L
    428    psubd       mm3, mm6                ; mm3=data2H
    429
    430    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm7=[PD_DESCALE_P2_4]
    431
    432    paddd       mm4, mm7
    433    paddd       mm0, mm7
    434    psrad       mm4, DESCALE_P2_4
    435    psrad       mm0, DESCALE_P2_4
    436    paddd       mm2, mm7
    437    paddd       mm3, mm7
    438    psrad       mm2, DESCALE_P2_4
    439    psrad       mm3, DESCALE_P2_4
    440
    441    packssdw    mm4, mm0                ; mm4=data1=(01 11 21 31)
    442    packssdw    mm2, mm3                ; mm2=data2=(02 12 22 32)
    443
    444    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
    445
    446    packsswb    mm1, mm2                ; mm1=(00 10 20 30 02 12 22 32)
    447    packsswb    mm4, mm5                ; mm4=(01 11 21 31 03 13 23 33)
    448    paddb       mm1, mm6
    449    paddb       mm4, mm6
    450
    451    movq        mm7, mm1                ; transpose coefficients(phase 1)
    452    punpcklbw   mm1, mm4                ; mm1=(00 01 10 11 20 21 30 31)
    453    punpckhbw   mm7, mm4                ; mm7=(02 03 12 13 22 23 32 33)
    454
    455    movq        mm0, mm1                ; transpose coefficients(phase 2)
    456    punpcklwd   mm1, mm7                ; mm1=(00 01 02 03 10 11 12 13)
    457    punpckhwd   mm0, mm7                ; mm0=(20 21 22 23 30 31 32 33)
    458
    459    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; output row 0
    460    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; output row 2
    461    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1    ; row 0 <- (00 01 02 03)
    462    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0    ; row 2 <- (20 21 22 23)
    463
    464    psrlq       mm1, 4*BYTE_BIT         ; mm1 low dword=(10 11 12 13)
    465    psrlq       mm0, 4*BYTE_BIT         ; mm0 low dword=(30 31 32 33)
    466
    467    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; output row 1
    468    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; output row 3
    469    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1    ; row 1 <- (10 11 12 13)
    470    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0    ; row 3 <- (30 31 32 33)
    471
    472    emms                                ; empty MMX state
    473
    474    pop         edi
    475    pop         esi
    476 ;   pop         edx                     ; need not be preserved
    477 ;   pop         ecx                     ; need not be preserved
    478    POPPIC      ebx
    479    mov         esp, ebp                ; esp <- aligned ebp
    480    pop         esp                     ; esp <- frame base stored at [original_ebp]
    481    pop         ebp
    482    ret
    483 
    484 ; --------------------------------------------------------------------------
    485 ;
    486 ; Perform dequantization and inverse DCT on one block of coefficients,
    487 ; producing a reduced-size 2x2 output block.
    488 ;
    489 ; GLOBAL(void)
    490 ; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
    491 ;                    JSAMPARRAY output_buf, JDIMENSION output_col)
    492 ; Only rows and columns 0, 1, 3, 5 and 7 contribute (see the diagram below).
    493
    494 %define dct_table(b)   (b) + 8          ; void *dct_table
    495 %define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
    496 %define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
    497 %define output_col(b)  (b) + 20         ; JDIMENSION output_col
    498
    499    align       32
    500    GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
    501
    502 EXTN(jsimd_idct_2x2_mmx):
    503    push        ebp
    504    mov         ebp, esp
    505    push        ebx                     ; restored before ret; holds the GOT base, later reused as scratch
    506 ;   push        ecx                     ; need not be preserved
    507 ;   push        edx                     ; need not be preserved
    508    push        esi
    509    push        edi
    510
    511    GET_GOT     ebx                     ; get GOT address
    512
    513    ; ---- Pass 1: process columns from input.
    514
    515    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
    516    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
    517
    518    ; | input:                  | result:        |
    519    ; | 00 01 ** 03 ** 05 ** 07 |                |
    520    ; | 10 11 ** 13 ** 15 ** 17 |                |
    521    ; | ** ** ** ** ** ** ** ** |                |
    522    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
    523    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
    524    ; | 50 51 ** 53 ** 55 ** 57 |                |
    525    ; | ** ** ** ** ** ** ** ** |                |
    526    ; | 70 71 ** 73 ** 75 ** 77 |                |
    527
    528    ; -- Odd part
    529
    530    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    531    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    532    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    533    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    534    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    535    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    536    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    537    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    538
    539    ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
    540    ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
    541
    542    pcmpeqd     mm7, mm7
    543    pslld       mm7, WORD_BIT           ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
    544
    545    movq        mm4, mm0                ; mm4=(10 11 ** 13)
    546    movq        mm5, mm2                ; mm5=(50 51 ** 53)
    547    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
    548    punpcklwd   mm5, mm3                ; mm5=(50 70 51 71)
    549    pmaddwd     mm4, [GOTOFF(ebx,PW_F362_MF127)]
    550    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
    551
    552    psrld       mm0, WORD_BIT           ; mm0=(11 -- 13 --)
    553    pand        mm1, mm7                ; mm1=(-- 31 -- 33)
    554    psrld       mm2, WORD_BIT           ; mm2=(51 -- 53 --)
    555    pand        mm3, mm7                ; mm3=(-- 71 -- 73)
    556    por         mm0, mm1                ; mm0=(11 31 13 33)
    557    por         mm2, mm3                ; mm2=(51 71 53 73)
    558    pmaddwd     mm0, [GOTOFF(ebx,PW_F362_MF127)]
    559    pmaddwd     mm2, [GOTOFF(ebx,PW_F085_MF072)]
    560
    561    paddd       mm4, mm5                ; mm4=tmp0[col0 col1]
    562
    563    movq        mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
    564    movq        mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
    565    pmullw      mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
    566    pmullw      mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
    567    movq        mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
    568    movq        mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
    569    pmullw      mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
    570    pmullw      mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
    571
    572    ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
    573    ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
    574
    575    psrld       mm6, WORD_BIT           ; mm6=(15 -- 17 --)
    576    pand        mm1, mm7                ; mm1=(-- 35 -- 37)
    577    psrld       mm3, WORD_BIT           ; mm3=(55 -- 57 --)
    578    pand        mm5, mm7                ; mm5=(-- 75 -- 77)
    579    por         mm6, mm1                ; mm6=(15 35 17 37)
    580    por         mm3, mm5                ; mm3=(55 75 57 77)
    581    pmaddwd     mm6, [GOTOFF(ebx,PW_F362_MF127)]
    582    pmaddwd     mm3, [GOTOFF(ebx,PW_F085_MF072)]
    583
    584    paddd       mm0, mm2                ; mm0=tmp0[col1 col3]
    585    paddd       mm6, mm3                ; mm6=tmp0[col5 col7]
    586
    587    ; -- Even part
    588
    589    movq        mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    590    movq        mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
    591    pmullw      mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    592    pmullw      mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
    593
    594    ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
    595
    596    movq        mm2, mm1                      ; mm2=(00 01 ** 03)
    597    pslld       mm1, WORD_BIT                 ; mm1=(-- 00 -- **)
    598    psrad       mm1, (WORD_BIT-CONST_BITS-2)  ; mm1=tmp10[col0 ****]
    599
    600    pand        mm2, mm7                      ; mm2=(-- 01 -- 03)
    601    pand        mm5, mm7                      ; mm5=(-- 05 -- 07)
    602    psrad       mm2, (WORD_BIT-CONST_BITS-2)  ; mm2=tmp10[col1 col3]
    603    psrad       mm5, (WORD_BIT-CONST_BITS-2)  ; mm5=tmp10[col5 col7]
    604
    605    ; -- Final output stage
    606
    607    movq        mm3, mm1
    608    paddd       mm1, mm4                ; mm1=data0[col0 ****]=(A0 **)
    609    psubd       mm3, mm4                ; mm3=data1[col0 ****]=(B0 **)
    610    punpckldq   mm1, mm3                ; mm1=(A0 B0)
    611
    612    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; mm7=[PD_DESCALE_P1_2] (the word mask in mm7 is no longer needed)
    613
    614    movq        mm4, mm2
    615    movq        mm3, mm5
    616    paddd       mm2, mm0                ; mm2=data0[col1 col3]=(A1 A3)
    617    paddd       mm5, mm6                ; mm5=data0[col5 col7]=(A5 A7)
    618    psubd       mm4, mm0                ; mm4=data1[col1 col3]=(B1 B3)
    619    psubd       mm3, mm6                ; mm3=data1[col5 col7]=(B5 B7)
    620
    621    paddd       mm1, mm7
    622    psrad       mm1, DESCALE_P1_2
    623
    624    paddd       mm2, mm7
    625    paddd       mm5, mm7
    626    psrad       mm2, DESCALE_P1_2
    627    psrad       mm5, DESCALE_P1_2
    628    paddd       mm4, mm7
    629    paddd       mm3, mm7
    630    psrad       mm4, DESCALE_P1_2
    631    psrad       mm3, DESCALE_P1_2
    632
    633    ; ---- Pass 2: process rows, store into output array.
    634
    635    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
    636    mov         eax, JDIMENSION [output_col(ebp)]
    637
    638    ; | input:| result:|
    639    ; | A0 B0 |        |
    640    ; | A1 B1 | C0 C1  |
    641    ; | A3 B3 | D0 D1  |
    642    ; | A5 B5 |        |
    643    ; | A7 B7 |        |
    644
    645    ; -- Odd part
    646
    647    packssdw    mm2, mm4                ; mm2=(A1 A3 B1 B3)
    648    packssdw    mm5, mm3                ; mm5=(A5 A7 B5 B7)
    649    pmaddwd     mm2, [GOTOFF(ebx,PW_F362_MF127)]
    650    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
    651
    652    paddd       mm2, mm5                ; mm2=tmp0[row0 row1]
    653
    654    ; -- Even part
    655
    656    pslld       mm1, (CONST_BITS+2)     ; mm1=tmp10[row0 row1]
    657
    658    ; -- Final output stage
    659
    660    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)]  ; mm0=[PD_DESCALE_P2_2]
    661
    662    movq        mm6, mm1
    663    paddd       mm1, mm2                ; mm1=data0[row0 row1]=(C0 C1)
    664    psubd       mm6, mm2                ; mm6=data1[row0 row1]=(D0 D1)
    665
    666    paddd       mm1, mm0
    667    paddd       mm6, mm0
    668    psrad       mm1, DESCALE_P2_2
    669    psrad       mm6, DESCALE_P2_2
    670
    671    movq        mm7, mm1                ; transpose coefficients
    672    punpckldq   mm1, mm6                ; mm1=(C0 D0)
    673    punpckhdq   mm7, mm6                ; mm7=(C1 D1)
    674
    675    packssdw    mm1, mm7                ; mm1=(C0 D0 C1 D1)
    676    packsswb    mm1, mm1                ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
    677    paddb       mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; last use of ebx as the GOT base
    678
    679    movd        ecx, mm1
    680    movd        ebx, mm1                ; ebx=(C0 D0 C1 D1); ebx is plain scratch from here on
    681    shr         ecx, 2*BYTE_BIT         ; ecx=(C1 D1 -- --)
    682
    683    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; output row 0
    684    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; output row 1
    685    mov         word [edx+eax*SIZEOF_JSAMPLE], bx      ; row 0 <- (C0 D0)
    686    mov         word [esi+eax*SIZEOF_JSAMPLE], cx      ; row 1 <- (C1 D1)
    687
    688    emms                                ; empty MMX state
    689
    690    pop         edi
    691    pop         esi
    692 ;   pop         edx                     ; need not be preserved
    693 ;   pop         ecx                     ; need not be preserved
    694    pop         ebx
    695    pop         ebp
    696    ret
    697
    698 ; For some reason, the OS X linker does not honor the request to align the
    699 ; segment unless we do this.
    700    align       32