tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jccolext-mmx.asm (16997B)


      1 ;
      2 ; jccolext-mmx.asm - colorspace conversion (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jcolsamp.inc"
     14 
     15 ; --------------------------------------------------------------------------
     16 ;
     17 ; Convert num_rows rows of RGB samples into Y/Cb/Cr rows of output_buf.
     18 ;
     19 ; GLOBAL(void)
     20 ; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
     21 ;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
     22 ;                           int num_rows);
     23 ;
     24 
     25 %define img_width(b)   (b) + 8          ; JDIMENSION img_width  (b = frame base set up in the prologue)
     26 %define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
     27 %define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
     28 %define output_row(b)  (b) + 20         ; JDIMENSION output_row
     29 %define num_rows(b)    (b) + 24         ; int num_rows
     30 
     31 %define original_ebp   ebp + 0          ; slot holding the pre-alignment frame base (popped into esp on exit)
     32 %define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
     33                                        ; mmword wk[WK_NUM], located directly below the aligned ebp
     34 %define WK_NUM         8                ; number of mmword scratch slots; wk(0)..wk(7) are used
     35 %define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr: saved GOT address, just below wk(0)
     36 
     37    align       32                      ; align the function entry to 32 bytes
     38    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
     39 
     40 EXTN(jsimd_rgb_ycc_convert_mmx):
     41    push        ebp                     ; save the caller's ebp
     42    mov         eax, esp                    ; eax = "original ebp": [eax]=saved ebp, arguments at eax+8..
     43    sub         esp, byte 4             ; leave room for the original_ebp slot
     44    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
     45    mov         [esp], eax              ; [original_ebp] = unaligned frame base
     46    mov         ebp, esp                    ; ebp = aligned ebp
     47    lea         esp, [wk(0)]            ; reserve the wk[] scratch area below ebp
     48    PUSHPIC     eax                     ; make room for the GOT address (the gotptr slot)
     49    push        ebx                     ; ebx, esi and edi are restored at .return
     50 ;   push        ecx                     ; need not be preserved
     51 ;   push        edx                     ; need not be preserved
     52    push        esi
     53    push        edi
     54 
     55    GET_GOT     ebx                     ; get GOT address
     56    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
     57 
     58    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
     59    test        ecx, ecx
     60    jz          near .return            ; zero-width image: nothing to convert
     61 
     62    push        ecx                     ; keep num_cols; ecx is borrowed for output_row
     63 
     64    mov         esi, JSAMPIMAGE [output_buf(eax)]
     65    mov         ecx, JDIMENSION [output_row(eax)]
     66    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; edi = output_buf[0] (receives Y)
     67    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]  ; ebx = output_buf[1] (receives Cb)
     68    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]  ; edx = output_buf[2] (receives Cr)
     69    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
     70    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &output_buf[1][output_row]
     71    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &output_buf[2][output_row]
     72 
     73    pop         ecx                     ; ecx = num_cols again
     74 
     75    mov         esi, JSAMPARRAY [input_buf(eax)]
     76    mov         eax, INT [num_rows(eax)]  ; eax = row counter (argument base no longer needed)
     77    test        eax, eax
     78    jle         near .return            ; num_rows <= 0: nothing to convert
     79    ALIGNX      16, 7
     80 .rowloop:
     81    PUSHPIC     eax                     ; save row counter (eax is reloaded with the GOT address below)
     82    push        edx                     ; save the four row-pointer cursors; the registers
     83    push        ebx                     ;   are reused as per-row sample pointers
     84    push        edi
     85    push        esi
     86    push        ecx                     ; col
     87 
     88    mov         esi, JSAMPROW [esi]     ; inptr
     89    mov         edi, JSAMPROW [edi]     ; outptr0
     90    mov         ebx, JSAMPROW [ebx]     ; outptr1
     91    mov         edx, JSAMPROW [edx]     ; outptr2
     92    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
     93 
     94    cmp         ecx, byte SIZEOF_MMWORD
     95    jae         short .columnloop       ; a full group of columns is available
     96    ALIGNX      16, 7
     97 
     98 %if RGB_PIXELSIZE == 3  ; ---------------
     99 
    100 .column_ld1:
    101    push        eax                     ; eax and edx are used as scratch while
    102    push        edx                     ;   gathering the partial group
    103    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
    104    test        cl, SIZEOF_BYTE         ; SIZEOF_BYTE bit set in the byte count?
    105    jz          short .column_ld2
    106    sub         ecx, byte SIZEOF_BYTE
    107    xor         eax, eax
    108    mov         al, byte [esi+ecx]      ; eax = last input byte
    109 .column_ld2:
    110    test        cl, SIZEOF_WORD         ; SIZEOF_WORD bit set in the byte count?
    111    jz          short .column_ld4
    112    sub         ecx, byte SIZEOF_WORD
    113    xor         edx, edx
    114    mov         dx, word [esi+ecx]      ; edx = preceding input word
    115    shl         eax, WORD_BIT           ; make room below the bytes gathered so far
    116    or          eax, edx
    117 .column_ld4:
    118    movd        mmA, eax                ; mmA = bytes gathered so far
    119    pop         edx                     ; restore outptr2
    120    pop         eax                     ; restore GOT address
    121    test        cl, SIZEOF_DWORD        ; SIZEOF_DWORD bit set in the byte count?
    122    jz          short .column_ld8
    123    sub         ecx, byte SIZEOF_DWORD
    124    movd        mmG, dword [esi+ecx]
    125    psllq       mmA, DWORD_BIT          ; make room below the bytes gathered so far
    126    por         mmA, mmG
    127 .column_ld8:
    128    test        cl, SIZEOF_MMWORD       ; one full mmword precedes the gathered tail?
    129    jz          short .column_ld16
    130    movq        mmG, mmA                ; tail becomes the second mmword
    131    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    132    mov         ecx, SIZEOF_MMWORD      ; so the sub after the conversion leaves ecx = 0
    133    jmp         short .rgb_ycc_cnv
    134 .column_ld16:
    135    test        cl, 2*SIZEOF_MMWORD     ; two full mmwords precede the gathered tail?
    136    mov         ecx, SIZEOF_MMWORD      ; mov leaves the flags from test intact
    137    jz          short .rgb_ycc_cnv
    138    movq        mmF, mmA                ; tail becomes the third mmword
    139    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    140    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    141    jmp         short .rgb_ycc_cnv
    142    ALIGNX      16, 7
    143 
    144 .columnloop:
    145    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    146    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    147    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
    148 
    149 .rgb_ycc_cnv:
    150    ; mmA=(00 10 20 01 11 21 02 12)
    151    ; mmG=(22 03 13 23 04 14 24 05)
    152    ; mmF=(15 25 06 16 26 07 17 27)
    153 
    154    movq        mmD, mmA
    155    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
    156    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
    157 
    158    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
    159    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
    160 
    161    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
    162    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
    163 
    164    movq        mmE, mmA
    165    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
    166    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
    167 
    168    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
    169    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
    170 
    171    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
    172    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
    173 
    174    pxor        mmH, mmH                ; mmH = 0, used to zero-extend bytes to words
    175 
    176    movq        mmC, mmA
    177    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
    178    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
    179 
    180    movq        mmB, mmE
    181    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
    182    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
    183 
    184    movq        mmF, mmD
    185    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
    186    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
    187 
    188 %else  ; RGB_PIXELSIZE == 4 ; -----------
    189 
    190 .column_ld1:
    191    test        cl, SIZEOF_MMWORD/8     ; SIZEOF_MMWORD/8 bit set in the column count?
    192    jz          short .column_ld2
    193    sub         ecx, byte SIZEOF_MMWORD/8
    194    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]  ; mmA = one dword of trailing input
    195 .column_ld2:
    196    test        cl, SIZEOF_MMWORD/4     ; SIZEOF_MMWORD/4 bit set in the column count?
    197    jz          short .column_ld4
    198    sub         ecx, byte SIZEOF_MMWORD/4
    199    movq        mmF, mmA                ; earlier tail moves to the second mmword
    200    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
    201 .column_ld4:
    202    test        cl, SIZEOF_MMWORD/2     ; SIZEOF_MMWORD/2 bit set in the column count?
    203    mov         ecx, SIZEOF_MMWORD      ; mov keeps the flags; the later sub then leaves ecx = 0
    204    jz          short .rgb_ycc_cnv
    205    movq        mmD, mmA                ; earlier tail moves to the third and
    206    movq        mmC, mmF                ;   fourth mmwords
    207    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    208    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    209    jmp         short .rgb_ycc_cnv
    210    ALIGNX      16, 7
    211 
    212 .columnloop:
    213    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    214    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    215    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
    216    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
    217 
    218 .rgb_ycc_cnv:
    219    ; mmA=(00 10 20 30 01 11 21 31)
    220    ; mmF=(02 12 22 32 03 13 23 33)
    221    ; mmD=(04 14 24 34 05 15 25 35)
    222    ; mmC=(06 16 26 36 07 17 27 37)
    223 
    224    movq        mmB, mmA
    225    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
    226    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
    227 
    228    movq        mmG, mmD
    229    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
    230    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
    231 
    232    movq        mmE, mmA
    233    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
    234    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
    235 
    236    movq        mmH, mmB
    237    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
    238    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
    239 
    240    pxor        mmF, mmF                ; mmF = 0, used to zero-extend bytes to words
    241 
    242    movq        mmC, mmA
    243    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
    244    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
    245 
    246    movq        mmD, mmB
    247    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
    248    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
    249 
    250    movq        mmG, mmE
    251    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
    252    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
    253 
    254    punpcklbw   mmF, mmH                ; low 4 bytes of mmH land in the high byte of each word (mmF was 0)
    255    punpckhbw   mmH, mmH                ; high 4 bytes of mmH duplicated into both bytes of each word
    256    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
    257    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
    258 
    259 %endif  ; RGB_PIXELSIZE ; ---------------
    260 
    261    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
    262    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
    263 
    264    ; (Original)
    265    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    266    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    267    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    268    ;
    269    ; (This implementation)
    270    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    271    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    272    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    273 
    274    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
    275    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
    276    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
    277    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO
    278 
    279    movq        mm6, mm1
    280    punpcklwd   mm1, mm3                ; interleave RO with GO (low half)
    281    punpckhwd   mm6, mm3                ; interleave RO with GO (high half)
    282    movq        mm7, mm1
    283    movq        mm4, mm6
    284    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    285    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    286    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    287    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    288 
    289    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    290    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    291 
    292    pxor        mm1, mm1
    293    pxor        mm6, mm6
    294    punpcklwd   mm1, mm5                ; mm1=BOL
    295    punpckhwd   mm6, mm5                ; mm6=BOH
    296    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
    297    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)
    298 
    299    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]
    300 
    301    paddd       mm7, mm1
    302    paddd       mm4, mm6
    303    paddd       mm7, mm5
    304    paddd       mm4, mm5
    305    psrld       mm7, SCALEBITS          ; mm7=CbOL
    306    psrld       mm4, SCALEBITS          ; mm4=CbOH
    307    packssdw    mm7, mm4                ; mm7=CbO
    308 
    309    movq        mm1, MMWORD [wk(2)]     ; mm1=BE
    310 
    311    movq        mm6, mm0
    312    punpcklwd   mm0, mm2                ; interleave RE with GE (low half)
    313    punpckhwd   mm6, mm2                ; interleave RE with GE (high half)
    314    movq        mm5, mm0
    315    movq        mm4, mm6
    316    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
    317    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
    318    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    319    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    320 
    321    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    322    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    323 
    324    pxor        mm0, mm0
    325    pxor        mm6, mm6
    326    punpcklwd   mm0, mm1                ; mm0=BEL
    327    punpckhwd   mm6, mm1                ; mm6=BEH
    328    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
    329    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)
    330 
    331    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
    332 
    333    paddd       mm5, mm0
    334    paddd       mm4, mm6
    335    paddd       mm5, mm1
    336    paddd       mm4, mm1
    337    psrld       mm5, SCALEBITS          ; mm5=CbEL
    338    psrld       mm4, SCALEBITS          ; mm4=CbEH
    339    packssdw    mm5, mm4                ; mm5=CbE
    340 
    341    psllw       mm7, BYTE_BIT           ; move the odd-column results into the high byte of each word
    342    por         mm5, mm7                ; mm5=Cb
    343    movq        MMWORD [ebx], mm5       ; Save Cb
    344 
    345    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
    346    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
    347    movq        mm1, MMWORD [wk(1)]     ; mm1=RO
    348 
    349    movq        mm4, mm0
    350    punpcklwd   mm0, mm3                ; interleave BO with GO (low half)
    351    punpckhwd   mm4, mm3                ; interleave BO with GO (high half)
    352    movq        mm7, mm0
    353    movq        mm5, mm4
    354    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    355    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    356    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    357    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    358 
    359    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
    360 
    361    paddd       mm0, MMWORD [wk(4)]     ; add the R/G partial sums saved in wk(4)
    362    paddd       mm4, MMWORD [wk(5)]     ; add the R/G partial sums saved in wk(5)
    363    paddd       mm0, mm3
    364    paddd       mm4, mm3
    365    psrld       mm0, SCALEBITS          ; mm0=YOL
    366    psrld       mm4, SCALEBITS          ; mm4=YOH
    367    packssdw    mm0, mm4                ; mm0=YO
    368 
    369    pxor        mm3, mm3
    370    pxor        mm4, mm4
    371    punpcklwd   mm3, mm1                ; mm3=ROL
    372    punpckhwd   mm4, mm1                ; mm4=ROH
    373    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
    374    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)
    375 
    376    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
    377 
    378    paddd       mm7, mm3
    379    paddd       mm5, mm4
    380    paddd       mm7, mm1
    381    paddd       mm5, mm1
    382    psrld       mm7, SCALEBITS          ; mm7=CrOL
    383    psrld       mm5, SCALEBITS          ; mm5=CrOH
    384    packssdw    mm7, mm5                ; mm7=CrO
    385 
    386    movq        mm3, MMWORD [wk(0)]     ; mm3=RE
    387 
    388    movq        mm4, mm6
    389    punpcklwd   mm6, mm2                ; interleave BE with GE (low half)
    390    punpckhwd   mm4, mm2                ; interleave BE with GE (high half)
    391    movq        mm1, mm6
    392    movq        mm5, mm4
    393    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    394    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    395    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    396    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    397 
    398    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
    399 
    400    paddd       mm6, MMWORD [wk(6)]     ; add the R/G partial sums saved in wk(6)
    401    paddd       mm4, MMWORD [wk(7)]     ; add the R/G partial sums saved in wk(7)
    402    paddd       mm6, mm2
    403    paddd       mm4, mm2
    404    psrld       mm6, SCALEBITS          ; mm6=YEL
    405    psrld       mm4, SCALEBITS          ; mm4=YEH
    406    packssdw    mm6, mm4                ; mm6=YE
    407 
    408    psllw       mm0, BYTE_BIT           ; move the odd-column results into the high byte of each word
    409    por         mm6, mm0                ; mm6=Y
    410    movq        MMWORD [edi], mm6       ; Save Y
    411 
    412    pxor        mm2, mm2
    413    pxor        mm4, mm4
    414    punpcklwd   mm2, mm3                ; mm2=REL
    415    punpckhwd   mm4, mm3                ; mm4=REH
    416    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
    417    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)
    418 
    419    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]
    420 
    421    paddd       mm1, mm2
    422    paddd       mm5, mm4
    423    paddd       mm1, mm0
    424    paddd       mm5, mm0
    425    psrld       mm1, SCALEBITS          ; mm1=CrEL
    426    psrld       mm5, SCALEBITS          ; mm5=CrEH
    427    packssdw    mm1, mm5                ; mm1=CrE
    428 
    429    psllw       mm7, BYTE_BIT           ; move the odd-column results into the high byte of each word
    430    por         mm1, mm7                ; mm1=Cr
    431    movq        MMWORD [edx], mm1       ; Save Cr
    432 
    433    sub         ecx, byte SIZEOF_MMWORD ; one group of SIZEOF_MMWORD columns done
    434    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
    435    add         edi, byte SIZEOF_MMWORD                ; outptr0
    436    add         ebx, byte SIZEOF_MMWORD                ; outptr1
    437    add         edx, byte SIZEOF_MMWORD                ; outptr2
    438    cmp         ecx, byte SIZEOF_MMWORD
    439    jae         near .columnloop        ; another full group remains
    440    test        ecx, ecx
    441    jnz         near .column_ld1        ; a partial group remains
    442 
    443    pop         ecx                     ; col
    444    pop         esi                     ; restore the row-pointer cursors saved at .rowloop
    445    pop         edi
    446    pop         ebx
    447    pop         edx
    448    POPPIC      eax                     ; restore row counter
    449 
    450    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
    451    add         edi, byte SIZEOF_JSAMPROW  ; next Y row pointer
    452    add         ebx, byte SIZEOF_JSAMPROW  ; next Cb row pointer
    453    add         edx, byte SIZEOF_JSAMPROW  ; next Cr row pointer
    454    dec         eax                        ; num_rows
    455    jg          near .rowloop
    456 
    457    emms                                ; empty MMX state
    458 
    459 .return:
    460    pop         edi
    461    pop         esi
    462 ;   pop         edx                     ; need not be preserved
    463 ;   pop         ecx                     ; need not be preserved
    464    pop         ebx
    465    mov         esp, ebp                ; esp <- aligned ebp
    466    pop         esp                     ; esp <- original ebp
    467    pop         ebp                     ; restore the caller's ebp
    468    ret
    469 
    470 ; For some reason, the OS X linker does not honor the request to align the
    471 ; segment unless we do this.
    472    align       32