tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdsample-mmx.asm (25380B)


      1 ;
      2 ; jdsample-mmx.asm - upsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jsimdext.inc"
     14 
     15 ; --------------------------------------------------------------------------
     16    SECTION     SEG_CONST
     17 ; Word constants used by the two "fancy" (triangle-filter) upsamplers in this file.
     18    ALIGNZ      32
     19    GLOBAL_DATA(jconst_fancy_upsample_mmx)
     20 
     21 EXTN(jconst_fancy_upsample_mmx):
     22 ; Each entry is four 16-bit words, i.e. one 64-bit memory operand for pmullw/paddw.
     23 PW_ONE   times 4 dw 1             ; h2v1 fancy: rounding bias for even output samples (before >> 2)
     24 PW_TWO   times 4 dw 2             ; h2v1 fancy: rounding bias for odd output samples (before >> 2)
     25 PW_THREE times 4 dw 3             ; weight applied to the nearer sample/row (pmullw multiplier)
     26 PW_SEVEN times 4 dw 7             ; h2v2 fancy: rounding bias for odd output samples (before >> 4)
     27 PW_EIGHT times 4 dw 8             ; h2v2 fancy: rounding bias for even output samples (before >> 4)
     28 
     29    ALIGNZ      32                  ; ALIGNZ comes from jsimdext.inc; presumably pads the table to a 32-byte boundary -- confirm
     30 
     31 ; --------------------------------------------------------------------------
     32    SECTION     SEG_TEXT
     33    BITS        32
     34 ;
     35 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     36 ;
     37 ; The upsampling algorithm is linear interpolation between pixel centers,
     38 ; also known as a "triangle filter".  This is a good compromise between
     39 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     40 ; of the way between input pixel centers.
     41 ; For input samples s[i]: out[2i] = (3*s[i] + s[i-1] + 1) >> 2 and out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2; at either end of a row the edge sample stands in for the missing neighbor.
     42 ; GLOBAL(void)
     43 ; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
     44 ;                               JDIMENSION downsampled_width,
     45 ;                               JSAMPARRAY input_data,
     46 ;                               JSAMPARRAY *output_data_ptr);
     47 ; Rows are handled in blocks of SIZEOF_MMWORD input samples (2*SIZEOF_MMWORD output samples), so whole blocks are read and written; when the width is not a multiple of SIZEOF_MMWORD one dummy sample is stored just past the end of each input row.
     48 
     49 %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
     50 %define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
     51 %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
     52 %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
     53 ; Register roles: eax=colctr, ecx=rowctr, esi=input pointer, edi=output pointer, ebx=GOT address, dl=scratch, mm0=zero, mm7=left-neighbor carry, mm6=right-neighbor sample.
     54    align       32
     55    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
     56 
     57 EXTN(jsimd_h2v1_fancy_upsample_mmx):
     58    push        ebp
     59    mov         ebp, esp                ; stack arguments are now at ebp+8 .. ebp+20
     60    PUSHPIC     ebx                     ; PIC helper macro from jsimdext.inc; presumably saves ebx only in PIC builds -- confirm
     61 ;   push        ecx                     ; need not be preserved
     62 ;   push        edx                     ; need not be preserved
     63    push        esi
     64    push        edi
     65 
     66    GET_GOT     ebx                     ; get GOT address
     67 
     68    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     69    test        eax, eax
     70    jz          near .return            ; zero width: nothing to do
     71 
     72    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
     73    test        ecx, ecx
     74    jz          near .return            ; zero rows: nothing to do
     75 
     76    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
     77    mov         edi, POINTER [output_data_ptr(ebp)]
     78    mov         edi, JSAMPARRAY [edi]                ; output_data
     79    ALIGNX      16, 7
     80 .rowloop:
     81    push        eax                     ; colctr
     82    push        edi                     ; output_data (position in the row-pointer array)
     83    push        esi                     ; input_data (position in the row-pointer array)
     84 
     85    mov         esi, JSAMPROW [esi]     ; inptr
     86    mov         edi, JSAMPROW [edi]     ; outptr
     87 
     88    test        eax, SIZEOF_MMWORD-1
     89    jz          short .skip             ; width is a whole number of blocks
     90    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample of the row
     91    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
     92 .skip:
     93    pxor        mm0, mm0                ; mm0=(all 0's)
     94    pcmpeqb     mm7, mm7                ; mm7=(all 1's)
     95    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=mask selecting the lowest byte
     96    pand        mm7,  MMWORD [esi+0*SIZEOF_MMWORD]  ; mm7=( 0 - - - - - - -): sample 0 stands in for sample -1
     97 
     98    add         eax, byte SIZEOF_MMWORD-1
     99    and         eax, byte -SIZEOF_MMWORD  ; round colctr up to a multiple of SIZEOF_MMWORD
    100    cmp         eax, byte SIZEOF_MMWORD
    101    ja          short .columnloop       ; more than one block in the row
    102    ALIGNX      16, 7
    103 ; -- last block of a row: no block follows, so its own last sample is replicated
    104 .columnloop_last:
    105    pcmpeqb     mm6, mm6                ; mm6=(all 1's)
    106    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm6=mask selecting the highest byte
    107    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm6=( - - - - - - - 7): sample 7 stands in for sample 8
    108    jmp         short .upsample
    109    ALIGNX      16, 7
    110 ; -- any other block: the right neighbor is the first sample of the next block
    111 .columnloop:
    112    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]  ; load the next block
    113    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm6=( - - - - - - - 8)
    114 ; -- interpolate one block; mm7 holds sample -1 in its lowest byte, mm6 holds sample 8 in its highest byte
    115 .upsample:
    116    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    117    movq        mm2, mm1
    118    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
    119    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
    120    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
    121 
    122    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
    123    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)
    124 ; carry this block's last sample forward as the next block's sample -1
    125    movq        mm7, mm1
    126    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
    127 ; zero-extend the bytes to 16-bit words (mm0 is all zeros)
    128    movq        mm4, mm1
    129    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
    130    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
    131    movq        mm5, mm2
    132    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
    133    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
    134    movq        mm6, mm3
    135    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
    136    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
    137 
    138    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]  ; mm1=3*( 0 1 2 3)
    139    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]  ; mm4=3*( 4 5 6 7)
    140    paddw       mm2, [GOTOFF(ebx,PW_ONE)]    ; left neighbors + rounding bias 1
    141    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
    142    paddw       mm3, [GOTOFF(ebx,PW_TWO)]    ; right neighbors + rounding bias 2
    143    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
    144 
    145    paddw       mm2, mm1                ; 3*s[i] + s[i-1] + 1
    146    paddw       mm5, mm4
    147    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
    148    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
    149    paddw       mm3, mm1                ; 3*s[i] + s[i+1] + 2
    150    paddw       mm6, mm4
    151    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
    152    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)
    153 
    154    psllw       mm3, BYTE_BIT           ; move the odd samples into the high byte of each word
    155    psllw       mm6, BYTE_BIT
    156    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
    157    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)
    158 
    159    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    160    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5
    161 
    162    sub         eax, byte SIZEOF_MMWORD    ; one input block consumed
    163    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
    164    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
    165    cmp         eax, byte SIZEOF_MMWORD
    166    ja          near .columnloop        ; more than one block left
    167    test        eax, eax
    168    jnz         near .columnloop_last   ; exactly one block left
    169 
    170    pop         esi                     ; input_data
    171    pop         edi                     ; output_data
    172    pop         eax                     ; colctr
    173 
    174    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    175    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    176    dec         ecx                        ; rowctr
    177    jg          near .rowloop
    178 
    179    emms                                ; empty MMX state
    180 
    181 .return:
    182    pop         edi
    183    pop         esi
    184 ;   pop         edx                     ; need not be preserved
    185 ;   pop         ecx                     ; need not be preserved
    186    POPPIC      ebx                     ; counterpart of the PUSHPIC in the prologue
    187    pop         ebp
    188    ret
    189 
    190 ; --------------------------------------------------------------------------
    191 ;
    192 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    193 ; Again a triangle filter; see comments for h2v1 case, above.
    194 ; Vertical pass: Int0[i] = 3*row[0][i] + row[-1][i], Int1[i] = 3*row[0][i] + row[+1][i] (16-bit words parked in the output rows).  Horizontal pass: out[2i] = (3*Int[i] + Int[i-1] + 8) >> 4, out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4.
    195 ; GLOBAL(void)
    196 ; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
    197 ;                               JDIMENSION downsampled_width,
    198 ;                               JSAMPARRAY input_data,
    199 ;                               JSAMPARRAY *output_data_ptr);
    200 ; Each input row yields two output rows; the row pointers at input_data[-1] and input_data[+1] are read as well.  When the width is not a multiple of SIZEOF_MMWORD, one dummy sample is stored just past the end of all three input rows.
    201 
    202 %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
    203 %define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
    204 %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
    205 %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
    206 ; Local frame, addressed from the aligned ebp that the prologue sets up:
    207 %define original_ebp  ebp + 0    ; slot holding the pre-alignment frame address
    208 %define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
    209 %define WK_NUM        4          ; wk(0)/wk(1)=left-neighbor carry, wk(2)/wk(3)=right-neighbor word (upper/lower output row)
    210 %define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
    211 ; Register roles inside .rowloop: eax=colctr, ecx=inptr1(above), ebx=inptr0 (GOT address between PUSHPIC/POPPIC), esi=inptr1(below), edx=outptr0, edi=outptr1.
    212    align       32
    213    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
    214 
    215 EXTN(jsimd_h2v2_fancy_upsample_mmx):
    216    push        ebp
    217    mov         eax, esp                    ; eax = original ebp (the value a plain "mov ebp, esp" frame would hold)
    218    sub         esp, byte 4                 ; room for the original_ebp slot
    219    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    220    mov         [esp], eax                  ; store it in the original_ebp slot
    221    mov         ebp, esp                    ; ebp = aligned ebp
    222    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below the aligned ebp
    223    PUSHPIC     eax                     ; make room for the GOT address (the gotptr slot)
    224    push        ebx
    225 ;   push        ecx                     ; need not be preserved
    226 ;   push        edx                     ; need not be preserved
    227    push        esi
    228    push        edi
    229 
    230    GET_GOT     ebx                     ; get GOT address
    231    MOVPIC      POINTER [gotptr], ebx   ; save GOT address
    232 
    233    mov         edx, eax                ; edx = original ebp
    234    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    235    test        eax, eax
    236    jz          near .return            ; zero width: nothing to do
    237 
    238    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    239    test        ecx, ecx
    240    jz          near .return            ; zero rows: nothing to do
    241 
    242    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
    243    mov         edi, POINTER [output_data_ptr(edx)]
    244    mov         edi, JSAMPARRAY [edi]                ; output_data
    245    ALIGNX      16, 7
    246 .rowloop:
    247    push        eax                     ; colctr
    248    push        ecx                     ; rowctr (ecx is reused as a row pointer below)
    249    push        edi                     ; output_data
    250    push        esi                     ; input_data
    251 
    252    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    253    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    254    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    255    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    256    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    257 
    258    test        eax, SIZEOF_MMWORD-1
    259    jz          short .skip             ; width is a whole number of blocks
    260    push        edx                     ; dl is used as scratch; keep outptr0
    261    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]  ; last real sample, row above
    262    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    263    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]  ; last real sample, current row
    264    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    265    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]  ; last real sample, row below
    266    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    267    pop         edx                     ; outptr0
    268 .skip:
    269    ; -- process the first column block
    270 
    271    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
    272    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
    273    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
    274 
    275    PUSHPIC     ebx                     ; ebx (inptr0) is about to be replaced by the GOT address
    276    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
    277 
    278    pxor        mm3, mm3                ; mm3=(all 0's)
    279    movq        mm4, mm0
    280    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
    281    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
    282    movq        mm5, mm1
    283    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
    284    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
    285    movq        mm6, mm2
    286    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
    287    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)
    288 
    289    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]  ; mm0=3*row[ 0][0]( 0 1 2 3)
    290    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]  ; mm4=3*row[ 0][0]( 4 5 6 7)
    291 
    292    pcmpeqb     mm7, mm7                ; mm7=(all 1's)
    293    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm7=mask selecting the lowest word
    294 
    295    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    296    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    297    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    298    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
    299 
    300    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
    301    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
    302    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2  ; (likewise for the lower
    303    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6  ; output row)
    304 
    305    pand        mm1, mm7                ; mm1=( 0 - - -)
    306    pand        mm2, mm7                ; mm2=( 0 - - -)
    307 
    308    movq        MMWORD [wk(0)], mm1     ; word 0 stands in for word -1 (upper row)
    309    movq        MMWORD [wk(1)], mm2     ; word 0 stands in for word -1 (lower row)
    310 
    311    POPPIC      ebx                     ; ebx = inptr0 again
    312 
    313    add         eax, byte SIZEOF_MMWORD-1
    314    and         eax, byte -SIZEOF_MMWORD  ; round colctr up to a multiple of SIZEOF_MMWORD
    315    cmp         eax, byte SIZEOF_MMWORD
    316    ja          short .columnloop       ; more than one block in the row
    317    ALIGNX      16, 7
    318 
    319 .columnloop_last:
    320    ; -- process the last column block
    321 
    322    PUSHPIC     ebx                     ; ebx holds the GOT address until the POPPIC after .upsample
    323    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
    324 
    325    pcmpeqb     mm1, mm1                ; mm1=(all 1's)
    326    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=mask selecting the highest word
    327    movq        mm2, mm1
    328 
    329    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
    330    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)
    331 
    332    movq        MMWORD [wk(2)], mm1     ; word 7 stands in for word 8 (upper row)
    333    movq        MMWORD [wk(3)], mm2     ; word 7 stands in for word 8 (lower row)
    334 
    335    jmp         short .upsample
    336    ALIGNX      16, 7
    337 
    338 .columnloop:
    339    ; -- process the next column block
    340 
    341    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
    342    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
    343    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
    344 
    345    PUSHPIC     ebx                     ; ebx holds the GOT address until the POPPIC after .upsample
    346    MOVPIC      ebx, POINTER [gotptr]   ; load GOT address
    347 
    348    pxor        mm3, mm3                ; mm3=(all 0's)
    349    movq        mm4, mm0
    350    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
    351    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
    352    movq        mm5, mm1
    353    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
    354    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
    355    movq        mm6, mm2
    356    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
    357    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)
    358 
    359    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]  ; mm0=3*row[ 0][1]( 0 1 2 3)
    360    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]  ; mm4=3*row[ 0][1]( 4 5 6 7)
    361 
    362    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    363    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    364    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    365    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
    366 
    367    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
    368    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
    369    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2  ; (next block, stored after
    370    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6  ; the current block's slot)
    371 
    372    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
    373    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
    374 
    375    movq        MMWORD [wk(2)], mm1     ; next block's word 0 = current block's word 8 (upper row)
    376    movq        MMWORD [wk(3)], mm2     ; next block's word 0 = current block's word 8 (lower row)
    377 
    378 .upsample:
    379    ; -- process the upper row
    380 
    381    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
    382    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)
    383 
    384    movq        mm0, mm7
    385    movq        mm4, mm3
    386    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
    387    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
    388    movq        mm5, mm7
    389    movq        mm6, mm3
    390    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
    391    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)
    392 
    393    por         mm0, mm4                         ; mm0=( 1 2 3 4)
    394    por         mm5, mm6                         ; mm5=( 3 4 5 6)
    395 
    396    movq        mm1, mm7
    397    movq        mm2, mm3
    398    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    399    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
    400    movq        mm4, mm3
    401    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
    402 
    403    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
    404    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)
    405 
    406    movq        MMWORD [wk(0)], mm4              ; carry word 7 forward as the next block's word -1
    407 
    408    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]      ; mm7=3*Int0L
    409    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]      ; mm3=3*Int0H
    410    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]      ; left neighbors + rounding bias 8
    411    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
    412    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]      ; right neighbors + rounding bias 7
    413    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]
    414 
    415    paddw       mm1, mm7                ; 3*Int0[i] + Int0[i-1] + 8
    416    paddw       mm5, mm3
    417    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
    418    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
    419    paddw       mm0, mm7                ; 3*Int0[i] + Int0[i+1] + 7
    420    paddw       mm2, mm3
    421    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
    422    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)
    423 
    424    psllw       mm0, BYTE_BIT           ; move the odd samples into the high byte of each word
    425    psllw       mm2, BYTE_BIT
    426    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    427    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
    428 
    429    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; final samples overwrite
    430    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
    431 
    432    ; -- process the lower row
    433 
    434    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
    435    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)
    436 
    437    movq        mm7, mm6
    438    movq        mm3, mm4
    439    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
    440    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
    441    movq        mm0, mm6
    442    movq        mm2, mm4
    443    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
    444    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)
    445 
    446    por         mm7, mm3                         ; mm7=( 1 2 3 4)
    447    por         mm0, mm2                         ; mm0=( 3 4 5 6)
    448 
    449    movq        mm1, mm6
    450    movq        mm5, mm4
    451    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    452    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
    453    movq        mm3, mm4
    454    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
    455 
    456    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
    457    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)
    458 
    459    movq        MMWORD [wk(1)], mm3              ; carry word 7 forward as the next block's word -1
    460 
    461    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]      ; mm6=3*Int1L
    462    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]      ; mm4=3*Int1H
    463    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]      ; left neighbors + rounding bias 8
    464    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
    465    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]      ; right neighbors + rounding bias 7
    466    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]
    467 
    468    paddw       mm1, mm6                ; 3*Int1[i] + Int1[i-1] + 8
    469    paddw       mm0, mm4
    470    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
    471    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
    472    paddw       mm7, mm6                ; 3*Int1[i] + Int1[i+1] + 7
    473    paddw       mm5, mm4
    474    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
    475    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)
    476 
    477    psllw       mm7, BYTE_BIT           ; move the odd samples into the high byte of each word
    478    psllw       mm5, BYTE_BIT
    479    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    480    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
    481 
    482    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1  ; final samples overwrite
    483    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0  ; the intermediate data
    484 
    485    POPPIC      ebx                     ; ebx = inptr0 again
    486 
    487    sub         eax, byte SIZEOF_MMWORD    ; one input block consumed
    488    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
    489    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
    490    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
    491    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
    492    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
    493    cmp         eax, byte SIZEOF_MMWORD
    494    ja          near .columnloop        ; more than one block left
    495    test        eax, eax
    496    jnz         near .columnloop_last   ; exactly one block left
    497 
    498    pop         esi                     ; input_data
    499    pop         edi                     ; output_data
    500    pop         ecx                     ; rowctr
    501    pop         eax                     ; colctr
    502 
    503    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    504    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    505    sub         ecx, byte 2                  ; rowctr
    506    jg          near .rowloop
    507 
    508    emms                                ; empty MMX state
    509 
    510 .return:
    511    pop         edi
    512    pop         esi
    513 ;   pop         edx                     ; need not be preserved
    514 ;   pop         ecx                     ; need not be preserved
    515    pop         ebx
    516    mov         esp, ebp                ; esp <- aligned ebp
    517    pop         esp                     ; esp <- original ebp
    518    pop         ebp
    519    ret
    520 
    521 ; --------------------------------------------------------------------------
    522 ;
    523 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    524 ; It's still a box filter.
    525 ; Every input sample is simply stored twice in a row: out[2i] = out[2i+1] = in[i].
    526 ; GLOBAL(void)
    527 ; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
    528 ;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
    529 ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so whole blocks are read and written even when that runs past the nominal width.
    530 
    531 %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
    532 %define output_width(b)     (b) + 12    ; JDIMENSION output_width
    533 %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
    534 %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
    535 ; Register roles: edx=rounded output width, eax=colctr (output samples left in the row), ecx=rowctr, esi=input pointer, edi=output pointer.
    536    align       32
    537    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
    538 
    539 EXTN(jsimd_h2v1_upsample_mmx):
    540    push        ebp
    541    mov         ebp, esp                ; stack arguments are now at ebp+8 .. ebp+20
    542 ;   push        ebx                     ; unused
    543 ;   push        ecx                     ; need not be preserved
    544 ;   push        edx                     ; need not be preserved
    545    push        esi
    546    push        edi
    547 
    548    mov         edx, JDIMENSION [output_width(ebp)]
    549    add         edx, byte (2*SIZEOF_MMWORD)-1
    550    and         edx, byte -(2*SIZEOF_MMWORD)  ; round up to a multiple of 2*SIZEOF_MMWORD
    551    jz          short .return           ; zero width: nothing to do
    552 
    553    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    554    test        ecx, ecx
    555    jz          short .return           ; zero rows: nothing to do
    556 
    557    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
    558    mov         edi, POINTER [output_data_ptr(ebp)]
    559    mov         edi, JSAMPARRAY [edi]                ; output_data
    560    ALIGNX      16, 7
    561 .rowloop:
    562    push        edi                     ; output_data
    563    push        esi                     ; input_data
    564 
    565    mov         esi, JSAMPROW [esi]     ; inptr
    566    mov         edi, JSAMPROW [edi]     ; outptr
    567    mov         eax, edx                ; colctr
    568    ALIGNX      16, 7
    569 .columnloop:
    570 ; The loop body is unrolled twice; each half expands one input mmword into two output mmwords.
    571    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    572 
    573    movq        mm1, mm0
    574    punpcklbw   mm0, mm0                ; duplicate each of the low four bytes
    575    punpckhbw   mm1, mm1                ; duplicate each of the high four bytes
    576 
    577    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    578    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
    579 
    580    sub         eax, byte 2*SIZEOF_MMWORD  ; colctr counts output samples
    581    jz          short .nextrow
    582 
    583    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    584 
    585    movq        mm3, mm2
    586    punpcklbw   mm2, mm2                ; duplicate each of the low four bytes
    587    punpckhbw   mm3, mm3                ; duplicate each of the high four bytes
    588 
    589    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    590    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
    591 
    592    sub         eax, byte 2*SIZEOF_MMWORD
    593    jz          short .nextrow
    594 
    595    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    596    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
    597    jmp         short .columnloop
    598    ALIGNX      16, 7
    599 
    600 .nextrow:
    601    pop         esi                     ; input_data
    602    pop         edi                     ; output_data
    603 
    604    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    605    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    606    dec         ecx                        ; rowctr
    607    jg          short .rowloop
    608 
    609    emms                                ; empty MMX state
    610 
    611 .return:
    612    pop         edi
    613    pop         esi
    614 ;   pop         edx                     ; need not be preserved
    615 ;   pop         ecx                     ; need not be preserved
    616 ;   pop         ebx                     ; unused
    617    pop         ebp
    618    ret
    619 
    620 ; --------------------------------------------------------------------------
    621 ;
    622 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    623 ; It's still a box filter.
    624 ; Every input sample is stored twice in a row, and the same expanded data is written to two consecutive output rows.
    625 ; GLOBAL(void)
    626 ; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
    627 ;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
    628 ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so whole blocks are read and written even when that runs past the nominal width.
    629 
    630 %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
    631 %define output_width(b)     (b) + 12    ; JDIMENSION output_width
    632 %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
    633 %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
    634 ; Register roles: edx=rounded output width, eax=colctr (output samples left in the row), ecx=rowctr, esi=input pointer, ebx=outptr0, edi=outptr1.
    635    align       32
    636    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
    637 
    638 EXTN(jsimd_h2v2_upsample_mmx):
    639    push        ebp
    640    mov         ebp, esp                ; stack arguments are now at ebp+8 .. ebp+20
    641    push        ebx                     ; ebx is used as outptr0 below
    642 ;   push        ecx                     ; need not be preserved
    643 ;   push        edx                     ; need not be preserved
    644    push        esi
    645    push        edi
    646 
    647    mov         edx, JDIMENSION [output_width(ebp)]
    648    add         edx, byte (2*SIZEOF_MMWORD)-1
    649    and         edx, byte -(2*SIZEOF_MMWORD)  ; round up to a multiple of 2*SIZEOF_MMWORD
    650    jz          near .return            ; zero width: nothing to do
    651 
    652    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    653    test        ecx, ecx
    654    jz          short .return           ; zero rows: nothing to do
    655 
    656    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
    657    mov         edi, POINTER [output_data_ptr(ebp)]
    658    mov         edi, JSAMPARRAY [edi]                ; output_data
    659    ALIGNX      16, 7
    660 .rowloop:
    661    push        edi                     ; output_data
    662    push        esi                     ; input_data
    663 
    664    mov         esi, JSAMPROW [esi]                    ; inptr
    665    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    666    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    667    mov         eax, edx                               ; colctr
    668    ALIGNX      16, 7
    669 .columnloop:
    670 ; The loop body is unrolled twice; each half expands one input mmword and stores it to both output rows.
    671    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    672 
    673    movq        mm1, mm0
    674    punpcklbw   mm0, mm0                ; duplicate each of the low four bytes
    675    punpckhbw   mm1, mm1                ; duplicate each of the high four bytes
    676 
    677    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    678    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    679    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    680    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
    681 
    682    sub         eax, byte 2*SIZEOF_MMWORD  ; colctr counts output samples per row
    683    jz          short .nextrow
    684 
    685    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    686 
    687    movq        mm3, mm2
    688    punpcklbw   mm2, mm2                ; duplicate each of the low four bytes
    689    punpckhbw   mm3, mm3                ; duplicate each of the high four bytes
    690 
    691    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    692    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    693    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    694    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
    695 
    696    sub         eax, byte 2*SIZEOF_MMWORD
    697    jz          short .nextrow
    698 
    699    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    700    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
    701    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
    702    jmp         short .columnloop
    703    ALIGNX      16, 7
    704 
    705 .nextrow:
    706    pop         esi                     ; input_data
    707    pop         edi                     ; output_data
    708 
    709    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    710    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    711    sub         ecx, byte 2                  ; rowctr
    712    jg          short .rowloop
    713 
    714    emms                                ; empty MMX state
    715 
    716 .return:
    717    pop         edi
    718    pop         esi
    719 ;   pop         edx                     ; need not be preserved
    720 ;   pop         ecx                     ; need not be preserved
    721    pop         ebx
    722    pop         ebp
    723    ret
    724 
    725 ; For some reason, the OS X linker does not honor the request to align the
    726 ; segment unless we do this.
    727    align       32