tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jfdctflt-3dn.asm (12061B)


      1 ;
      2 ; jfdctflt.asm - floating-point FDCT (3DNow!)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 ;
     13 ; This file contains a floating-point implementation of the forward DCT
     14 ; (Discrete Cosine Transform). The following code is based directly on
     15 ; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
     16 
     17 %include "jsimdext.inc"
     18 %include "jdct.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21    SECTION     SEG_CONST
     22 
     23    ALIGNZ      32
     24    GLOBAL_DATA(jconst_fdct_float_3dnow)
     25 
        ; Multiplier table used by jsimd_fdct_float_3dnow.  Each constant is
        ; emitted twice ("times 2 dd") so that it fills one 64-bit mmword; a
        ; single pfmul with the table entry as memory operand therefore scales
        ; both packed single-precision lanes of an mm register by the same
        ; factor.  The code addresses the entries as GOTOFF(ebx, label).
     26 EXTN(jconst_fdct_float_3dnow):
     27 
     28 PD_0_382 times 2 dd 0.382683432365089771728460  ; cos(3*pi/8); multiplies (tmp10-tmp12) to form z5
     29 PD_0_707 times 2 dd 0.707106781186547524400844  ; cos(pi/4); forms z1 (even part) and z3 (odd part)
     30 PD_0_541 times 2 dd 0.541196100146196984399723  ; cos(pi/8)-cos(3*pi/8); scales tmp10 when forming z2
     31 PD_1_306 times 2 dd 1.306562964876376527856643  ; cos(pi/8)+cos(3*pi/8); scales tmp12 when forming z4
     32 
     33    ALIGNZ      32
     34 
     35 ; --------------------------------------------------------------------------
     36    SECTION     SEG_TEXT
     37    BITS        32
     38 ;
     39 ; Perform the forward DCT on one block of samples.
     40 ;
     41 ; GLOBAL(void)
     42 ; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
     43 ;
        ; The block is transformed in place: every load and store in the routine
        ; goes through edx, which is loaded from the stack argument "data".
        ; Pass 1 works along rows and pass 2 along columns; each loop iteration
        ; handles two rows (or two columns) at once, one per mmword lane, so
        ; each pass runs DCTSIZE/2 iterations.
        ;
        ; Registers written: eax, ecx, edx, mm0-mm7.  ebx is set by GET_GOT and
        ; bracketed by PUSHPIC/POPPIC (macros defined outside this file;
        ; presumably they save/restore ebx only in PIC builds -- verify in the
        ; include files).  femms is executed before returning.
     44 
        ; data(b): address of the "data" argument relative to b, where b is the
        ; stack pointer value taken right after "push ebp"; +8 skips the saved
        ; ebp and the return address.
     45 %define data(b)       (b) + 8           ; FAST_FLOAT *data
     46 
        ; Frame layout relative to the mmword-aligned ebp set up by the prologue:
        ;   [ebp + 0]                       pre-alignment stack pointer
        ;   [ebp - 1*SIZEOF_MMWORD] = wk(1) scratch mmword
        ;   [ebp - 2*SIZEOF_MMWORD] = wk(0) scratch mmword
     47 %define original_ebp  ebp + 0           ; slot written by "mov [esp], eax" in the prologue
     48 %define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
     49 %define WK_NUM        2                 ; number of scratch mmwords (holds tmp6 and tmp7)
     50 
     51    align       32
     52    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
     53 
     54 EXTN(jsimd_fdct_float_3dnow):
           ; Prologue: build a frame whose base (ebp) is aligned to
           ; SIZEOF_MMWORD so that the wk() scratch slots are aligned too.
           ; eax keeps the pre-alignment stack pointer; it is never modified
           ; afterwards in the visible code and is used by both passes to
           ; fetch the "data" argument.
     55    push        ebp                         ; save caller's ebp
     56    mov         eax, esp                    ; eax -> saved original ebp
     57    sub         esp, byte 4                 ; keep the slot written below clear of the saved ebp
     58    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
     59    mov         [esp], eax                  ; stash pre-alignment esp for the epilogue
     60    mov         ebp, esp                    ; ebp = aligned ebp
     61    lea         esp, [wk(0)]                ; reserve the WK_NUM scratch mmwords below ebp
     62    PUSHPIC     ebx
     63 ;   push        ecx                     ; need not be preserved
     64 ;   push        edx                     ; need not be preserved
     65 ;   push        esi                     ; unused
     66 ;   push        edi                     ; unused
     67 
     68    GET_GOT     ebx                     ; get GOT address
     69 
     70    ; ---- Pass 1: process rows.
           ;
           ; Each iteration loads two consecutive rows, transposes 2x2 tiles so
           ; that every mm register holds the same sample index of both rows
           ; (one row per lane), and runs the 1-D transform on both rows at
           ; once.  NOTE(review): MMBLOCK is defined outside this file; the
           ; register comments below treat MMBLOCK(r,c,...) as row r, mmword c
           ; -- confirm against the include files.
           ;
           ; The results are stored without transposing back: mmword (0,k)
           ; receives output 2k of both rows and mmword (1,k) receives output
           ; 2k+1 of both rows.  Pass 2 expects exactly this layout (see its
           ; load comments) and undoes it with its own unpack step.
     71 
     72    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
     73    mov         ecx, DCTSIZE/2            ; two rows per iteration
     74    ALIGNX      16, 7
     75 .rowloop:
     76 
     77    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
     78    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
     79    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
     80    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
     81 
     82    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
     83 
     84    movq        mm4, mm0                ; transpose coefficients
     85    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
     86    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
     87    movq        mm5, mm2                ; transpose coefficients
     88    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
     89    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7
     90 
     91    movq        mm6, mm4                ; copy data1: pfsub/pfadd overwrite their destination
     92    movq        mm7, mm0                ; copy data0
     93    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
     94    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
     95    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
     96    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
     97 
     98    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
     99    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    100    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
    101    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
    102 
    103    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
    104 
           ; All eight mm registers are live here; tmp6/tmp7 are spilled because
           ; mm4 and mm0 are reused as transpose temporaries just below.
    105    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
    106    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
    107 
    108    movq        mm4, mm1                ; transpose coefficients
    109    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
    110    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
    111    movq        mm0, mm2                ; transpose coefficients
    112    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
    113    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5
    114 
    115    movq        mm3, mm4                ; copy data3
    116    movq        mm5, mm1                ; copy data2
    117    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
    118    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
    119    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
    120    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
    121 
    122    ; -- Even part
    123 
    124    movq        mm2, mm7                ; copy tmp0
    125    movq        mm0, mm6                ; copy tmp1
    126    pfsub       mm7, mm4                ; mm7=tmp13
    127    pfsub       mm6, mm1                ; mm6=tmp12
    128    pfadd       mm2, mm4                ; mm2=tmp10
    129    pfadd       mm0, mm1                ; mm0=tmp11
    130 
    131    pfadd       mm6, mm7                ; mm6=tmp12+tmp13
    132    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
    133 
    134    movq        mm4, mm2                ; copy tmp10
    135    movq        mm1, mm7                ; copy tmp13
    136    pfsub       mm2, mm0                ; mm2=data4
    137    pfsub       mm7, mm6                ; mm7=data6
    138    pfadd       mm4, mm0                ; mm4=data0
    139    pfadd       mm1, mm6                ; mm1=data2
    140 
           ; Even outputs of both rows go to the first row of the pair.
    141    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
    142    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
    143    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
    144    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
    145 
    146    ; -- Odd part
    147 
    148    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
    149    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
    150 
    151    pfadd       mm3, mm5                ; mm3=tmp10
    152    pfadd       mm5, mm0                ; mm5=tmp11
    153    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
    154 
    155    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
    156 
    157    movq        mm2, mm3                     ; mm2=tmp10
    158    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
    159    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
    160    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
    161    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
    162    pfadd       mm2, mm3                     ; mm2=z2
    163    pfadd       mm0, mm3                     ; mm0=z4
    164 
    165    movq        mm7, mm6                ; copy tmp7
    166    pfsub       mm6, mm5                ; mm6=z13
    167    pfadd       mm7, mm5                ; mm7=z11
    168 
    169    movq        mm4, mm6                ; copy z13
    170    movq        mm1, mm7                ; copy z11
    171    pfsub       mm6, mm2                ; mm6=data3
    172    pfsub       mm7, mm0                ; mm7=data7
    173    pfadd       mm4, mm2                ; mm4=data5
    174    pfadd       mm1, mm0                ; mm1=data1
    175 
           ; Odd outputs of both rows go to the second row of the pair.
    176    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
    177    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
    178    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
    179    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
    180 
    181    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance by two rows
    182    dec         ecx
    183    jnz         near .rowloop
    184 
    185    ; ---- Pass 2: process columns.
           ;
           ; Each iteration handles two adjacent columns.  The loads pick up the
           ; 2x2 tiles in the layout left by pass 1; the unpack step turns them
           ; into one register per row holding that row's two column samples.
           ; The stores below write each result row back at its own row index.
    186 
    187    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
    188    mov         ecx, DCTSIZE/2            ; two columns per iteration
    189    ALIGNX      16, 7
    190 .columnloop:
    191 
    192    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    193    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    194    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
    195    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
    196 
    197    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
    198 
    199    movq        mm4, mm0                ; transpose coefficients
    200    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
    201    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
    202    movq        mm5, mm2                ; transpose coefficients
    203    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
    204    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7
    205 
    206    movq        mm6, mm4                ; copy data1: pfsub/pfadd overwrite their destination
    207    movq        mm7, mm0                ; copy data0
    208    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
    209    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
    210    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
    211    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
    212 
    213    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
    214    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
    215    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
    216    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
    217 
    218    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
    219 
           ; Same spill as in pass 1: mm4 and mm0 are needed as transpose
           ; temporaries while tmp6/tmp7 must survive until the odd part.
    220    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
    221    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
    222 
    223    movq        mm4, mm1                ; transpose coefficients
    224    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
    225    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
    226    movq        mm0, mm2                ; transpose coefficients
    227    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
    228    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5
    229 
    230    movq        mm3, mm4                ; copy data3
    231    movq        mm5, mm1                ; copy data2
    232    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
    233    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
    234    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
    235    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
    236 
    237    ; -- Even part
    238 
    239    movq        mm2, mm7                ; copy tmp0
    240    movq        mm0, mm6                ; copy tmp1
    241    pfsub       mm7, mm4                ; mm7=tmp13
    242    pfsub       mm6, mm1                ; mm6=tmp12
    243    pfadd       mm2, mm4                ; mm2=tmp10
    244    pfadd       mm0, mm1                ; mm0=tmp11
    245 
    246    pfadd       mm6, mm7                ; mm6=tmp12+tmp13
    247    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
    248 
    249    movq        mm4, mm2                ; copy tmp10
    250    movq        mm1, mm7                ; copy tmp13
    251    pfsub       mm2, mm0                ; mm2=data4
    252    pfsub       mm7, mm6                ; mm7=data6
    253    pfadd       mm4, mm0                ; mm4=data0
    254    pfadd       mm1, mm6                ; mm1=data2
    255 
    256    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
    257    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
    258    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
    259    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
    260 
    261    ; -- Odd part
    262 
    263    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
    264    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
    265 
    266    pfadd       mm3, mm5                ; mm3=tmp10
    267    pfadd       mm5, mm0                ; mm5=tmp11
    268    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
    269 
    270    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
    271 
    272    movq        mm2, mm3                     ; mm2=tmp10
    273    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
    274    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
    275    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
    276    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
    277    pfadd       mm2, mm3                     ; mm2=z2
    278    pfadd       mm0, mm3                     ; mm0=z4
    279 
    280    movq        mm7, mm6                ; copy tmp7
    281    pfsub       mm6, mm5                ; mm6=z13
    282    pfadd       mm7, mm5                ; mm7=z11
    283 
    284    movq        mm4, mm6                ; copy z13
    285    movq        mm1, mm7                ; copy z11
    286    pfsub       mm6, mm2                ; mm6=data3
    287    pfsub       mm7, mm0                ; mm7=data7
    288    pfadd       mm4, mm2                ; mm4=data5
    289    pfadd       mm1, mm0                ; mm1=data1
    290 
    291    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
    292    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
    293    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
    294    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
    295 
    296    add         edx, byte 2*SIZEOF_FAST_FLOAT  ; advance by two columns
    297    dec         ecx
    298    jnz         near .columnloop
    299 
    300    femms                               ; empty MMX/3DNow! state
    301 
    302 ;   pop         edi                     ; unused
    303 ;   pop         esi                     ; unused
    304 ;   pop         edx                     ; need not be preserved
    305 ;   pop         ecx                     ; need not be preserved
    306    POPPIC      ebx
           ; Epilogue: undo the prologue in reverse.  [ebp] holds the stack
           ; pointer value captured right after "push ebp", so popping it into
           ; esp leaves esp pointing at the saved caller ebp.
    307    mov         esp, ebp                ; esp <- aligned ebp
    308    pop         esp                     ; esp <- pre-alignment esp (points at saved ebp)
    309    pop         ebp                     ; restore caller's ebp
    310    ret
    311 
    312 ; For some reason, the OS X linker does not honor the request to align the
    313 ; segment unless we do this.
    314    align       32