tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jquant-3dn.asm (8731B)


      1 ;
      2 ; jquant-3dn.asm - sample data conversion and quantization (3DNow! & MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jsimdext.inc"
     14 %include "jdct.inc"
     15 
     16 ; --------------------------------------------------------------------------
     17    SECTION     SEG_TEXT                ; code section (SEG_TEXT is not defined in this file; presumably from jsimdext.inc)
     18    BITS        32                      ; assemble the following as 32-bit code
     19 ;
     20 ; Load data into workspace, applying unsigned->signed conversion
     21 ;
     22 ; GLOBAL(void)
     23 ; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
     24 ;                            FAST_FLOAT *workspace);
     25 ; Handles two rows (16 samples) per loop pass; clobbers eax, ecx, edx, mm0-mm7.
     26 
     27 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
     28 %define start_col    ebp + 12           ; JDIMENSION start_col
     29 %define workspace    ebp + 16           ; FAST_FLOAT *workspace
     30 
     31    align       32                      ; start the function on a 32-byte boundary
     32    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
     33 
     34 EXTN(jsimd_convsamp_float_3dnow):
     35    push        ebp                     ; save caller's frame pointer
     36    mov         ebp, esp                ; arguments are addressed relative to ebp
     37    push        ebx                     ; ebx is used below as a row pointer
     38 ;   push        ecx                     ; need not be preserved
     39 ;   push        edx                     ; need not be preserved
     40    push        esi                     ; esi is used below as the row-pointer cursor
     41    push        edi                     ; edi is used below as the output cursor
     42 
     43    pcmpeqw     mm7, mm7                ; mm7 = 0xFFFF in every word
     44    psllw       mm7, 7                  ; mm7 = 0xFF80 in every word
     45    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
     46 
     47    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     48    mov         eax, JDIMENSION [start_col]  ; eax = column offset applied to every row
     49    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
     50    mov         ecx, DCTSIZE/2          ; loop counter: two rows are consumed per pass
     51    ALIGNX      16, 7                   ; align the loop entry (ALIGNX is defined outside this file)
     52 .convloop:
     53    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     54    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     55 
     56    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; 8 samples from the first row
     57    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; 8 samples from the second row
     58 
     59    psubb       mm0, mm7                ; mm0=(01234567), each byte minus 0x80
     60    psubb       mm1, mm7                ; mm1=(89ABCDEF), each byte minus 0x80
     61 
     62    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3), sample in high byte of each word
     63    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
     64    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
     65    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
     66 
     67    punpcklwd   mm4, mm2                ; mm4=(***0***1), sample in top byte of each dword
     68    punpckhwd   mm2, mm2                ; mm2=(***2***3)
     69    punpcklwd   mm5, mm0                ; mm5=(***4***5)
     70    punpckhwd   mm0, mm0                ; mm0=(***6***7)
     71 
     72    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01), sign-extended; '*' filler shifted out
     73    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
     74    pi2fd       mm4, mm4                ; packed int32 -> packed float
     75    pi2fd       mm2, mm2                ; packed int32 -> packed float
     76    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
     77    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
     78    pi2fd       mm5, mm5                ; packed int32 -> packed float
     79    pi2fd       mm0, mm0                ; packed int32 -> packed float
     80 
     81    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4  ; store samples 0-1
     82    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2  ; store samples 2-3
     83    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5  ; store samples 4-5
     84    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0  ; store samples 6-7
     85 
     86    punpcklwd   mm6, mm3                ; mm6=(***8***9)
     87    punpckhwd   mm3, mm3                ; mm3=(***A***B)
     88    punpcklwd   mm4, mm1                ; mm4=(***C***D), mm4 is free again after its store
     89    punpckhwd   mm1, mm1                ; mm1=(***E***F)
     90 
     91    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
     92    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
     93    pi2fd       mm6, mm6                ; packed int32 -> packed float
     94    pi2fd       mm3, mm3                ; packed int32 -> packed float
     95    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
     96    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
     97    pi2fd       mm4, mm4                ; packed int32 -> packed float
     98    pi2fd       mm1, mm1                ; packed int32 -> packed float
     99 
    100    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6  ; store samples 8-9
    101    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3  ; store samples A-B
    102    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4  ; store samples C-D
    103    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1  ; store samples E-F
    104 
    105    add         esi, byte 2*SIZEOF_JSAMPROW  ; advance to the next pair of row pointers
    106    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance output by two rows of DCTSIZE floats
    107    dec         ecx                     ; one fewer row pair remaining
    108    jnz         near .convloop          ; repeat until all DCTSIZE/2 pairs are done
    109 
    110    femms                               ; empty MMX/3DNow! state
    111 
    112    pop         edi                     ; restore saved registers in reverse push order
    113    pop         esi
    114 ;   pop         edx                     ; need not be preserved
    115 ;   pop         ecx                     ; need not be preserved
    116    pop         ebx
    117    pop         ebp                     ; restore caller's frame pointer
    118    ret
    119 
    120 ; --------------------------------------------------------------------------
    121 ;
    122 ; Quantize/descale the coefficients, and store into coef_block
    123 ;
    124 ; GLOBAL(void)
    125 ; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
    126 ;                            FAST_FLOAT *workspace);
    127 ; Handles 16 coefficients per loop pass; clobbers eax, edx, mm0-mm7.
    128 
    129 %define coef_block  ebp + 8             ; JCOEFPTR coef_block
    130 %define divisors    ebp + 12            ; FAST_FLOAT *divisors
    131 %define workspace   ebp + 16            ; FAST_FLOAT *workspace
    132 
    133    align       32                      ; start the function on a 32-byte boundary
    134    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
    135 
    136 EXTN(jsimd_quantize_float_3dnow):
    137    push        ebp                     ; save caller's frame pointer
    138    mov         ebp, esp                ; arguments are addressed relative to ebp
    139 ;   push        ebx                     ; unused
    140 ;   push        ecx                     ; unused
    141 ;   push        edx                     ; need not be preserved
    142    push        esi                     ; esi is used below as the workspace cursor
    143    push        edi                     ; edi is used below as the output cursor
    144 
    145    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
    146    movd        mm7, eax                ; low dword of mm7 = magic bit pattern
    147    punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}
    148 
    149    mov         esi, POINTER [workspace]  ; esi = input floats
    150    mov         edx, POINTER [divisors]  ; edx = per-coefficient multipliers (used with pfmul)
    151    mov         edi, JCOEFPTR [coef_block]  ; edi = output coefficients
    152    mov         eax, DCTSIZE2/16        ; loop counter: 16 coefficients per pass
    153    ALIGNX      16, 7                   ; align the loop entry (ALIGNX is defined outside this file)
    154 .quantloop:
    155    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]  ; load floats 00 01
    156    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]  ; load floats 02 03
    157    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]  ; scale by matching divisors entries
    158    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    159    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]  ; load floats 04 05
    160    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]  ; load floats 06 07
    161    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
    162    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
    163 
    164    pfadd       mm0, mm7                ; mm0=(00 ** 01 **), integer result now in low word of each dword
    165    pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
    166    pfadd       mm2, mm7                ; mm2=(04 ** 05 **)
    167    pfadd       mm3, mm7                ; mm3=(06 ** 07 **)
    168 
    169    movq        mm4, mm0                ; copy: punpcklwd below overwrites mm0
    170    punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
    171    punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
    172    movq        mm5, mm2                ; copy: punpcklwd below overwrites mm2
    173    punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
    174    punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)
    175 
    176    punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
    177    punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)
    178 
    179    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]  ; load floats 10 11
    180    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]  ; load floats 12 13
    181    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]  ; scale by matching divisors entries
    182    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    183    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]  ; load floats 14 15
    184    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]  ; load floats 16 17
    185    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
    186    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
    187 
    188    pfadd       mm6, mm7                ; mm6=(10 ** 11 **)
    189    pfadd       mm1, mm7                ; mm1=(12 ** 13 **)
    190    pfadd       mm3, mm7                ; mm3=(14 ** 15 **)
    191    pfadd       mm4, mm7                ; mm4=(16 ** 17 **)
    192 
    193    movq        mm5, mm6                ; copy: punpcklwd below overwrites mm6
    194    punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
    195    punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
    196    movq        mm1, mm3                ; copy: punpcklwd below overwrites mm3
    197    punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
    198    punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)
    199 
    200    punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
    201    punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)
    202 
    203    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0  ; store words 00-03
    204    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2  ; store words 04-07
    205    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6  ; store words 10-13
    206    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3  ; store words 14-17
    207 
    208    add         esi, byte 16*SIZEOF_FAST_FLOAT  ; advance input past the 16 floats consumed
    209    add         edx, byte 16*SIZEOF_FAST_FLOAT  ; advance multipliers in step with the input
    210    add         edi, byte 16*SIZEOF_JCOEF  ; advance output past the 16 coefficients written
    211    dec         eax                     ; one fewer group of 16 remaining
    212    jnz         near .quantloop         ; repeat until all DCTSIZE2/16 groups are done
    213 
    214    femms                               ; empty MMX/3DNow! state
    215 
    216    pop         edi                     ; restore saved registers in reverse push order
    217    pop         esi
    218 ;   pop         edx                     ; need not be preserved
    219 ;   pop         ecx                     ; unused
    220 ;   pop         ebx                     ; unused
    221    pop         ebp                     ; restore caller's frame pointer
    222    ret
    223 
    224 ; For some reason, the OS X linker does not honor the request to align the
    225 ; segment unless we do this.
    226    align       32                      ; pad the current position to the next 32-byte boundary