tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jquant-mmx.asm (9215B)


      1 ;
      2 ; jquant.asm - sample data conversion and quantization (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jsimdext.inc"
     14 %include "jdct.inc"
     15 
     16 ; --------------------------------------------------------------------------
     17    SECTION     SEG_TEXT
     18    BITS        32
     19 ;
     20 ; Load data into workspace, applying unsigned->signed conversion
     21 ;
     22 ; GLOBAL(void)
     23 ; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
     24 ;                    DCTELEM *workspace);
     25 ;
     26 
     27 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
     28 %define start_col    ebp + 12           ; JDIMENSION start_col
     29 %define workspace    ebp + 16           ; DCTELEM *workspace
     30 
     31    align       32
     32    GLOBAL_FUNCTION(jsimd_convsamp_mmx)
     33 
        ; Copies one 8x8 block of 8-bit samples into the 16-bit DCTELEM
        ; workspace: each byte is zero-extended to a word and then -128
        ; (0xFF80) is added, level-shifting the unsigned sample range to a
        ; signed one centered on zero.
        ;
        ; Register roles:
        ;   esi = cursor into the sample_data row-pointer array
        ;   eax = start_col (column offset applied to every row)
        ;   edi = output cursor into workspace
        ;   ecx = loop counter: DCTSIZE/4 iterations, 4 rows per iteration
        ;   mm6 = all zeros (supplies the high bytes for punpck* extension)
        ;   mm7 = {-128, -128, -128, -128} level-shift constant
     34 EXTN(jsimd_convsamp_mmx):
     35    push        ebp
     36    mov         ebp, esp
     37    push        ebx
     38 ;   push        ecx                     ; need not be preserved
     39 ;   push        edx                     ; need not be preserved
     40    push        esi
     41    push        edi
     42 
     43    pxor        mm6, mm6                ; mm6=(all 0's)
     44    pcmpeqw     mm7, mm7                ; mm7=(all 1's)
     45    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80} = {-128 x4}
     46 
     47    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     48    mov         eax, JDIMENSION [start_col]
     49    mov         edi, POINTER [workspace]       ; (DCTELEM *)
     50    mov         ecx, DCTSIZE/4
     51    ALIGNX      16, 7
     52 .convloop:                             ; each iteration handles 4 rows (32 samples)
     53    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     54    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     55 
     56    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm0=(01234567)
     57    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm1=(89ABCDEF)
     58 
     59    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     60    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     61 
     62    movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm2=(GHIJKLMN)
     63    movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm3=(OPQRSTUV)
     64 
        ; Interleave with mm6 (zeros) to zero-extend bytes to words.
     65    movq        mm4, mm0
     66    punpcklbw   mm0, mm6                ; mm0=(0123)
     67    punpckhbw   mm4, mm6                ; mm4=(4567)
     68    movq        mm5, mm1
     69    punpcklbw   mm1, mm6                ; mm1=(89AB)
     70    punpckhbw   mm5, mm6                ; mm5=(CDEF)
     71 
     72    paddw       mm0, mm7                ; level shift: add -128 to each word
     73    paddw       mm4, mm7
     74    paddw       mm1, mm7
     75    paddw       mm5, mm7
     76 
     77    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
     78    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
     79    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
     80    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
     81 
        ; Same zero-extend + level-shift for the third and fourth rows.
     82    movq        mm0, mm2
     83    punpcklbw   mm2, mm6                ; mm2=(GHIJ)
     84    punpckhbw   mm0, mm6                ; mm0=(KLMN)
     85    movq        mm4, mm3
     86    punpcklbw   mm3, mm6                ; mm3=(OPQR)
     87    punpckhbw   mm4, mm6                ; mm4=(STUV)
     88 
     89    paddw       mm2, mm7                ; level shift: add -128 to each word
     90    paddw       mm0, mm7
     91    paddw       mm3, mm7
     92    paddw       mm4, mm7
     93 
     94    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
     95    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
     96    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
     97    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
     98 
     99    add         esi, byte 4*SIZEOF_JSAMPROW            ; advance 4 row pointers
    100    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM     ; advance 4 output rows
    101    dec         ecx
    102    jnz         short .convloop
    103 
    104    emms                                ; empty MMX state
    105 
    106    pop         edi
    107    pop         esi
    108 ;   pop         edx                     ; need not be preserved
    109 ;   pop         ecx                     ; need not be preserved
    110    pop         ebx
    111    pop         ebp
    112    ret
    113 
    114 ; --------------------------------------------------------------------------
    115 ;
    116 ; Quantize/descale the coefficients, and store into coef_block
    117 ;
    118 ; This implementation is based on an algorithm described in
    119 ;   "Optimizing subroutines in assembly language:
    120 ;   An optimization guide for x86 platforms" (https://agner.org/optimize).
    121 ;
    122 ; GLOBAL(void)
    123 ; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
    124 ;                    DCTELEM *workspace);
    125 ;
    126 
        ; The divisors table is laid out as four consecutive DCTSIZE2-element
        ; rows of DCTELEMs: reciprocal, correction, scale, shift.  (The SHIFT
        ; row macro is defined for layout completeness but is not referenced
        ; by this routine.)
    127 %define RECIPROCAL(m, n, b) \
    128  MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
    129 %define CORRECTION(m, n, b) \
    130  MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
    131 %define SCALE(m, n, b) \
    132  MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
    133 %define SHIFT(m, n, b) \
    134  MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
    135 
    136 %define coef_block  ebp + 8             ; JCOEFPTR coef_block
    137 %define divisors    ebp + 12            ; DCTELEM *divisors
    138 %define workspace   ebp + 16            ; DCTELEM *workspace
    139 
    140    align       32
    141    GLOBAL_FUNCTION(jsimd_quantize_mmx)
    142 
        ; For each coefficient: take the absolute value, add the precomputed
        ; rounding correction, multiply (high word) by the reciprocal and
        ; scale factors using the signed-multiply workarounds described in
        ; the comment block below, then restore the original sign.
        ;
        ; Register roles:
        ;   esi = input cursor (workspace)
        ;   edx = divisor-table cursor
        ;   edi = output cursor (coef_block)
        ;   ah  = outer loop counter (2), al = inner counter (DCTSIZE2/8/2);
        ;         2 * (DCTSIZE2/8/2) inner passes * 8 coefficients per pass
        ;         covers all DCTSIZE2 coefficients.  Both counters live in
        ;         eax so no extra register is consumed.
        ;   mm2/mm3 hold the sign masks of the current 8 inputs across the
        ;   whole loop body so the sign can be re-applied at the end.
    143 EXTN(jsimd_quantize_mmx):
    144    push        ebp
    145    mov         ebp, esp
    146 ;   push        ebx                     ; unused
    147 ;   push        ecx                     ; unused
    148 ;   push        edx                     ; need not be preserved
    149    push        esi
    150    push        edi
    151 
    152    mov         esi, POINTER [workspace]
    153    mov         edx, POINTER [divisors]
    154    mov         edi, JCOEFPTR [coef_block]
    155    mov         ah, 2                   ; outer counter (kept in AH)
    156    ALIGNX      16, 7
    157 .quantloop1:
    158    mov         al, DCTSIZE2/8/2        ; inner counter (kept in AL)
    159    ALIGNX      16, 7
    160 .quantloop2:                           ; processes 8 coefficients per pass
    161    movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    162    movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
    163 
    164    movq        mm0, mm2
    165    movq        mm1, mm3
    166 
    167    psraw       mm2, (WORD_BIT-1)       ; -1 if value < 0, 0 otherwise
    168    psraw       mm3, (WORD_BIT-1)
    169 
        ; Absolute value via conditional two's-complement negate:
        ; (val XOR mask) - mask == -val when mask is -1, val when mask is 0.
    170    pxor        mm0, mm2                ; val = -val
    171    pxor        mm1, mm3
    172    psubw       mm0, mm2
    173    psubw       mm1, mm3
    174 
    175    ;
    176    ; MMX is an annoyingly crappy instruction set. It has two
    177    ; misfeatures that are causing problems here:
    178    ;
    179    ; - All multiplications are signed.
    180    ;
    181    ; - The second operand for the shifts is not treated as packed.
    182    ;
    183    ;
    184    ; We work around the first problem by implementing this algorithm:
    185    ;
    186    ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
    187    ; {
    188    ;   enum { SHORT_BIT = 16 };
    189    ;   signed short sx = (signed short)x;
    190    ;   signed short sy = (signed short)y;
    191    ;   signed long sz;
    192    ;
    193    ;   sz = (long)sx * (long)sy;    /* signed multiply */
    194    ;
    195    ;   if (sx < 0) sz += (long)sy << SHORT_BIT;
    196    ;   if (sy < 0) sz += (long)sx << SHORT_BIT;
    197    ;
    198    ;   return (unsigned long)sz;
    199    ; }
    200    ;
    201    ; (note that a negative sx adds _sy_ and vice versa)
    202    ;
    203    ; For the second problem, we replace the shift by a multiplication.
    204    ; Unfortunately that means we have to deal with the signed issue again.
    205    ;
    206 
    207    paddw       mm0, MMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    208    paddw       mm1, MMWORD [CORRECTION(0,1,edx)]
    209 
    210    movq        mm4, mm0                ; store current value for later
    211    movq        mm5, mm1
    212    pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    213    pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
    214    paddw       mm0, mm4  ; reciprocal is always negative (MSB=1),
    215    paddw       mm1, mm5  ; so we always need to add the initial value
    216                          ; (input value is never negative as we
    217                          ; inverted it at the start of this routine)
    218 
    219    ; here it gets a bit tricky as both scale
    220    ; and mm0/mm1 can be negative
    221    movq        mm6, MMWORD [SCALE(0,0,edx)]  ; scale
    222    movq        mm7, MMWORD [SCALE(0,1,edx)]
    223    movq        mm4, mm0                ; keep pre-multiply value for fixups
    224    movq        mm5, mm1
    225    pmulhw      mm0, mm6                ; signed high-word multiply by scale
    226    pmulhw      mm1, mm7
    227 
    228    psraw       mm6, (WORD_BIT-1)       ; determine if scale is negative
    229    psraw       mm7, (WORD_BIT-1)
    230 
    231    pand        mm6, mm4                ; and add input if it is
    232    pand        mm7, mm5
    233    paddw       mm0, mm6
    234    paddw       mm1, mm7
    235 
    236    psraw       mm4, (WORD_BIT-1)       ; then check if negative input
    237    psraw       mm5, (WORD_BIT-1)
    238 
    239    pand        mm4, MMWORD [SCALE(0,0,edx)]  ; and add scale if it is
    240    pand        mm5, MMWORD [SCALE(0,1,edx)]
    241    paddw       mm0, mm4
    242    paddw       mm1, mm5
    243 
        ; Re-apply the original signs saved in mm2/mm3 (same xor/sub
        ; conditional-negate idiom as above).
    244    pxor        mm0, mm2                ; val = -val
    245    pxor        mm1, mm3
    246    psubw       mm0, mm2
    247    psubw       mm1, mm3
    248 
    249    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
    250    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
    251 
    252    add         esi, byte 8*SIZEOF_DCTELEM   ; advance all three cursors
    253    add         edx, byte 8*SIZEOF_DCTELEM   ; by 8 elements
    254    add         edi, byte 8*SIZEOF_JCOEF
    255    dec         al
    256    jnz         near .quantloop2
    257    dec         ah
    258    jnz         near .quantloop1        ; to avoid branch misprediction
    259 
    260    emms                                ; empty MMX state
    261 
    262    pop         edi
    263    pop         esi
    264 ;   pop         edx                     ; need not be preserved
    265 ;   pop         ecx                     ; unused
    266 ;   pop         ebx                     ; unused
    267    pop         ebp
    268    ret
    269 
    270 ; For some reason, the OS X linker does not honor the request to align the
    271 ; segment unless we do this.
    272    align       32