tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jquanti-sse2.asm (7248B)


      1 ;
      2 ; jquanti-sse2.asm - sample data conversion and quantization (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 
     13 %include "jsimdext.inc"
     14 %include "jdct.inc"
     15 
     16 ; --------------------------------------------------------------------------
     17    SECTION     SEG_TEXT
     18    BITS        32
     19 ;
     20 ; Load data into workspace, applying unsigned->signed conversion
     21 ;
     22 ; GLOBAL(void)
     23 ; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
     24 ;                     DCTELEM *workspace);
        ;
        ; All three arguments are read from the stack relative to ebp (see the
        ; %defines below).
        ;
        ; Register roles inside the function:
        ;   esi     = cursor into the sample_data array of row pointers
        ;   eax     = start_col, the offset applied to every row pointer
        ;   edi     = cursor into workspace
        ;   ecx     = loop counter: DCTSIZE/4 iterations, 4 rows per iteration
        ;   ebx/edx = scratch row pointers
        ;   xmm6    = all zero bits, used to zero-extend bytes to words
        ;   xmm7    = 0xFF80 in every word, i.e. -128 as a signed 16-bit value
        ;
        ; ebx, esi and edi are saved and restored.  eax, ecx, edx, xmm0-xmm3,
        ; xmm6 and xmm7 are overwritten.
        ;
        ; NOTE(review): the results are written with movdqa, which requires a
        ; 16-byte-aligned address, so workspace is presumably 16-byte aligned --
        ; confirm against the caller and the XMMBLOCK macro (not defined in this
        ; listing).
     25 ;
     26 
     27 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
     28 %define start_col    ebp + 12           ; JDIMENSION start_col
     29 %define workspace    ebp + 16           ; DCTELEM *workspace
     30 
     31    align       32
     32    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
     33 
     34 EXTN(jsimd_convsamp_sse2):
     35    push        ebp
     36    mov         ebp, esp                ; ebp = frame base for the argument %defines
     37    push        ebx
     38 ;   push        ecx                     ; need not be preserved
     39 ;   push        edx                     ; need not be preserved
     40    push        esi
     41    push        edi
     42 
     43    pxor        xmm6, xmm6              ; xmm6=(all 0's)
     44    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
     45    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
     46 
     47    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     48    mov         eax, JDIMENSION [start_col]
     49    mov         edi, POINTER [workspace]       ; (DCTELEM *)
     50    mov         ecx, DCTSIZE/4          ; 4 rows are converted per iteration
     51    ALIGNX      16, 7
     52 .convloop:
        ; Fetch four row pointers and load 64 bits from each row at start_col.
        ; NOTE(review): the byte unpack below treats each sample as one byte,
        ; so SIZEOF_JSAMPLE is presumably 1 -- confirm.
     53    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     54    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     55 
     56    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
     57    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
     58 
     59    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     60    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     61 
     62    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
     63    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
     64 
        ; Interleaving with the zero register widens each unsigned byte to a
        ; word; adding 0xFF80 then subtracts 128 from every word.
     65    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
     66    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
     67    paddw       xmm0, xmm7              ; each word -= 128
     68    paddw       xmm1, xmm7
     69    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
     70    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
     71    paddw       xmm2, xmm7
     72    paddw       xmm3, xmm7
     73 
        ; Store the four converted rows (first XMMBLOCK argument 0..3).
     74    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
     75    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
     76    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
     77    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
     78 
     79    add         esi, byte 4*SIZEOF_JSAMPROW          ; skip the 4 row pointers just used
     80    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM   ; advance by 4*DCTSIZE output elements
     81    dec         ecx
     82    jnz         short .convloop
     83 
     84    pop         edi
     85    pop         esi
     86 ;   pop         edx                     ; need not be preserved
     87 ;   pop         ecx                     ; need not be preserved
     88    pop         ebx
     89    pop         ebp
     90    ret
     91 
     92 ; --------------------------------------------------------------------------
     93 ;
     94 ; Quantize/descale the coefficients, and store into coef_block
     95 ;
     96 ; This implementation is based on an algorithm described in
     97 ;   "Optimizing subroutines in assembly language:
     98 ;   An optimization guide for x86 platforms" (https://agner.org/optimize).
     99 ;
    100 ; GLOBAL(void)
    101 ; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
    102 ;                     DCTELEM *workspace);
        ;
        ; For every 16-bit element the loop computes:
        ;   sign = value >> (WORD_BIT-1)                 (arithmetic shift)
        ;   mag  = (value ^ sign) - sign                 (absolute value)
        ;   mag  = high word of ((mag + correction) * reciprocal)   (unsigned)
        ;   mag  = high word of (mag * scale)                       (unsigned)
        ;   out  = (mag ^ sign) - sign                   (original sign restored)
        ;
        ; Register roles inside the function:
        ;   esi = workspace cursor (input)
        ;   edx = divisors cursor
        ;   edi = coef_block cursor (output)
        ;   eax = loop counter: DCTSIZE2/32 iterations, 32 elements per iteration
        ;   xmm0-xmm3 = values being quantized, xmm4-xmm7 = their sign masks
        ;
        ; esi and edi are saved and restored.  eax, edx and xmm0-xmm7 are
        ; overwritten.
        ;
        ; NOTE(review): every load and store of workspace, divisors and
        ; coef_block uses an aligned form (movdqa, or a memory operand of
        ; paddw/pmulhuw), so all three buffers are presumably 16-byte aligned --
        ; confirm against the caller.
    103 ;
    104 
        ; The divisors table is addressed as three sub-tables whose first
        ; XMMBLOCK argument is offset by DCTSIZE*0 (reciprocals), DCTSIZE*1
        ; (corrections) and DCTSIZE*2 (scales).  XMMBLOCK itself is not defined
        ; in this listing.
    105 %define RECIPROCAL(m, n, b) \
    106  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
    107 %define CORRECTION(m, n, b) \
    108  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
    109 %define SCALE(m, n, b) \
    110  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
    111 
    112 %define coef_block  ebp + 8             ; JCOEFPTR coef_block
    113 %define divisors    ebp + 12            ; DCTELEM *divisors
    114 %define workspace   ebp + 16            ; DCTELEM *workspace
    115 
    116    align       32
    117    GLOBAL_FUNCTION(jsimd_quantize_sse2)
    118 
    119 EXTN(jsimd_quantize_sse2):
    120    push        ebp
    121    mov         ebp, esp                ; ebp = frame base for the argument %defines
    122 ;   push        ebx                     ; unused
    123 ;   push        ecx                     ; unused
    124 ;   push        edx                     ; need not be preserved
    125    push        esi
    126    push        edi
    127 
    128    mov         esi, POINTER [workspace]
    129    mov         edx, POINTER [divisors]
    130    mov         edi, JCOEFPTR [coef_block]
    131    mov         eax, DCTSIZE2/32        ; 32 elements are handled per iteration
    132    ALIGNX      16, 7
    133 .quantloop:
        ; Load four vectors of input and keep a second copy of each: xmm0-xmm3
        ; become the magnitudes, xmm4-xmm7 become the sign masks.
    134    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    135    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    136    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    137    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
    138    movdqa      xmm0, xmm4
    139    movdqa      xmm1, xmm5
    140    movdqa      xmm2, xmm6
    141    movdqa      xmm3, xmm7
    142    psraw       xmm4, (WORD_BIT-1)      ; sign mask per word (presumably WORD_BIT=16 -- confirm)
    143    psraw       xmm5, (WORD_BIT-1)
    144    psraw       xmm6, (WORD_BIT-1)
    145    psraw       xmm7, (WORD_BIT-1)
    146    pxor        xmm0, xmm4              ; XOR with mask, then subtract mask = conditional negate
    147    pxor        xmm1, xmm5
    148    pxor        xmm2, xmm6
    149    pxor        xmm3, xmm7
    150    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    151    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    152    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    153    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
    154 
        ; pmulhuw keeps only the high 16 bits of each unsigned 16x16 product.
    155    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    156    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    157    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    158    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
    159    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    160    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    161    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    162    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
    163    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
    164    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    165    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    166    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]
    167 
    168    pxor        xmm0, xmm4              ; same mask again: put the original sign back
    169    pxor        xmm1, xmm5
    170    pxor        xmm2, xmm6
    171    pxor        xmm3, xmm7
    172    psubw       xmm0, xmm4
    173    psubw       xmm1, xmm5
    174    psubw       xmm2, xmm6
    175    psubw       xmm3, xmm7
        ; NOTE(review): these stores index coef_block with SIZEOF_DCTELEM while
        ; edi is advanced by SIZEOF_JCOEF below; presumably the two sizes are
        ; equal -- confirm.
    176    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    177    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    178    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    179    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
    180 
    181    add         esi, byte 32*SIZEOF_DCTELEM          ; next 32 input elements
    182    add         edx, byte 32*SIZEOF_DCTELEM          ; matching 32 entries of each divisor sub-table
    183    add         edi, byte 32*SIZEOF_JCOEF            ; next 32 output elements
    184    dec         eax
    185    jnz         near .quantloop         ; near (not short) jump form is requested explicitly
    186 
    187    pop         edi
    188    pop         esi
    189 ;   pop         edx                     ; need not be preserved
    190 ;   pop         ecx                     ; unused
    191 ;   pop         ebx                     ; unused
    192    pop         ebp
    193    ret
    194 
    195 ; For some reason, the OS X linker does not honor the request to align the
    196 ; segment unless we do this.
    197    align       32