tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jquanti-avx2.asm (6697B)


      1 ;
      2 ; jquanti.asm - sample data conversion and quantization (AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2018, 2024, D. R. Commander.
      6 ; Copyright (C) 2016, Matthieu Darbois.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     13 
     14 %include "jsimdext.inc"
     15 %include "jdct.inc"
     16 
     17 ; --------------------------------------------------------------------------
     18    SECTION     SEG_TEXT
     19    BITS        32
     20 ;
     21 ; Load data into workspace, applying unsigned->signed conversion
     22 ;
     23 ; GLOBAL(void)
     24 ; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
     25 ;                     DCTELEM *workspace);
     26 ;
     27 
        ; NOTE(review): 32-bit (BITS 32 above) cdecl entry point -- all three
        ; arguments are read off the stack via ebp.  ebx/esi/edi are
        ; callee-saved and preserved below; eax/ecx/edx are used as scratch.
     28 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
     29 %define start_col    ebp + 12           ; JDIMENSION start_col
     30 %define workspace    ebp + 16           ; DCTELEM *workspace
     31 
     32    align       32
     33    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
     34 
     35 EXTN(jsimd_convsamp_avx2):
     36    push        ebp
     37    mov         ebp, esp
     38    push        ebx
     39 ;   push        ecx                     ; need not be preserved
     40 ;   push        edx                     ; need not be preserved
     41    push        esi
     42    push        edi
     43 
     44    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     45    mov         eax, JDIMENSION [start_col]
     46    mov         edi, POINTER [workspace]       ; (DCTELEM *)
     47 
        ; Fetch the 8 row pointers and load one 8-byte row of samples per
        ; xmm register, starting at column start_col (eax).
     48    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     49    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     50    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     51    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     52 
     53    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     54    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     55    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     56    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     57 
     58    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     59    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     60    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     61    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     62 
     63    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     64    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     65    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     66    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     67 
        ; Pack two rows per ymm register: row n stays in lane 0, row n+1 is
        ; inserted into lane 1.
     68    vinserti128 ymm0, ymm0, xmm1, 1
     69    vinserti128 ymm2, ymm2, xmm3, 1
     70    vinserti128 ymm4, ymm4, xmm5, 1
     71    vinserti128 ymm6, ymm6, xmm7, 1
     72 
        ; Zero-extend each byte sample to a 16-bit word by interleaving the
        ; low 8 bytes of each lane with zeros (ymm1).
     73    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
     74    vpunpcklbw  ymm0, ymm0, ymm1
     75    vpunpcklbw  ymm2, ymm2, ymm1
     76    vpunpcklbw  ymm4, ymm4, ymm1
     77    vpunpcklbw  ymm6, ymm6, ymm1
     78 
     79    vpcmpeqw    ymm7, ymm7, ymm7
     80    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
     81 
        ; Adding 0xFF80 (-128) to each zero-extended sample performs the
        ; unsigned->signed level shift (presumably -CENTERJSAMPLE, defined in
        ; jdct.inc -- TODO confirm).
     82    vpaddw      ymm0, ymm0, ymm7
     83    vpaddw      ymm2, ymm2, ymm7
     84    vpaddw      ymm4, ymm4, ymm7
     85    vpaddw      ymm6, ymm6, ymm7
     86 
        ; Store the 8 converted rows (two rows per 32-byte store) to the
        ; DCTELEM workspace.
     87    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
     88    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
     89    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
     90    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
     91 
        ; Clear upper ymm state before returning to (possibly SSE) caller
        ; code, then restore the callee-saved registers in reverse order.
     92    vzeroupper
     93    pop         edi
     94    pop         esi
     95 ;   pop         edx                     ; need not be preserved
     96 ;   pop         ecx                     ; need not be preserved
     97    pop         ebx
     98    pop         ebp
     99    ret
    100 
    101 ; --------------------------------------------------------------------------
    102 ;
    103 ; Quantize/descale the coefficients, and store into coef_block
    104 ;
    105 ; This implementation is based on an algorithm described in
    106 ;   "Optimizing subroutines in assembly language:
    107 ;   An optimization guide for x86 platforms" (https://agner.org/optimize).
    108 ;
    109 ; GLOBAL(void)
    110 ; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
    111 ;                     DCTELEM *workspace);
    112 ;
    113 
        ; The divisors table is laid out as three consecutive 64-element
        ; DCTELEM blocks (offsets DCTSIZE * 0/1/2 below):
        ;   reciprocal, then correction, then scale.
    114 %define RECIPROCAL(m, n, b) \
    115  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
    116 %define CORRECTION(m, n, b) \
    117  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
    118 %define SCALE(m, n, b) \
    119  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
    120 
    121 %define coef_block  ebp + 8             ; JCOEFPTR coef_block
    122 %define divisors    ebp + 12            ; DCTELEM *divisors
    123 %define workspace   ebp + 16            ; DCTELEM *workspace
    124 
    125    align       32
    126    GLOBAL_FUNCTION(jsimd_quantize_avx2)
    127 
    128 EXTN(jsimd_quantize_avx2):
    129    push        ebp
    130    mov         ebp, esp
    131 ;   push        ebx                     ; unused
    132 ;   push        ecx                     ; unused
    133 ;   push        edx                     ; need not be preserved
    134    push        esi
    135    push        edi
    136 
    137    mov         esi, POINTER [workspace]
    138    mov         edx, POINTER [divisors]
    139    mov         edi, JCOEFPTR [coef_block]
    140 
        ; Load all 64 workspace coefficients (two 8-word rows per ymm;
        ; originals kept in ymm4..ymm7) and take absolute values so a single
        ; unsigned multiply-high path serves both signs.
    141    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    142    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    143    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    144    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
    145    vpabsw      ymm0, ymm4
    146    vpabsw      ymm1, ymm5
    147    vpabsw      ymm2, ymm6
    148    vpabsw      ymm3, ymm7
    149 
        ; Per-word pipeline: (|x| + correction), then two unsigned
        ; multiply-highs by reciprocal and scale -- division by the
        ; quantization step via reciprocal multiplication (the scheme cited
        ; in this function's header comment).
    150    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    151    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
    152    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
    153    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
    154    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    155    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    156    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    157    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
    158    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
    159    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
    160    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
    161    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
    162 
        ; Reapply the sign of the original coefficients, which are still
        ; held unmodified in ymm4..ymm7.
    163    vpsignw     ymm0, ymm0, ymm4
    164    vpsignw     ymm1, ymm1, ymm5
    165    vpsignw     ymm2, ymm2, ymm6
    166    vpsignw     ymm3, ymm3, ymm7
    167 
        ; Write the quantized coefficients out to coef_block.
    168    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    169    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    170    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    171    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
    172 
        ; Clear upper ymm state before returning to (possibly SSE) caller
        ; code, then restore the callee-saved registers in reverse order.
    173    vzeroupper
    174    pop         edi
    175    pop         esi
    176 ;   pop         edx                     ; need not be preserved
    177 ;   pop         ecx                     ; unused
    178 ;   pop         ebx                     ; unused
    179    pop         ebp
    180    ret
    181 
    182 ; For some reason, the OS X linker does not honor the request to align the
    183 ; segment unless we do this.
    184    align       32