tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jquanti-avx2.asm (5680B)


      1 ;
      2 ; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2018, 2024, D. R. Commander.
      6 ; Copyright (C) 2016, Matthieu Darbois.
      7 ; Copyright (C) 2018, Matthias Räncker.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 
     15 %include "jsimdext.inc"
     16 %include "jdct.inc"
     17 
     18 ; --------------------------------------------------------------------------
     19    SECTION     SEG_TEXT
     20    BITS        64
     21 ;
     22 ; Load data into workspace, applying unsigned->signed conversion
     23 ;
     24 ; GLOBAL(void)
     25 ; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
     26 ;                     DCTELEM *workspace);
     27 ;
     28 
     29 ; r10 = JSAMPARRAY sample_data
     30 ; r11d = JDIMENSION start_col
     31 ; r12 = DCTELEM *workspace
     32 
      33    align       32
      34    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
      35 
      36 EXTN(jsimd_convsamp_avx2):
      37    ENDBR64                 ; CET/IBT: mark a valid indirect-branch target
      38    push        rbp
      39    mov         rbp, rsp
      40    COLLECT_ARGS 3          ; macro (jsimdext.inc): make the 3 args available in r10/r11d/r12 per the map above
      41 
      42    mov         eax, r11d   ; rax = start_col (32-bit write zero-extends the upper half)
      43 
      44    mov         rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *) -- rsip/rdip: ptr-width aliases from jsimdext.inc
      45    mov         rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      46    movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]     ; low qword  = 8 samples of row 0
      47    pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1  ; high qword = 8 samples of row 1
      48 
      49    mov         rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      50    mov         rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      51    movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
      52    pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
      53 
      54    mov         rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      55    mov         rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      56    movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
      57    pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
      58 
      59    mov         rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      60    mov         rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      61    movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
      62    pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
      63 
      64    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
      65    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
      66    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
      67    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
      68 
      69    vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7 = all-ones (no memory constant needed)
      70    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
      71 
      72    vpaddw      ymm0, ymm0, ymm7        ; add -128 to every word: unsigned sample -> signed DCTELEM
      73    vpaddw      ymm1, ymm1, ymm7
      74    vpaddw      ymm2, ymm2, ymm7
      75    vpaddw      ymm3, ymm3, ymm7
      76 
      77    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0  ; workspace rows 0-1
      78    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1  ; workspace rows 2-3
      79    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2  ; workspace rows 4-5
      80    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3  ; workspace rows 6-7
      81 
      82    vzeroupper              ; clear upper YMM state to avoid AVX/SSE transition penalties in the caller
      83    UNCOLLECT_ARGS 3        ; macro: undo COLLECT_ARGS
      84    pop         rbp
      85    ret
     86 
     87 ; --------------------------------------------------------------------------
     88 ;
     89 ; Quantize/descale the coefficients, and store into coef_block
     90 ;
     91 ; This implementation is based on an algorithm described in
     92 ;   "Optimizing subroutines in assembly language:
     93 ;   An optimization guide for x86 platforms" (https://agner.org/optimize).
     94 ;
     95 ; GLOBAL(void)
     96 ; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
     97 ;                     DCTELEM *workspace);
     98 ;
     99 
    100 %define RECIPROCAL(m, n, b) \
    101  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)  ; divisor bank 0: 16-bit reciprocals (multiplied via vpmulhuw below)
    102 %define CORRECTION(m, n, b) \
    103  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)  ; divisor bank 1: pre-multiply correction + rounding term
    104 %define SCALE(m, n, b) \
    105  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)  ; divisor bank 2: post-multiply scale factors
    106 
    107 ; r10 = JCOEFPTR coef_block
    108 ; r11 = DCTELEM *divisors
    109 ; r12 = DCTELEM *workspace
    110 
    111    align       32
    112    GLOBAL_FUNCTION(jsimd_quantize_avx2)
    113 
    114 EXTN(jsimd_quantize_avx2):
    115    ENDBR64                  ; CET/IBT: mark a valid indirect-branch target
    116    push        rbp
    117    mov         rbp, rsp
    118    COLLECT_ARGS 3           ; macro (jsimdext.inc): make the 3 args available in r10/r11/r12 per the map above
    119 
    120    vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]  ; ymm4-ymm7 = workspace rows 0-7 (signed DCT coefficients)
    121    vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
    122    vmovdqu     ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
    123    vmovdqu     ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
    124    vpabsw      ymm0, ymm4               ; divide |coef| unsigned; original signs stay in ymm4-ymm7
    125    vpabsw      ymm1, ymm5
    126    vpabsw      ymm2, ymm6
    127    vpabsw      ymm3, ymm7
    128 
    129    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
    130    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
    131    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
    132    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
    133    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal (vpmulhuw keeps the high 16 bits of the product)
    134    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
    135    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
    136    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
    137    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale
    138    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
    139    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
    140    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
    141 
    142    vpsignw     ymm0, ymm0, ymm4         ; restore original coefficient signs (result is 0 where coef was 0)
    143    vpsignw     ymm1, ymm1, ymm5
    144    vpsignw     ymm2, ymm2, ymm6
    145    vpsignw     ymm3, ymm3, ymm7
    146 
    147    vmovdqu     [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0  ; coef_block rows 0-1
    148    vmovdqu     [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1  ; coef_block rows 2-3
    149    vmovdqu     [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2  ; coef_block rows 4-5
    150    vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3  ; coef_block rows 6-7
    151 
    152    vzeroupper               ; clear upper YMM state to avoid AVX/SSE transition penalties in the caller
    153    UNCOLLECT_ARGS 3         ; macro: undo COLLECT_ARGS
    154    pop         rbp
    155    ret
    156 
    157 ; For some reason, the OS X linker does not honor the request to align the
    158 ; segment unless we do this.
    159    align       32