tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcsample-avx2.asm (10314B)


      1 ;
      2 ; jcsample.asm - downsampling (64-bit AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      6 ; Copyright (C) 2015, Intel Corporation.
      7 ; Copyright (C) 2018, Matthias Räncker.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 
     15 %include "jsimdext.inc"
     16 
     17 ; --------------------------------------------------------------------------
     18    SECTION     SEG_TEXT
     19    BITS        64
     20 ;
     21 ; Downsample pixel values of a single component.
     22 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     23 ; without smoothing.
     24 ;
     25 ; GLOBAL(void)
     26 ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
     27 ;                            JDIMENSION v_samp_factor,
     28 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     29 ;                            JSAMPARRAY output_data);
     30 ;
     31 ; Registers the code below reads the arguments from (after COLLECT_ARGS):
     32 ; r10d = JDIMENSION image_width
     33 ; r11d = int max_v_samp_factor
     34 ; r12d = JDIMENSION v_samp_factor
     35 ; r13d = JDIMENSION width_in_blocks
     36 ; r14 = JSAMPARRAY input_data
     37 ; r15 = JSAMPARRAY output_data
     38 ; Every 32-bit argument is read through its 32-bit register name only.
     39    align       32
     40    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
     41 
     42 EXTN(jsimd_h2v1_downsample_avx2):
     43    ENDBR64
     44    push        rbp
     45    mov         rbp, rsp
     46    COLLECT_ARGS 6
     47 
     48    mov         ecx, r13d
     49    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
     50    jz          near .return            ; zero blocks wide: nothing to do
     51 
     52    mov         edx, r10d               ; rdx = image_width (zero-extended)
     53 
     54    ; -- expand_right_edge
     55    ; Replicate the last sample of each input row out to output_cols * 2 bytes.
     56    push        rcx                     ; save output_cols
     57    shl         rcx, 1                  ; output_cols * 2
     58    sub         rcx, rdx                ; rcx = number of padding bytes per row
     59    jle         short .expand_end       ; rows are already wide enough
     60 
     61    mov         eax, r11d               ; rax = max_v_samp_factor (row count);
     62    test        eax, eax                ; int arg: only its low 32 bits are defined
     63    jle         short .expand_end
     64 
     65    cld                                 ; make rep stosb advance upward
     66    mov         rsi, r14                ; input_data
     67 .expandloop:
     68    push        rax                     ; al is reused as the fill byte below
     69    push        rcx                     ; rep stosb consumes rcx
     70 
     71    mov         rdip, JSAMPROW [rsi]    ; rdi = start of this input row
     72    add         rdi, rdx                ; rdi = first byte past image_width
     73    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
     74 
     75    rep stosb                           ; store al into the rcx padding bytes
     76 
     77    pop         rcx
     78    pop         rax
     79 
     80    add         rsi, byte SIZEOF_JSAMPROW
     81    dec         rax
     82    jg          short .expandloop
     83 
     84 .expand_end:
     85    pop         rcx                     ; output_cols
     86 
     87    ; -- h2v1_downsample
     88 
     89    mov         eax, r12d               ; rowctr
     90    test        eax, eax
     91    jle         near .return
     92 
     93    mov         rdx, 0x00010000         ; bias pattern
     94    vmovd       xmm7, edx
     95    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
     96    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
     97    vpcmpeqw    ymm6, ymm6, ymm6
     98    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
     99 
    100    mov         rsi, r14                ; input_data
    101    mov         rdi, r15                ; output_data
    102 .rowloop:
    103    push        rcx                     ; keep output_cols for the next row
    104    push        rdi                     ; keep the output row-pointer cursor
    105    push        rsi                     ; keep the input row-pointer cursor
    106 
    107    mov         rsip, JSAMPROW [rsi]    ; inptr
    108    mov         rdip, JSAMPROW [rdi]    ; outptr
    109 
    110    cmp         rcx, byte SIZEOF_YMMWORD
    111    jae         short .columnloop
    112 
    113 .columnloop_r24:
    114    ; rcx can possibly be 8, 16, 24
    115    cmp         rcx, 24
    116    jne         .columnloop_r16
    117    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    118    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    119    mov         rcx, SIZEOF_YMMWORD     ; so the subtraction below leaves rcx = 0
    120    jmp         short .downsample
    121 
    122 .columnloop_r16:
    123    cmp         rcx, 16
    124    jne         .columnloop_r8
    125    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    126    vpxor       ymm1, ymm1, ymm1        ; no second input vector in this tail
    127    mov         rcx, SIZEOF_YMMWORD     ; so the subtraction below leaves rcx = 0
    128    jmp         short .downsample
    129 
    130 .columnloop_r8:
    131    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
    132    vpxor       ymm1, ymm1, ymm1        ; no second input vector in this tail
    133    mov         rcx, SIZEOF_YMMWORD     ; so the subtraction below leaves rcx = 0
    134    jmp         short .downsample
    135 
    136 .columnloop:
    137    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    138    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    139 
    140 .downsample:
    141    vpsrlw      ymm2, ymm0, BYTE_BIT    ; odd-numbered samples, widened to words
    142    vpand       ymm0, ymm0, ymm6        ; even-numbered samples, widened to words
    143    vpsrlw      ymm3, ymm1, BYTE_BIT
    144    vpand       ymm1, ymm1, ymm6
    145 
    146    vpaddw      ymm0, ymm0, ymm2        ; sum of each horizontal sample pair
    147    vpaddw      ymm1, ymm1, ymm3
    148    vpaddw      ymm0, ymm0, ymm7        ; add the alternating 0/1 bias
    149    vpaddw      ymm1, ymm1, ymm7
    150    vpsrlw      ymm0, ymm0, 1           ; divide by 2
    151    vpsrlw      ymm1, ymm1, 1
    152 
    153    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed per 128-bit lane
    154    vpermq      ymm0, ymm0, 0xd8        ; quadword order 0,2,1,3 undoes the lane split
    155    ; NOTE(review): a full YMMWORD is stored even for an 8/16/24-column tail; assumes padded output rows - confirm
    156    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    157 
    158    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    159    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    160    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    161    cmp         rcx, byte SIZEOF_YMMWORD
    162    jae         short .columnloop
    163    test        rcx, rcx
    164    jnz         near .columnloop_r24        ; 8, 16 or 24 columns remain
    165 
    166    pop         rsi
    167    pop         rdi
    168    pop         rcx
    169 
    170    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    171    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    172    dec         rax                        ; rowctr
    173    jg          near .rowloop
    174 
    175 .return:
    176    vzeroupper
    177    UNCOLLECT_ARGS 6
    178    pop         rbp
    179    ret
    180 
    181 ; --------------------------------------------------------------------------
    182 ;
    183 ; Downsample pixel values of a single component.
    184 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    185 ; without smoothing.
    186 ;
    187 ; GLOBAL(void)
    188 ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
    189 ;                            JDIMENSION v_samp_factor,
    190 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    191 ;                            JSAMPARRAY output_data);
    192 ;
    193 ; Registers the code below reads the arguments from (after COLLECT_ARGS):
    194 ; r10d = JDIMENSION image_width
    195 ; r11d = int max_v_samp_factor
    196 ; r12d = JDIMENSION v_samp_factor
    197 ; r13d = JDIMENSION width_in_blocks
    198 ; r14 = JSAMPARRAY input_data
    199 ; r15 = JSAMPARRAY output_data
    200 ; Every 32-bit argument is read through its 32-bit register name only.
    201    align       32
    202    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
    203 
    204 EXTN(jsimd_h2v2_downsample_avx2):
    205    ENDBR64
    206    push        rbp
    207    mov         rbp, rsp
    208    COLLECT_ARGS 6
    209 
    210    mov         ecx, r13d
    211    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    212    jz          near .return            ; zero blocks wide: nothing to do
    213 
    214    mov         edx, r10d               ; rdx = image_width (zero-extended)
    215 
    216    ; -- expand_right_edge
    217    ; Replicate the last sample of each input row out to output_cols * 2 bytes.
    218    push        rcx                     ; save output_cols
    219    shl         rcx, 1                  ; output_cols * 2
    220    sub         rcx, rdx                ; rcx = number of padding bytes per row
    221    jle         short .expand_end       ; rows are already wide enough
    222 
    223    mov         eax, r11d               ; rax = max_v_samp_factor (row count);
    224    test        eax, eax                ; int arg: only its low 32 bits are defined
    225    jle         short .expand_end
    226 
    227    cld                                 ; make rep stosb advance upward
    228    mov         rsi, r14                ; input_data
    229 .expandloop:
    230    push        rax                     ; al is reused as the fill byte below
    231    push        rcx                     ; rep stosb consumes rcx
    232 
    233    mov         rdip, JSAMPROW [rsi]    ; rdi = start of this input row
    234    add         rdi, rdx                ; rdi = first byte past image_width
    235    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
    236 
    237    rep stosb                           ; store al into the rcx padding bytes
    238 
    239    pop         rcx
    240    pop         rax
    241 
    242    add         rsi, byte SIZEOF_JSAMPROW
    243    dec         rax
    244    jg          short .expandloop
    245 
    246 .expand_end:
    247    pop         rcx                     ; output_cols
    248 
    249    ; -- h2v2_downsample
    250 
    251    mov         eax, r12d               ; rowctr
    252    test        eax, eax                ; 32-bit test, as in the h2v1 routine
    253    jle         near .return
    254 
    255    mov         rdx, 0x00020001         ; bias pattern
    256    vmovd       xmm7, edx
    257    vpcmpeqw    ymm6, ymm6, ymm6
    258    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    259    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    260    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
    261 
    262    mov         rsi, r14                ; input_data
    263    mov         rdi, r15                ; output_data
    264 .rowloop:
    265    push        rcx                     ; keep output_cols for the next row pair
    266    push        rdi                     ; keep the output row-pointer cursor
    267    push        rsi                     ; keep the input row-pointer cursor
    268 
    269    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    270    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    271    mov         rdip, JSAMPROW [rdi]                    ; outptr
    272 
    273    cmp         rcx, byte SIZEOF_YMMWORD
    274    jae         short .columnloop
    275 
    276 .columnloop_r24:                       ; tail: rcx is 8, 16 or 24 here
    277    cmp         rcx, 24
    278    jne         .columnloop_r16
    279    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    280    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    281    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
    282    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    283    mov         rcx, SIZEOF_YMMWORD     ; so the subtraction below leaves rcx = 0
    284    jmp         short .downsample
    285 
    286 .columnloop_r16:
    287    cmp         rcx, 16
    288    jne         .columnloop_r8
    289    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    290    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    291    vpxor       ymm2, ymm2, ymm2        ; no second input vector in this tail
    292    vpxor       ymm3, ymm3, ymm3
    293    mov         rcx, SIZEOF_YMMWORD     ; so the subtraction below leaves rcx = 0
    294    jmp         short .downsample
    295 
    296 .columnloop_r8:
    297    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    298    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    299    vpxor       ymm2, ymm2, ymm2        ; no second input vector in this tail
    300    vpxor       ymm3, ymm3, ymm3
    301    mov         rcx, SIZEOF_YMMWORD     ; so the subtraction below leaves rcx = 0
    302    jmp         short .downsample
    303 
    304 .columnloop:
    305    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    306    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    307    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    308    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    309 
    310 .downsample:
    311    vpand       ymm4, ymm0, ymm6        ; row 0: even-numbered samples as words
    312    vpsrlw      ymm0, ymm0, BYTE_BIT    ; row 0: odd-numbered samples as words
    313    vpand       ymm5, ymm1, ymm6        ; row 1: even-numbered samples as words
    314    vpsrlw      ymm1, ymm1, BYTE_BIT    ; row 1: odd-numbered samples as words
    315    vpaddw      ymm0, ymm0, ymm4        ; row 0 horizontal pair sums
    316    vpaddw      ymm1, ymm1, ymm5        ; row 1 horizontal pair sums
    317 
    318    vpand       ymm4, ymm2, ymm6        ; same again for the second input vectors
    319    vpsrlw      ymm2, ymm2, BYTE_BIT
    320    vpand       ymm5, ymm3, ymm6
    321    vpsrlw      ymm3, ymm3, BYTE_BIT
    322    vpaddw      ymm2, ymm2, ymm4
    323    vpaddw      ymm3, ymm3, ymm5
    324 
    325    vpaddw      ymm0, ymm0, ymm1        ; sum of each 2x2 sample group
    326    vpaddw      ymm2, ymm2, ymm3
    327    vpaddw      ymm0, ymm0, ymm7        ; add the alternating 1/2 bias
    328    vpaddw      ymm2, ymm2, ymm7
    329    vpsrlw      ymm0, ymm0, 2           ; divide by 4
    330    vpsrlw      ymm2, ymm2, 2
    331 
    332    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed per 128-bit lane
    333    vpermq      ymm0, ymm0, 0xd8        ; quadword order 0,2,1,3 undoes the lane split
    334    ; NOTE(review): a full YMMWORD is stored even for an 8/16/24-column tail; assumes padded output rows - confirm
    335    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    336 
    337    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    338    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    339    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    340    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    341    cmp         rcx, byte SIZEOF_YMMWORD
    342    jae         near .columnloop
    343    test        rcx, rcx
    344    jnz         near .columnloop_r24        ; 8, 16 or 24 columns remain
    345 
    346    pop         rsi
    347    pop         rdi
    348    pop         rcx
    349 
    350    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    351    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    352    dec         rax                          ; rowctr
    353    jg          near .rowloop
    354 
    355 .return:
    356    vzeroupper
    357    UNCOLLECT_ARGS 6
    358    pop         rbp
    359    ret
    360 
    361 ; For some reason, the OS X linker does not honor the request to align the
    362 ; segment unless we do this.
    363    align       32