tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcsample-avx2.asm (11811B)


      1 ;
      2 ; jcsample-avx2.asm - downsampling (AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2015, Intel Corporation.
      6 ; Copyright (C) 2016, 2024, D. R. Commander.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     13 
     14 %include "jsimdext.inc"
     15 
     16 ; --------------------------------------------------------------------------
        ; Everything below is emitted into the section named by SEG_TEXT
        ; (not defined in this file; presumably supplied by jsimdext.inc) and
        ; is assembled as 32-bit code.
     17    SECTION     SEG_TEXT
     18    BITS        32
     19 ;
     20 ; Downsample pixel values of a single component.
     21 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     22 ; without smoothing.
     23 ;
     24 ; GLOBAL(void)
     25 ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
     26 ;                            JDIMENSION v_samp_factor,
     27 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     28 ;                            JSAMPARRAY output_data);
     29 ;
        ; All arguments are read from the stack relative to ebp (see the
        ; %defines below).  The routine runs in two phases:
        ;
        ;   1. expand_right_edge: with output_cols = width_in_blocks * 8, each
        ;      of the first max_v_samp_factor input rows has its last real
        ;      sample (column image_width - 1) replicated into the
        ;      (2 * output_cols - image_width) bytes that follow it.  The phase
        ;      is skipped when that count or max_v_samp_factor is <= 0.
        ;   2. h2v1_downsample: for each of v_samp_factor rows, every horizontal
        ;      pair of input samples (a, b) becomes one output sample
        ;      (a + b + bias) >> 1, where bias alternates 0, 1, 0, 1, ... over
        ;      successive output columns.
        ;
        ; Registers: esi and edi are saved and restored; ebx is not touched;
        ; eax, ecx, edx, ymm0-ymm3, ymm6 and ymm7 are overwritten.
        ;
        ; Every column-loop iteration, including the final partial one, stores
        ; a full YMMWORD to the output row.
        ; NOTE(review): this presumably relies on the caller padding the rows
        ; to a multiple of SIZEOF_YMMWORD -- confirm against the allocator.
     30 
        ; Argument offsets.  After "push ebp / mov ebp, esp", [b] holds the
        ; saved ebp and [b + 4] the return address, so the first stack
        ; argument is at (b) + 8.
     31 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
     32 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
     33 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
     34 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
     35 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
     36 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
     37 
     38    align       32
     39    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
     40 
     41 EXTN(jsimd_h2v1_downsample_avx2):
     42    push        ebp
     43    mov         ebp, esp
     44 ;   push        ebx                     ; unused
     45 ;   push        ecx                     ; need not be preserved
     46 ;   push        edx                     ; need not be preserved
     47    push        esi                     ; restored at .return
     48    push        edi                     ; restored at .return
     49 
     50    mov         ecx, JDIMENSION [width_blks(ebp)]
     51    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
     52    jz          near .return            ; output_cols == 0: nothing to do
     53 
     54    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
     55 
     56    ; -- expand_right_edge
     57 
     58    push        ecx                     ; keep output_cols for the second phase
     59    shl         ecx, 1                  ; output_cols * 2
     60    sub         ecx, edx                ; ecx = number of columns to pad
     61    jle         short .expand_end       ; row is already wide enough
     62 
     63    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
     64    test        eax, eax
     65    jle         short .expand_end
     66 
     67    cld                                 ; rep stosb must move edi upward
     68    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
     69    ALIGNX      16, 7
     70 .expandloop:
     71    push        eax                     ; al is used as the fill value below
     72    push        ecx                     ; rep stosb counts ecx down to 0
     73 
     74    mov         edi, JSAMPROW [esi]     ; edi = start of the current row
     75    add         edi, edx                ; edi = first column past the image
     76    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
     77 
     78    rep stosb                           ; replicate it into the padding columns
     79 
     80    pop         ecx                     ; padding column count
     81    pop         eax                     ; rows left
     82 
     83    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
     84    dec         eax
     85    jg          short .expandloop
     86 
     87 .expand_end:
     88    pop         ecx                     ; output_cols
     89 
     90    ; -- h2v1_downsample
     91 
     92    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
     93    test        eax, eax
     94    jle         near .return
     95 
     96    mov         edx, 0x00010000         ; bias pattern
     97    vmovd       xmm7, edx
     98    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
     99    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    100    vpcmpeqw    ymm6, ymm6, ymm6        ; ymm6 = all ones
    101    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
    102 
    103    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    104    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    105    ALIGNX      16, 7
    106 .rowloop:
        ; Save output_cols and the two row-pointer cursors; ecx, esi and edi
        ; are reused as column counter and sample pointers inside the row.
    107    push        ecx
    108    push        edi
    109    push        esi
    110 
    111    mov         esi, JSAMPROW [esi]     ; inptr
    112    mov         edi, JSAMPROW [edi]     ; outptr
    113 
    114    cmp         ecx, byte SIZEOF_YMMWORD
    115    jae         short .columnloop       ; at least one full YMMWORD of output left
    116    ALIGNX      16, 7
    117 
        ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain.  Only
        ; the input bytes that exist are loaded (two per output column) and the
        ; rest of ymm0/ymm1 is zero; a VEX-encoded load into an xmm register
        ; clears the upper half of the matching ymm register.  ecx is then set
        ; to SIZEOF_YMMWORD so that the "sub" after the store leaves it at 0
        ; and the row terminates.
    118 .columnloop_r24:
    119    ; ecx can possibly be 8, 16, 24
    120    cmp         ecx, 24
    121    jne         .columnloop_r16
    122    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; one YMMWORD of input ...
    123    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; ... plus one XMMWORD
    124    mov         ecx, SIZEOF_YMMWORD
    125    jmp         short .downsample
    126 
    127 .columnloop_r16:
    128    cmp         ecx, 16
    129    jne         .columnloop_r8
    130    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; one YMMWORD of input
    131    vpxor       ymm1, ymm1, ymm1        ; no second half
    132    mov         ecx, SIZEOF_YMMWORD
    133    jmp         short .downsample
    134 
    135 .columnloop_r8:
    136    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]   ; one XMMWORD of input (offset 0)
    137    vpxor       ymm1, ymm1, ymm1        ; no second half
    138    mov         ecx, SIZEOF_YMMWORD
    139    jmp         short .downsample
    140    ALIGNX      16, 7
    141 
    142 .columnloop:
        ; Full iteration: two YMMWORDs of input produce one YMMWORD of output.
    143    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    144    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
    145 
    146 .downsample:
        ; Treat each 16-bit word as one horizontal sample pair: the low byte is
        ; the even-indexed sample and the high byte the odd-indexed one.
    147    vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2 = odd samples, first half
    148    vpand       ymm0, ymm0, ymm6        ; ymm0 = even samples, first half
    149    vpsrlw      ymm3, ymm1, BYTE_BIT    ; ymm3 = odd samples, second half
    150    vpand       ymm1, ymm1, ymm6        ; ymm1 = even samples, second half
    151 
    152    vpaddw      ymm0, ymm0, ymm2        ; pair sums, first half
    153    vpaddw      ymm1, ymm1, ymm3        ; pair sums, second half
    154    vpaddw      ymm0, ymm0, ymm7        ; add the alternating 0/1 bias
    155    vpaddw      ymm1, ymm1, ymm7
    156    vpsrlw      ymm0, ymm0, 1           ; (a + b + bias) >> 1
    157    vpsrlw      ymm1, ymm1, 1
    158 
        ; vpackuswb narrows words to bytes separately within each 128-bit
        ; lane, which interleaves the halves of ymm0 and ymm1; vpermq 0xd8
        ; swaps the two middle quadwords to restore column order.
    159    vpackuswb   ymm0, ymm0, ymm1
    160    vpermq      ymm0, ymm0, 0xd8
    161 
    162    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; always a full-width store
    163 
    164    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
    165    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
    166    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    167    cmp         ecx, byte SIZEOF_YMMWORD
    168    jae         short .columnloop           ; another full iteration
    169    test        ecx, ecx
    170    jnz         near .columnloop_r24        ; partial tail remains
    171 
    172    pop         esi                     ; input_data row cursor
    173    pop         edi                     ; output_data row cursor
    174    pop         ecx                     ; output_cols
    175 
    176    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    177    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    178    dec         eax                        ; rowctr
    179    jg          near .rowloop
    180 
    181 .return:
    182    vzeroupper                          ; clear upper ymm halves before returning
    183    pop         edi
    184    pop         esi
    185 ;   pop         edx                     ; need not be preserved
    186 ;   pop         ecx                     ; need not be preserved
    187 ;   pop         ebx                     ; unused
    188    pop         ebp
    189    ret
    190 
    191 ; --------------------------------------------------------------------------
    192 ;
    193 ; Downsample pixel values of a single component.
    194 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    195 ; without smoothing.
    196 ;
    197 ; GLOBAL(void)
    198 ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
    199 ;                            JDIMENSION v_samp_factor,
    200 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    201 ;                            JSAMPARRAY output_data);
    202 ;
        ; All arguments are read from the stack relative to ebp (see the
        ; %defines below).  The routine runs in two phases:
        ;
        ;   1. expand_right_edge: with output_cols = width_in_blocks * 8, each
        ;      of the first max_v_samp_factor input rows has its last real
        ;      sample (column image_width - 1) replicated into the
        ;      (2 * output_cols - image_width) bytes that follow it.  The phase
        ;      is skipped when that count or max_v_samp_factor is <= 0.
        ;   2. h2v2_downsample: for each of v_samp_factor output rows, two
        ;      consecutive input rows are read and every 2x2 group of samples
        ;      (a, b / c, d) becomes one output sample
        ;      (a + b + c + d + bias) >> 2, where bias alternates
        ;      1, 2, 1, 2, ... over successive output columns.  The input row
        ;      cursor advances by two rows per output row.
        ;
        ; Registers: esi and edi are saved and restored; ebx is not touched;
        ; eax, ecx, edx and ymm0-ymm7 are overwritten.
        ;
        ; Every column-loop iteration, including the final partial one, stores
        ; a full YMMWORD to the output row.
        ; NOTE(review): this presumably relies on the caller padding the rows
        ; to a multiple of SIZEOF_YMMWORD -- confirm against the allocator.
    203 
        ; Argument offsets.  After "push ebp / mov ebp, esp", [b] holds the
        ; saved ebp and [b + 4] the return address, so the first stack
        ; argument is at (b) + 8.
    204 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
    205 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
    206 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
    207 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
    208 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
    209 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
    210 
    211    align       32
    212    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
    213 
    214 EXTN(jsimd_h2v2_downsample_avx2):
    215    push        ebp
    216    mov         ebp, esp
    217 ;   push        ebx                     ; unused
    218 ;   push        ecx                     ; need not be preserved
    219 ;   push        edx                     ; need not be preserved
    220    push        esi                     ; restored at .return
    221    push        edi                     ; restored at .return
    222 
    223    mov         ecx, JDIMENSION [width_blks(ebp)]
    224    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    225    jz          near .return            ; output_cols == 0: nothing to do
    226 
    227    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
    228 
    229    ; -- expand_right_edge
    230 
    231    push        ecx                     ; keep output_cols for the second phase
    232    shl         ecx, 1                  ; output_cols * 2
    233    sub         ecx, edx                ; ecx = number of columns to pad
    234    jle         short .expand_end       ; row is already wide enough
    235 
    236    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    237    test        eax, eax
    238    jle         short .expand_end
    239 
    240    cld                                 ; rep stosb must move edi upward
    241    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    242    ALIGNX      16, 7
    243 .expandloop:
    244    push        eax                     ; al is used as the fill value below
    245    push        ecx                     ; rep stosb counts ecx down to 0
    246 
    247    mov         edi, JSAMPROW [esi]     ; edi = start of the current row
    248    add         edi, edx                ; edi = first column past the image
    249    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
    250 
    251    rep stosb                           ; replicate it into the padding columns
    252 
    253    pop         ecx                     ; padding column count
    254    pop         eax                     ; rows left
    255 
    256    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    257    dec         eax
    258    jg          short .expandloop
    259 
    260 .expand_end:
    261    pop         ecx                     ; output_cols
    262 
    263    ; -- h2v2_downsample
    264 
    265    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    266    test        eax, eax
    267    jle         near .return
    268 
    269    mov         edx, 0x00020001         ; bias pattern
    270    vmovd       xmm7, edx
    271    vpcmpeqw    ymm6, ymm6, ymm6        ; ymm6 = all ones
    272    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    273    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    274    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
    275 
    276    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    277    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    278    ALIGNX      16, 7
    279 .rowloop:
        ; Save output_cols and the two row-pointer cursors; ecx, esi and edi
        ; are reused as column counter and sample pointers inside the row.
    280    push        ecx
    281    push        edi
    282    push        esi
    283 
    284    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    285    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    286    mov         edi, JSAMPROW [edi]                    ; outptr
    287 
    288    cmp         ecx, byte SIZEOF_YMMWORD
    289    jae         short .columnloop       ; at least one full YMMWORD of output left
    290    ALIGNX      16, 7
    291 
        ; Tail handling: ecx is below SIZEOF_YMMWORD here (the 24 / 16 / other
        ; cases are handled in turn).  Only the input bytes that exist are
        ; loaded from each of the two rows and the rest of ymm0-ymm3 is zero; a
        ; VEX-encoded load into an xmm register clears the upper half of the
        ; matching ymm register.  ecx is then set to SIZEOF_YMMWORD so that the
        ; "sub" after the store leaves it at 0 and the row terminates.
    292 .columnloop_r24:
    293    cmp         ecx, 24
    294    jne         .columnloop_r16
    295    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; row 0, first half
    296    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; row 1, first half
    297    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; row 0, one more XMMWORD
    298    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; row 1, one more XMMWORD
    299    mov         ecx, SIZEOF_YMMWORD
    300    jmp         short .downsample
    301 
    302 .columnloop_r16:
    303    cmp         ecx, 16
    304    jne         .columnloop_r8
    305    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; row 0, first half
    306    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; row 1, first half
    307    vpxor       ymm2, ymm2, ymm2        ; no second half
    308    vpxor       ymm3, ymm3, ymm3
    309    mov         ecx, SIZEOF_YMMWORD
    310    jmp         short .downsample
    311 
    312 .columnloop_r8:
    313    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; row 0, one XMMWORD
    314    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; row 1, one XMMWORD
    315    vpxor       ymm2, ymm2, ymm2        ; no second half
    316    vpxor       ymm3, ymm3, ymm3
    317    mov         ecx, SIZEOF_YMMWORD
    318    jmp         short .downsample
    319    ALIGNX      16, 7
    320 
    321 .columnloop:
        ; Full iteration: two YMMWORDs from each input row produce one
        ; YMMWORD of output.
    322    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    323    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    324    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    325    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
    326 
    327 .downsample:
        ; Treat each 16-bit word as one horizontal sample pair: the low byte is
        ; the even-indexed sample and the high byte the odd-indexed one.
    328    vpand       ymm4, ymm0, ymm6        ; row 0 even samples, first half
    329    vpsrlw      ymm0, ymm0, BYTE_BIT    ; row 0 odd samples, first half
    330    vpand       ymm5, ymm1, ymm6        ; row 1 even samples, first half
    331    vpsrlw      ymm1, ymm1, BYTE_BIT    ; row 1 odd samples, first half
    332    vpaddw      ymm0, ymm0, ymm4        ; row 0 pair sums, first half
    333    vpaddw      ymm1, ymm1, ymm5        ; row 1 pair sums, first half
    334 
    335    vpand       ymm4, ymm2, ymm6        ; row 0 even samples, second half
    336    vpsrlw      ymm2, ymm2, BYTE_BIT    ; row 0 odd samples, second half
    337    vpand       ymm5, ymm3, ymm6        ; row 1 even samples, second half
    338    vpsrlw      ymm3, ymm3, BYTE_BIT    ; row 1 odd samples, second half
    339    vpaddw      ymm2, ymm2, ymm4        ; row 0 pair sums, second half
    340    vpaddw      ymm3, ymm3, ymm5        ; row 1 pair sums, second half
    341 
    342    vpaddw      ymm0, ymm0, ymm1        ; 2x2 sums, first half
    343    vpaddw      ymm2, ymm2, ymm3        ; 2x2 sums, second half
    344    vpaddw      ymm0, ymm0, ymm7        ; add the alternating 1/2 bias
    345    vpaddw      ymm2, ymm2, ymm7
    346    vpsrlw      ymm0, ymm0, 2           ; (a + b + c + d + bias) >> 2
    347    vpsrlw      ymm2, ymm2, 2
    348 
        ; vpackuswb narrows words to bytes separately within each 128-bit
        ; lane, which interleaves the halves of ymm0 and ymm2; vpermq 0xd8
        ; swaps the two middle quadwords to restore column order.
    349    vpackuswb   ymm0, ymm0, ymm2
    350    vpermq      ymm0, ymm0, 0xd8
    351 
    352    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; always a full-width store
    353 
    354    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
    355    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
    356    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
    357    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    358    cmp         ecx, byte SIZEOF_YMMWORD
    359    jae         near .columnloop            ; another full iteration
    360    test        ecx, ecx
    361    jnz         near .columnloop_r24        ; partial tail remains
    362 
    363    pop         esi                     ; input_data row cursor
    364    pop         edi                     ; output_data row cursor
    365    pop         ecx                     ; output_cols
    366 
    367    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    368    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    369    dec         eax                          ; rowctr
    370    jg          near .rowloop
    371 
    372 .return:
    373    vzeroupper                          ; clear upper ymm halves before returning
    374    pop         edi
    375    pop         esi
    376 ;   pop         edx                     ; need not be preserved
    377 ;   pop         ecx                     ; need not be preserved
    378 ;   pop         ebx                     ; unused
    379    pop         ebp
    380    ret
    381 
    382 ; For some reason, the OS X linker does not honor the request to align the
    383 ; segment unless we do this.  (Pads the end of this section to a 32-byte
        ; boundary.)
    384    align       32