jquantf-sse2.asm (5008B)
;
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Read DCTSIZE rows of eight samples each, subtract the 0x80 centre value
; from every sample, convert the signed results to single-precision floats
; and write them to the workspace.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; Arguments, as provided by COLLECT_ARGS:
;   r10  = JSAMPARRAY sample_data
;   r11d = JDIMENSION start_col
;   r12  = FAST_FLOAT *workspace
;
; Register roles inside the loop:
;   rsi  = pointer to the current pair of row pointers in sample_data
;   rax  = start_col, zero-extended from r11d
;   rdi  = write position in workspace (stored to with movaps, so it must
;          be 16-byte aligned)
;   rcx  = number of row pairs still to convert
;   xmm7 = 0x80 replicated in every byte
;
; NOTE(review): xmm7 is callee-saved in the Windows x64 ABI; this presumably
; relies on COLLECT_ARGS/UNCOLLECT_ARGS (jsimdext.inc) preserving it there --
; confirm against that macro before changing the register allocation.

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 3
    push        rbx                     ; rbx holds a row pointer in the loop

    ; Loop state.
    mov         rcx, DCTSIZE/2          ; two rows are handled per iteration
    mov         rdi, r12                ; rdi = workspace
    mov         eax, r11d               ; rax = start_col (upper half zeroed)
    mov         rsi, r10                ; rsi = sample_data

    ; Build the centring constant in a register instead of loading it.
    pcmpeqw     xmm7, xmm7              ; every word = 0xFFFF
    psllw       xmm7, 7                 ; every word = 0xFF80 (-128)
    packsswb    xmm7, xmm7              ; every byte = 0x80 (PB_CENTERJSAMPLE)

.next_row_pair:
    ; Fetch eight samples from each of the two rows.  Both loads are done
    ; before anything is written to the workspace.
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; first row of pair
    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; second row of pair
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

    ; ---- first row: samples 0-7 -----------------------------------------
    psubb       xmm0, xmm7              ; centre each byte around zero
    punpcklbw   xmm0, xmm0              ; duplicate each byte into a word
    punpcklwd   xmm2, xmm0              ; samples 0-3 land in the top byte of
                                        ;  each dword; the low word is stale
                                        ;  xmm2 data, discarded by the shift
    punpckhwd   xmm0, xmm0              ; samples 4-7 in the top byte of
                                        ;  each dword
    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; sign-extend samples 0-3
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; sign-extend samples 4-7
    cvtdq2ps    xmm2, xmm2              ; xmm2 = float(samples 0-3)
    cvtdq2ps    xmm0, xmm0              ; xmm0 = float(samples 4-7)

    ; ---- second row: samples 8-F ----------------------------------------
    psubb       xmm1, xmm7              ; centre each byte around zero
    punpcklbw   xmm1, xmm1              ; duplicate each byte into a word
    punpcklwd   xmm3, xmm1              ; samples 8-B in the top byte of each
                                        ;  dword (low word: stale xmm3 data)
    punpckhwd   xmm1, xmm1              ; samples C-F in the top byte of
                                        ;  each dword
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; sign-extend samples 8-B
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; sign-extend samples C-F
    cvtdq2ps    xmm3, xmm3              ; xmm3 = float(samples 8-B)
    cvtdq2ps    xmm1, xmm1              ; xmm1 = float(samples C-F)

    ; Write both converted rows to the workspace.
    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

    ; Step to the next two rows of input and output.
    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    add         rsi, byte 2*SIZEOF_JSAMPROW
    dec         rcx
    jnz         short .next_row_pair

    pop         rbx
    UNCOLLECT_ARGS 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Multiply every workspace value by the corresponding entry of the divisors
; table, convert the products to 32-bit integers (cvtps2dq, i.e. using the
; current MXCSR rounding mode), narrow them to 16 bits with signed
; saturation and store the results into coef_block.  DCTSIZE2 values are
; processed in total, 16 per loop iteration.
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; Arguments, as provided by COLLECT_ARGS:
;   r10 = JCOEFPTR coef_block
;   r11 = FAST_FLOAT *divisors
;   r12 = FAST_FLOAT *workspace
;
; Register roles inside the loop:
;   rsi = read position in workspace
;   rdx = read position in divisors
;   rdi = write position in coef_block
;   rax = number of 16-value groups still to process
;
; All three buffers are accessed with aligned instructions (movaps, movdqa
; and mulps with a memory operand), so each must be 16-byte aligned.
;
; NOTE(review): because the table is applied with mulps, its entries are
; presumably reciprocals of the quantization steps -- confirm with the code
; that fills the table.

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 3

    ; Loop state.
    mov         rax, DCTSIZE2/16        ; 16 values are handled per iteration
    mov         rdi, r10                ; rdi = coef_block
    mov         rdx, r11                ; rdx = divisors
    mov         rsi, r12                ; rsi = workspace

.next_16_values:
    ; Scale four vectors of four floats each.  All reads are done before
    ; anything is written to coef_block.
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

    ; First eight values: float -> int32 -> int16 with signed saturation.
    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    packssdw    xmm0, xmm1              ; xmm0 = eight 16-bit results

    ; Second eight values, same treatment.
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3
    packssdw    xmm2, xmm3              ; xmm2 = eight 16-bit results

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

    ; Advance all three pointers by 16 elements.
    add         rdi, byte 16*SIZEOF_JCOEF
    add         rdx, byte 16*SIZEOF_FAST_FLOAT
    add         rsi, byte 16*SIZEOF_FAST_FLOAT
    dec         rax
    jnz         short .next_16_values

    UNCOLLECT_ARGS 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32