jquantf-sse2.asm (5975B)
;
; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; Syntax/target: Intel (NASM) operand order, 32-bit code (BITS 32).  Both
; functions below take their arguments on the stack and address them through
; ebp after a standard "push ebp / mov ebp, esp" prologue.
;
; Type names, SIZEOF_* constants, DCTSIZE/DCTSIZE2, and the GLOBAL_FUNCTION,
; EXTN, ALIGNX and XMMBLOCK macros are not defined in this file; they are
; expected to come from the two include files below.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; Each loop iteration reads 8 samples from each of two consecutive rows
; (starting at column start_col), subtracts 0x80 from every sample byte,
; sign-extends the results to 32-bit integers, converts them to single
; precision floats, and stores them to workspace.  The loop runs DCTSIZE/2
; times, so DCTSIZE rows are processed in total.
;
; Register roles:
;   esi  = current position in the sample_data row-pointer array
;   eax  = start_col (column offset applied to every row pointer)
;   edi  = current output position in workspace
;   ecx  = remaining loop iterations (two rows per iteration)
;   ebx  = pointer to the first row of the current pair
;   edx  = pointer to the second row of the current pair
;   xmm7 = 0x80 in every byte (centering constant)
;
; Saved/restored: ebp, ebx, esi, edi.
; Modified and not restored: eax, ecx, edx, xmm0-xmm3, xmm7, flags.
; The stores use movaps, so every computed workspace address must be
; 16-byte aligned.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Build the centering constant without a memory load:
    ; all-ones words -> 0xFF80 words -> saturating pack gives 0x80 bytes.
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2                 ; two rows per iteration
    ALIGNX      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    ; Load 8 samples from each of the two rows.
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Subtract 0x80 from each byte: unsigned sample -> signed sample.
    psubb       xmm0, xmm7              ; xmm0=(01234567)
    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)

    ; Widen so that each sample ends up in the most significant byte of its
    ; own dword ('*' marks a byte whose value does not matter).
    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)

    ; xmm2/xmm3 are used as destinations without being initialized: their
    ; previous contents only land in the low word of each dword, which the
    ; arithmetic shifts below are intended to discard.
    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)

    ; Arithmetic right shift brings the sample byte down with sign extension
    ; (assumes DWORD_BIT-BYTE_BIT is 24 -- defined in the include files),
    ; then the signed dwords are converted to floats.
    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)

    ; Store 2 rows x 8 floats.  XMMBLOCK is defined in the include files;
    ; presumably XMMBLOCK(row, n, base, size) addresses the n-th 16-byte
    ; group of the given row -- confirm against jsimdext.inc.
    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

    add         esi, byte 2*SIZEOF_JSAMPROW            ; next pair of rows
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; two output rows
    dec         ecx
    jnz         short .convloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; Each loop iteration multiplies 16 floats from workspace by the 16
; corresponding floats from divisors, converts the products to 32-bit
; integers (cvtps2dq rounds according to the current MXCSR rounding mode),
; packs them to 16-bit integers with signed saturation, and stores the 16
; results to coef_block.  The loop runs DCTSIZE2/16 times.
;
; Note that the values in divisors are multiplied, not divided; presumably
; the caller supplies reciprocal quantization factors -- confirm against the
; code that fills the divisors table.
;
; Register roles:
;   esi = current read position in workspace
;   edx = current read position in divisors
;   edi = current write position in coef_block
;   eax = remaining loop iterations (16 coefficients per iteration)
;
; Saved/restored: ebp, esi, edi.
; Modified and not restored: eax, edx, xmm0-xmm3, flags.
; All memory operands use movaps/mulps/movdqa, so every computed address in
; workspace, divisors and coef_block must be 16-byte aligned.
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16        ; 16 coefficients per iteration
    ALIGNX      16, 7
.quantloop:
    ; workspace[i] * divisors[i] for 16 consecutive floats.
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    ; float -> int32
    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    ; int32 -> int16 with signed saturation; 8 results per register.
    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         short .quantloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32