; jquant-3dn.asm (8731B)
;
; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT *workspace);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
;
; Register roles inside the loop:
;   esi = current position in the sample_data row-pointer array
;   eax = start_col (column offset applied to every row)
;   edi = current write position in workspace
;   ecx = remaining iterations (DCTSIZE/2; two rows are handled per pass)
;   ebx, edx = pointers to the two sample rows being converted
;   mm7 = 0x80 in every byte (subtracted to centre the samples)
;
; Saved and restored: ebp, ebx, esi, edi.
; Modified without saving: eax, ecx, edx, mm0-mm7, flags.
; The MMX/3DNow! state is cleared with femms before returning.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)

EXTN(jsimd_convsamp_float_3dnow):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Build the centring constant without a memory load:
    ; 0xFFFF -> 0xFF80 per word -> signed-saturating pack gives 0x80 per byte.
    pcmpeqw     mm7, mm7
    psllw       mm7, 7
    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2
    ALIGNX      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

    psubb       mm0, mm7                ; mm0=(01234567)
    psubb       mm1, mm7                ; mm1=(89ABCDEF)

    ; Widen bytes to dwords by interleaving so that each sample ends up in
    ; the most significant byte of its dword.  The destination registers of
    ; the punpckl* instructions (mm2..mm6) are deliberately not initialized:
    ; their old contents only fill the '*' positions, which the psrad below
    ; discards (assuming DWORD_BIT-BYTE_BIT is 24 -- defined outside this
    ; file).

    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)

    punpcklwd   mm4, mm2                ; mm4=(***0***1)
    punpckhwd   mm2, mm2                ; mm2=(***2***3)
    punpcklwd   mm5, mm0                ; mm5=(***4***5)
    punpckhwd   mm0, mm0                ; mm0=(***6***7)

    ; Arithmetic shift brings each sample down with sign extension; pi2fd
    ; then converts the two packed 32-bit integers to packed floats.
    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
    pi2fd       mm4, mm4
    pi2fd       mm2, mm2
    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
    pi2fd       mm5, mm5
    pi2fd       mm0, mm0

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

    ; Second row: mm4 is free again now that (01) has been stored.
    punpcklwd   mm6, mm3                ; mm6=(***8***9)
    punpckhwd   mm3, mm3                ; mm3=(***A***B)
    punpcklwd   mm4, mm1                ; mm4=(***C***D)
    punpckhwd   mm1, mm1                ; mm1=(***E***F)

    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
    pi2fd       mm6, mm6
    pi2fd       mm3, mm3
    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
    pi2fd       mm4, mm4
    pi2fd       mm1, mm1

    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

    ; Advance by the two rows just processed.
    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         near .convloop

    femms                               ; empty MMX/3DNow! state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                            FAST_FLOAT *workspace);
;
; Each workspace value is multiplied by the matching entry of divisors
; (NOTE(review): since the code multiplies, the table presumably holds
; reciprocals of the quantization steps -- confirm against the caller).
; A rounding constant is then added and the low 16 bits of each resulting
; float bit pattern are stored as the output coefficient.
;
; Register roles inside the loop:
;   esi = current read position in workspace
;   edx = current read position in divisors
;   edi = current write position in coef_block
;   eax = remaining iterations (DCTSIZE2/16; 16 coefficients per pass)
;   mm7 = rounding constant 12582912.0F in both lanes
;
; Saved and restored: ebp, esi, edi.
; Modified without saving: eax, edx, mm0-mm7, flags.
; The MMX/3DNow! state is cleared with femms before returning.
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)

EXTN(jsimd_quantize_float_3dnow):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Adding this constant to a float of small magnitude leaves the rounded
    ; integer value in the low bits of the result's bit pattern; the loop
    ; below keeps only the low word of each result.
    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
    movd        mm7, eax
    punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16
    ALIGNX      16, 7
.quantloop:
    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]

    ; In the word-level diagrams below, "**" marks the high word of each
    ; float, which is not part of the output.
    pfadd       mm0, mm7                ; mm0=(00 ** 01 **)
    pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
    pfadd       mm2, mm7                ; mm2=(04 ** 05 **)
    pfadd       mm3, mm7                ; mm3=(06 ** 07 **)

    ; Gather the four wanted low words of each register pair into one
    ; register with two rounds of word interleaving.
    movq        mm4, mm0
    punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
    punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
    movq        mm5, mm2
    punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
    punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)

    punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
    punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)

    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

    pfadd       mm6, mm7                ; mm6=(10 ** 11 **)
    pfadd       mm1, mm7                ; mm1=(12 ** 13 **)
    pfadd       mm3, mm7                ; mm3=(14 ** 15 **)
    pfadd       mm4, mm7                ; mm4=(16 ** 17 **)

    movq        mm5, mm6
    punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
    punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
    movq        mm1, mm3
    punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
    punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)

    punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
    punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

    ; Advance all three pointers by the 16 elements just processed.
    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         near .quantloop

    femms                               ; empty MMX/3DNow! state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32