jcsample-mmx.asm (9482B)
;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_in_blocks * 8.  Nothing is done when it is zero.
;   2. When output_cols * 2 exceeds image_width, each of the first
;      max_v_samp_factor input rows is extended in place: the byte at
;      row[image_width - 1] is copied into the next
;      (output_cols * 2 - image_width) bytes.
;   3. For each of v_samp_factor rows, every horizontal pair of input bytes
;      (a, b) becomes one output byte (a + b + bias) >> 1, where the bias
;      alternates 0, 1, 0, 1, ... across the output columns.
;
; Arguments are read from the stack through ebp (see the defines below).
; ebp, esi and edi are saved and restored; ebx is never touched.
; eax, ecx, edx, mm0-mm3, mm6, mm7 and the flags are overwritten, and the
; direction flag is cleared whenever step 2 actually pads.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
    push        ebp
    mov         ebp, esp
    ; ebx is unused; ecx and edx need not be preserved, so only esi/edi
    ; are saved here.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (imul ecx,DCTSIZE)
    jz          near .done              ; no output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width

    ; -- expand_right_edge: pad every input row out to output_cols * 2 bytes

    push        ecx                     ; keep output_cols for the main loop
    add         ecx, ecx                ; ecx = output_cols * 2
    sub         ecx, edx                ; ecx = number of pad bytes per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> current row pointer
    cld                                 ; make stosb walk upward
    ALIGNX      16, 7
.pad_row:
    push        ecx                     ; rep consumes ecx
    push        eax                     ; al is needed for the fill value

    mov         edi, JSAMPROW [esi]
    add         esi, byte SIZEOF_JSAMPROW  ; step to the next row pointer
    add         edi, edx                ; edi -> first byte past the real data
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it ecx times

    pop         eax
    pop         ecx

    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rowctr
    test        eax, eax
    jle         near .done

    pcmpeqw     mm6, mm6                ; mm6 = all ones
    mov         edx, 0x00010000         ; bias pattern
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointer
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointer
    ALIGNX      16, 7
.next_row:
    push        esi                     ; row-pointer cursors and the column
    push        edi                     ; count are restored after each row
    push        ecx

    mov         edi, JSAMPROW [edi]     ; edi = outptr
    mov         esi, JSAMPROW [esi]     ; esi = inptr
    ALIGNX      16, 7
.next_group:
    ; Each pass reads two MMWORDs of input and writes one MMWORD of output.

    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, mm0
    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm3, mm1
    add         esi, byte 2*SIZEOF_MMWORD  ; inptr (both loads are done)

    psrlw       mm2, BYTE_BIT           ; mm2/mm3 = high byte of every word
    pand        mm0, mm6                ; mm0/mm1 = low byte of every word
    psrlw       mm3, BYTE_BIT
    pand        mm1, mm6

    paddw       mm0, mm7                ; low + bias ...
    paddw       mm1, mm7
    paddw       mm0, mm2                ; ... + high
    paddw       mm1, mm3
    psrlw       mm0, 1                  ; halve the sums
    psrlw       mm1, 1

    packuswb    mm0, mm1                ; words back to bytes

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol; flags feed the jnz below
    jnz         short .next_group

    pop         ecx
    pop         edi
    pop         esi

    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    dec         eax                        ; rowctr
    jg          short .next_row

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
    ; ecx/edx need not be restored; ebx was never saved.
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_in_blocks * 8.  Nothing is done when it is zero.
;   2. When output_cols * 2 exceeds image_width, each of the first
;      max_v_samp_factor input rows is extended in place: the byte at
;      row[image_width - 1] is copied into the next
;      (output_cols * 2 - image_width) bytes.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed.  Every 2x2 group of input bytes becomes one output byte
;      (sum of the four + bias) >> 2, where the bias alternates
;      1, 2, 1, 2, ... across the output columns.
;
; Arguments are read from the stack through ebp (see the defines below).
; ebp, esi and edi are saved and restored; ebx is never touched.
; eax, ecx, edx, mm0-mm7 and the flags are overwritten, and the direction
; flag is cleared whenever step 2 actually pads.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
    push        ebp
    mov         ebp, esp
    ; ebx is unused; ecx and edx need not be preserved, so only esi/edi
    ; are saved here.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (imul ecx,DCTSIZE)
    jz          near .done              ; no output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width

    ; -- expand_right_edge: pad every input row out to output_cols * 2 bytes

    push        ecx                     ; keep output_cols for the main loop
    add         ecx, ecx                ; ecx = output_cols * 2
    sub         ecx, edx                ; ecx = number of pad bytes per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> current row pointer
    cld                                 ; make stosb walk upward
    ALIGNX      16, 7
.pad_row:
    push        ecx                     ; rep consumes ecx
    push        eax                     ; al is needed for the fill value

    mov         edi, JSAMPROW [esi]
    add         esi, byte SIZEOF_JSAMPROW  ; step to the next row pointer
    add         edi, edx                ; edi -> first byte past the real data
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it ecx times

    pop         eax
    pop         ecx

    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rowctr
    test        eax, eax
    jle         near .done

    pcmpeqw     mm6, mm6                ; mm6 = all ones
    mov         edx, 0x00020001         ; bias pattern
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    movd        mm7, edx                ; (edx is free for reuse after this)
    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointer
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    ALIGNX      16, 7
.next_row:
    push        esi                     ; row-pointer cursors and the column
    push        edi                     ; count are restored after each row
    push        ecx

    mov         edi, JSAMPROW [edi]                    ; outptr
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1 (esi last: it
                                                       ; is the base register)
    ALIGNX      16, 7
.next_group:
    ; Each pass reads two MMWORDs from each of the two input rows and writes
    ; one MMWORD of output.

    ; First MMWORD of both rows -> mm0 = per-word sums of each 2x2 group.
    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm4, mm0
    movq        mm5, mm1
    psrlw       mm4, BYTE_BIT           ; high byte of every word
    psrlw       mm5, BYTE_BIT
    pand        mm0, mm6                ; low byte of every word
    pand        mm1, mm6
    paddw       mm0, mm4                ; horizontal pair sums, row 0
    paddw       mm1, mm5                ; horizontal pair sums, row 1

    ; Second MMWORD of both rows -> mm2, same recipe (mm4/mm5 are free again).
    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]
    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0 (all loads are done)
    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
    movq        mm4, mm2
    movq        mm5, mm3
    psrlw       mm4, BYTE_BIT
    psrlw       mm5, BYTE_BIT
    pand        mm2, mm6
    pand        mm3, mm6
    paddw       mm2, mm4
    paddw       mm3, mm5

    paddw       mm0, mm1                ; add the two rows together
    paddw       mm2, mm3
    paddw       mm0, mm7                ; add the bias
    paddw       mm2, mm7
    psrlw       mm0, 2                  ; divide the four-sample sums by 4
    psrlw       mm2, 2

    packuswb    mm0, mm2                ; words back to bytes

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    sub         ecx, byte SIZEOF_MMWORD    ; outcol; flags feed the jnz below
    jnz         near .next_group

    pop         ecx
    pop         edi
    pop         esi

    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data (two rows consumed)
    dec         eax                          ; rowctr
    jg          near .next_row

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
    ; ecx/edx need not be restored; ebx was never saved.
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32