jcsample-sse2.asm (10345B)
;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments: all six are read from the stack relative to ebp (offsets below).
;
; Behaviour:
;   1. output_cols = width_in_blocks * 8.  If that is zero, return at once.
;   2. If output_cols * 2 > image_width, each of the first max_v_samp_factor
;      input rows is padded IN PLACE: the sample at column image_width - 1 is
;      replicated into columns image_width .. output_cols * 2 - 1.  The input
;      rows must therefore be writable and have room for output_cols * 2
;      samples.
;   3. For each of v_samp_factor rows, every horizontal pair of input samples
;      (a, b) is reduced to (a + b + bias) >> 1, where bias alternates
;      0, 1, 0, 1, ... across output columns.
;
; Memory requirements that follow from the instructions used:
;   - Row data is accessed with movdqa, so every input and output row pointer
;     must be aligned to the XMM word size.
;   - Output is always stored one full XMMWORD at a time, including in the
;     tail case (.columnloop_r8), so each output row needs room for
;     output_cols rounded up to a multiple of SIZEOF_XMMWORD.
;
; Register roles in the main loop:
;   eax = remaining row count        ecx = remaining output columns
;   esi = input row pointer/inptr    edi = output row pointer/outptr
;   xmm6 = per-word mask 0x00FF      xmm7 = per-word rounding bias
;
; Clobbers: eax, ecx, edx, flags, xmm0-xmm3, xmm6, xmm7.  esi, edi and ebp
; are saved and restored.  The direction flag is cleared (cld) only when the
; right-edge expansion actually runs.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero columns

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of padding samples
    jle         short .expand_end       ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .expand_end       ; no rows to pad

    cld                                 ; rep stosb must move forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    ALIGNX      16, 7
.expandloop:
    push        eax                     ; al is about to be overwritten
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    ALIGNX      16, 7
.rowloop:
    push        ecx                     ; output_cols, reused for every row
    push        edi                     ; output_data row-pointer cursor
    push        esi                     ; input_data row-pointer cursor

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    ALIGNX      16, 7

.columnloop_r8:
    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Only one
    ; XMMWORD of input is read; the second operand is zeroed, which makes
    ; the upper half of the stored result zero.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD     ; force the loop to end after this pass
    jmp         short .downsample
    ALIGNX      16, 7

.columnloop:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples, as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples, as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; a + b for each horizontal pair
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words back to bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        ecx, ecx
    jnz         short .columnloop_r8

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments: all six are read from the stack relative to ebp (offsets below).
;
; Behaviour:
;   1. output_cols = width_in_blocks * 8.  If that is zero, return at once.
;   2. If output_cols * 2 > image_width, each of the first max_v_samp_factor
;      input rows is padded IN PLACE: the sample at column image_width - 1 is
;      replicated into columns image_width .. output_cols * 2 - 1.  The input
;      rows must therefore be writable and have room for output_cols * 2
;      samples.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every 2x2 group of samples (a, b / c, d) is reduced to
;      (a + b + c + d + bias) >> 2, where bias alternates 1, 2, 1, 2, ...
;      across output columns.
;
; Memory requirements that follow from the instructions used:
;   - Row data is accessed with movdqa, so every input and output row pointer
;     must be aligned to the XMM word size.
;   - Output is always stored one full XMMWORD at a time, including in the
;     tail case (.columnloop_r8), so each output row needs room for
;     output_cols rounded up to a multiple of SIZEOF_XMMWORD.
;
; Register roles in the main loop:
;   eax = remaining output rows      ecx = remaining output columns
;   edx = inptr0 (upper input row)   esi = inptr1 (lower input row)
;   edi = outptr
;   xmm6 = per-word mask 0x00FF      xmm7 = per-word rounding bias
;
; Clobbers: eax, ecx, edx, flags, xmm0-xmm7.  esi, edi and ebp are saved and
; restored.  The direction flag is cleared (cld) only when the right-edge
; expansion actually runs.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero columns

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of padding samples
    jle         short .expand_end       ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .expand_end       ; no rows to pad

    cld                                 ; rep stosb must move forward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    ALIGNX      16, 7
.expandloop:
    push        eax                     ; al is about to be overwritten
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    ALIGNX      16, 7
.rowloop:
    push        ecx                     ; output_cols, reused for every row
    push        edi                     ; output_data row-pointer cursor
    push        esi                     ; input_data row-pointer cursor

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    ALIGNX      16, 7

.columnloop_r8:
    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Only one
    ; XMMWORD is read from each input row; the second pair of operands is
    ; zeroed, which makes the upper half of the stored result zero.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD     ; force the loop to end after this pass
    jmp         short .downsample
    ALIGNX      16, 7

.columnloop:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    ; First XMMWORD of each row: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples, row 0
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples, row 0
    pand        xmm1, xmm6              ; even-indexed samples, row 1
    psrlw       xmm5, BYTE_BIT          ; odd-indexed samples, row 1
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second XMMWORD of each row: same reduction.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1 -> 2x2 sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words back to bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        ecx, ecx
    jnz         near .columnloop_r8

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32