jcsample-sse2.asm (8848B)
;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Operation:
;   1. output_cols = width_in_blocks * DCTSIZE (computed as a shift by 3).
;      Nothing is done if this is zero.
;   2. Right-edge expansion: when 2 * output_cols > image_width, bytes
;      [image_width, 2 * output_cols) of each of the first max_v_samp_factor
;      input rows are filled with that row's last real sample.
;   3. For each of the v_samp_factor rows, every output sample is
;      (in[2*i] + in[2*i+1] + bias) >> 1, where the bias alternates
;      0, 1, 0, 1, ... across output columns.
;
; Rows are read and written with movdqa, one whole XMMWORD at a time, so the
; row buffers must be 16-byte aligned and large enough for whole-XMMWORD
; accesses.  NOTE(review): the buffers are allocated outside this file --
; confirm against the caller.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the main loop
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding bytes per row
    jle         short .expand_end       ; row is already wide enough

    ; max_v_samp_factor is a 32-bit int.  Read it through r11d so that
    ; whatever the caller left in the upper half of the argument register
    ; cannot affect the sign test below or the 64-bit row counter.
    mov         eax, r11d               ; rax = row counter (zero-extended)
    test        eax, eax
    jle         short .expand_end       ; no rows (or a negative count)

    cld                                 ; stosb must advance rdi upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is needed for the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]    ; rdi = start of this input row
    add         rdi, rdx                ; rdi = first byte past the real data
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern (low word 0, high word 1)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; rcx/rdi/rsi are reused as the column
    push        rdi                     ; counter and the sample pointers
    push        rsi                     ; inside the row

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: consume only one
    ; input XMMWORD and treat the second as zero.  The upper half of the
    ; stored XMMWORD is therefore written as zero.
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD     ; makes the counter hit 0 after the sub
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples, as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples, as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; pairwise sums
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; add the alternating rounding bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; divide by 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words back to bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW   ; input_data
    add         rdi, byte SIZEOF_JSAMPROW   ; output_data
    dec         rax                         ; rowctr
    jg          near .rowloop

.return:
    UNCOLLECT_ARGS 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Operation:
;   1. output_cols = width_in_blocks * DCTSIZE (computed as a shift by 3).
;      Nothing is done if this is zero.
;   2. Right-edge expansion: when 2 * output_cols > image_width, bytes
;      [image_width, 2 * output_cols) of each of the first max_v_samp_factor
;      input rows are filled with that row's last real sample.
;   3. For each of the v_samp_factor output rows, two consecutive input rows
;      are consumed and every output sample is the sum of a 2x2 block of
;      input samples plus a bias, shifted right by 2.  The bias alternates
;      1, 2, 1, 2, ... across output columns.
;
; Rows are read and written with movdqa, one whole XMMWORD at a time, so the
; row buffers must be 16-byte aligned and large enough for whole-XMMWORD
; accesses.  NOTE(review): the buffers are allocated outside this file --
; confirm against the caller.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the main loop
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding bytes per row
    jle         short .expand_end       ; row is already wide enough

    ; max_v_samp_factor is a 32-bit int.  Read it through r11d so that
    ; whatever the caller left in the upper half of the argument register
    ; cannot affect the sign test below or the 64-bit row counter.
    mov         eax, r11d               ; rax = row counter (zero-extended)
    test        eax, eax
    jle         short .expand_end       ; no rows (or a negative count)

    cld                                 ; stosb must advance rdi upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is needed for the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]    ; rdi = start of this input row
    add         rdi, rdx                ; rdi = first byte past the real data
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax                ; 32-bit test, as in the h2v1 routine
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern (low word 1, high word 2)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; rcx/rdi/rsi are reused as the column
    push        rdi                     ; counter and the sample pointers
    push        rsi                     ; inside the row pair

    ; rdx is free here: image_width is only needed for the edge expansion.
    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

.columnloop_r8:
    ; Fewer than SIZEOF_XMMWORD output columns remain: consume only one
    ; XMMWORD from each input row and treat the second pair as zero.  The
    ; upper half of the stored XMMWORD is therefore written as zero.
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD     ; makes the counter hit 0 after the sub
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; Horizontal pair sums for the first XMMWORD of each row.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples, as words
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples, as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Horizontal pair sums for the second XMMWORD of each row.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; add the two rows: 2x2 block sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; add the alternating rounding bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; divide by 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words back to bytes

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    UNCOLLECT_ARGS 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32