jcsample-avx2.asm (10314B)
;
; jcsample.asm - downsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_in_blocks * 8.  Nothing is done if this is zero.
;   2. expand_right_edge: if output_cols * 2 > image_width, the last sample
;      of each of the first max_v_samp_factor input rows is replicated into
;      the (output_cols * 2 - image_width) bytes that follow it.
;   3. For each of v_samp_factor rows, every pair of adjacent input samples
;      is replaced by (s0 + s1 + bias) >> 1, where bias alternates 0, 1
;      across successive output samples.
;
; Register use after COLLECT_ARGS (macro from jsimdext.inc; it is expected
; to make the six arguments available as listed here):

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; Scratch: rax (row counters), rcx (column counters), rdx, rsi, rdi,
;          ymm0-ymm3 (work), ymm6 (0x00FF word mask), ymm7 (bias words)
;
; NOTE(review): the main loop always stores a full YMMWORD, even when only
; 8, 16 or 24 output columns remain; this presumably relies on the output
; rows being padded accordingly -- confirm against the buffer allocator.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the main loop
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of bytes to pad per row
    jle         short .expand_end       ; input already wide enough

    ; max_v_samp_factor is a 32-bit int: only r11d is meaningful.  The upper
    ; half of a register carrying a 32-bit argument is not defined by the
    ; x86-64 calling conventions, so it must not be read.  Writing eax
    ; zero-extends into rax, which the "dec rax / jg" loop below relies on.
    mov         eax, r11d               ; rax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must advance upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]    ; rdi = start of this input row
    add         rdi, rdx                ; rdi = first byte past image_width
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax                ; signed 32-bit test: skip if <= 0
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored per row
    push        rdi                     ; output_data row-pointer cursor
    push        rsi                     ; input_data row-pointer cursor

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Remainder handling: fewer than SIZEOF_YMMWORD output columns are left.
    ; Only the input bytes that belong to them are loaded (two per output
    ; column); the rest of ymm0/ymm1 is zero.  rcx is then forced to
    ; SIZEOF_YMMWORD so the subtraction after the store leaves zero and the
    ; column loop terminates.

.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]  ; upper half of ymm1 = 0
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]   ; upper half of ymm0 = 0
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; Each 16-bit lane holds one horizontal pair: the low byte is the even
    ; sample, the high byte the odd sample.  Separate and add them.
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; odd samples of ymm0
    vpand       ymm0, ymm0, ymm6        ; even samples of ymm0
    vpsrlw      ymm3, ymm1, BYTE_BIT    ; odd samples of ymm1
    vpand       ymm1, ymm1, ymm6        ; even samples of ymm1

    vpaddw      ymm0, ymm0, ymm2
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating 0/1 bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    ; vpackuswb packs within each 128-bit lane, leaving the qwords
    ; interleaved; vpermq 0xd8 (qword order 0,2,1,3) restores linear order.
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24    ; 8, 16 or 24 columns still to do

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; clear upper YMM state before return
    UNCOLLECT_ARGS 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. output_cols = width_in_blocks * 8.  Nothing is done if this is zero.
;   2. expand_right_edge: if output_cols * 2 > image_width, the last sample
;      of each of the first max_v_samp_factor input rows is replicated into
;      the (output_cols * 2 - image_width) bytes that follow it.
;   3. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every 2x2 group of samples is replaced by
;      (s00 + s01 + s10 + s11 + bias) >> 2, where bias alternates 1, 2
;      across successive output samples.
;
; Register use after COLLECT_ARGS (macro from jsimdext.inc; it is expected
; to make the six arguments available as listed here):

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; Scratch: rax (row counters), rcx (column counters), rdx, rsi, rdi,
;          ymm0-ymm5 (work), ymm6 (0x00FF word mask), ymm7 (bias words)
;
; NOTE(review): the main loop always stores a full YMMWORD, even when only
; 8, 16 or 24 output columns remain; this presumably relies on the output
; rows being padded accordingly -- confirm against the buffer allocator.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the main loop
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of bytes to pad per row
    jle         short .expand_end       ; input already wide enough

    ; max_v_samp_factor is a 32-bit int: only r11d is meaningful.  The upper
    ; half of a register carrying a 32-bit argument is not defined by the
    ; x86-64 calling conventions, so it must not be read.  Writing eax
    ; zero-extends into rax, which the "dec rax / jg" loop below relies on.
    mov         eax, r11d               ; rax = rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must advance upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]    ; rdi = start of this input row
    add         rdi, rdx                ; rdi = first byte past image_width
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax                ; rax is zero-extended, so this
    jle         near .return            ;   branches only when rowctr == 0

    mov         edx, 0x00020001         ; bias pattern
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored per row
    push        rdi                     ; output_data row-pointer cursor
    push        rsi                     ; input_data row-pointer cursor

    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Remainder handling: fewer than SIZEOF_YMMWORD output columns are left
    ; (8, 16 or 24).  Only the input bytes that belong to them are loaded
    ; from each row; the rest of ymm0-ymm3 is zero.  rcx is then forced to
    ; SIZEOF_YMMWORD so the subtraction after the store leaves zero and the
    ; column loop terminates.

.columnloop_r24:
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]  ; upper half of ymm2 = 0
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]  ; upper half of ymm3 = 0
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]  ; upper half of ymm0 = 0
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; upper half of ymm1 = 0
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; Each 16-bit lane holds one horizontal pair: the low byte is the even
    ; sample, the high byte the odd sample.  Sum each pair per row first.
    vpand       ymm4, ymm0, ymm6        ; even samples, row 0, first half
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd samples,  row 0, first half
    vpand       ymm5, ymm1, ymm6        ; even samples, row 1, first half
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd samples,  row 1, first half
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    vpand       ymm4, ymm2, ymm6        ; even samples, row 0, second half
    vpsrlw      ymm2, ymm2, BYTE_BIT    ; odd samples,  row 0, second half
    vpand       ymm5, ymm3, ymm6        ; even samples, row 1, second half
    vpsrlw      ymm3, ymm3, BYTE_BIT    ; odd samples,  row 1, second half
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1 = 2x2 sums
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating 1/2 bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    ; vpackuswb packs within each 128-bit lane, leaving the qwords
    ; interleaved; vpermq 0xd8 (qword order 0,2,1,3) restores linear order.
    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24    ; 8, 16 or 24 columns still to do

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; clear upper YMM state before return
    UNCOLLECT_ARGS 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32