jcsample-avx2.asm (11811B)
;
; jcsample.asm - downsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample the pixel values of a single component: 2:1 horizontally,
; 1:1 vertically, without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. Right-edge padding.  With output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is extended from image_width
;      to 2 * output_cols samples by replicating the sample at column
;      image_width - 1.  Skipped when no padding is needed.
;
;   2. Downsampling.  For each of v_samp_factor rows,
;          out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Arguments are read from the stack relative to ebp (see the %defines
; below).  ebp, esi and edi are saved and restored; eax, ecx, edx,
; ymm0-ymm3, ymm6 and ymm7 are overwritten.  ebx is not touched.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    mov         edx, JDIMENSION [img_width(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (blocks * 8)
    jz          near .done              ; nothing to do for zero columns

    ; ---- Phase 1: pad the right edge of every input row ----

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1
    sub         ecx, edx                ; ecx = 2*output_cols - image_width
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]
    cld                                 ; stosb must move forward
    ALIGNX      16, 7
.pad_row:
    push        eax                     ; al is reused as the fill value
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of this row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Phase 2: average horizontal pairs ----

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rows left to produce
    test        eax, eax
    jle         near .done

    ; ymm6 = 0x00FF in every word: selects the even-indexed byte of a pair.
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT

    ; ymm7 = words {0, 1, 0, 1, ...}: alternating rounding bias.
    mov         edx, 0x00010000
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; broadcast the dword within xmm7
    vperm2i128  ymm7, ymm7, ymm7, 0     ; copy the low lane to the high lane

    mov         edi, JSAMPARRAY [output_data(ebp)]
    mov         esi, JSAMPARRAY [input_data(ebp)]
    ALIGNX      16, 7
.row:
    push        ecx                     ; output_cols
    push        edi                     ; output row-pointer cursor
    push        esi                     ; input row-pointer cursor

    mov         edi, JSAMPROW [edi]     ; edi = outptr
    mov         esi, JSAMPROW [esi]     ; esi = inptr

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .cols_full
    ALIGNX      16, 7

    ; Fewer than SIZEOF_YMMWORD output columns remain.  The count is a
    ; multiple of 8, so it is 24, 16 or 8.  Only the input actually needed
    ; (two bytes per output column) is loaded; the rest of ymm0/ymm1 is
    ; zero.  ecx is forced to SIZEOF_YMMWORD so that the shared loop tail
    ; terminates after this pass.
.cols_tail24:
    cmp         ecx, 24
    jne         .cols_tail16
    mov         ecx, SIZEOF_YMMWORD
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper lane zeroed
    jmp         short .average

.cols_tail16:
    cmp         ecx, 16
    jne         .cols_tail8
    mov         ecx, SIZEOF_YMMWORD
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    jmp         short .average

.cols_tail8:
    mov         ecx, SIZEOF_YMMWORD
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper lane zeroed
    vpxor       ymm1, ymm1, ymm1
    jmp         short .average
    ALIGNX      16, 7

.cols_full:
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    ; Split each 16-bit lane into its two bytes, then sum them.
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; odd-indexed samples of ymm0
    vpsrlw      ymm3, ymm1, BYTE_BIT    ; odd-indexed samples of ymm1
    vpand       ymm0, ymm0, ymm6        ; even-indexed samples of ymm0
    vpand       ymm1, ymm1, ymm6        ; even-indexed samples of ymm1
    vpaddw      ymm0, ymm0, ymm2
    vpaddw      ymm1, ymm1, ymm3

    vpaddw      ymm0, ymm0, ymm7        ; + bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    ; vpackuswb packs within each 128-bit lane, leaving the quadwords in
    ; the order {ymm0.lo, ymm1.lo, ymm0.hi, ymm1.hi}; vpermq 0xd8 swaps the
    ; middle two to restore column order.
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8

    ; A full YMMWORD is stored even on the tail path.
    ; NOTE(review): presumably the output rows are padded enough for
    ; this -- confirm against the caller's buffer allocation.
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    sub         ecx, byte SIZEOF_YMMWORD    ; output columns remaining
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .cols_full
    test        ecx, ecx
    jnz         near .cols_tail24

    pop         esi
    pop         edi
    pop         ecx

    add         edi, byte SIZEOF_JSAMPROW   ; next output row pointer
    add         esi, byte SIZEOF_JSAMPROW   ; next input row pointer
    dec         eax                         ; rowctr
    jg          near .row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. Right-edge padding.  With output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is extended from image_width
;      to 2 * output_cols samples by replicating the sample at column
;      image_width - 1.  Skipped when no padding is needed.
;
;   2. Downsampling.  Each of the v_samp_factor output rows consumes two
;      consecutive input rows r0 and r1:
;          out[i] = (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Arguments are read from the stack relative to ebp (see the %defines
; below).  ebp, esi and edi are saved and restored; eax, ecx, edx,
; ymm0-ymm7 are overwritten.  ebx is not touched.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    mov         edx, JDIMENSION [img_width(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (blocks * 8)
    jz          near .done              ; nothing to do for zero columns

    ; ---- Phase 1: pad the right edge of every input row ----

    push        ecx                     ; keep output_cols for phase 2
    shl         ecx, 1
    sub         ecx, edx                ; ecx = 2*output_cols - image_width
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    mov         esi, JSAMPARRAY [input_data(ebp)]
    cld                                 ; stosb must move forward
    ALIGNX      16, 7
.pad_row:
    push        eax                     ; al is reused as the fill value
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first column past the image
    mov         al, JSAMPLE [edi-1]     ; last real sample of this row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Phase 2: average 2x2 neighbourhoods ----

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rows left to produce
    test        eax, eax
    jle         near .done

    ; ymm6 = 0x00FF in every word: selects the even-indexed byte of a pair.
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT

    ; ymm7 = words {1, 2, 1, 2, ...}: alternating rounding bias.
    mov         edx, 0x00020001
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; broadcast the dword within xmm7
    vperm2i128  ymm7, ymm7, ymm7, 0     ; copy the low lane to the high lane

    mov         edi, JSAMPARRAY [output_data(ebp)]
    mov         esi, JSAMPARRAY [input_data(ebp)]
    ALIGNX      16, 7
.row:
    push        ecx                     ; output_cols
    push        edi                     ; output row-pointer cursor
    push        esi                     ; input row-pointer cursor

    mov         edi, JSAMPROW [edi]                      ; edi = outptr
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]    ; edx = inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]    ; esi = inptr1

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .cols_full
    ALIGNX      16, 7

    ; Fewer than SIZEOF_YMMWORD output columns remain.  The count is a
    ; multiple of 8, so it is 24, 16 or 8.  Only the input actually needed
    ; (two bytes per output column, from each row) is loaded; the rest of
    ; ymm0-ymm3 is zero.  ecx is forced to SIZEOF_YMMWORD so that the
    ; shared loop tail terminates after this pass.
.cols_tail24:
    cmp         ecx, 24
    jne         .cols_tail16
    mov         ecx, SIZEOF_YMMWORD
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; upper lane zeroed
    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; upper lane zeroed
    jmp         short .average

.cols_tail16:
    cmp         ecx, 16
    jne         .cols_tail8
    mov         ecx, SIZEOF_YMMWORD
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    jmp         short .average

.cols_tail8:
    mov         ecx, SIZEOF_YMMWORD
    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper lane zeroed
    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; upper lane zeroed
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    jmp         short .average
    ALIGNX      16, 7

.cols_full:
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; row 0, first half
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; row 1, first half
    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; row 0, second half
    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; row 1, second half

.average:
    ; First half: horizontal pair sums of each row (ymm4/ymm5 are scratch).
    vpand       ymm4, ymm0, ymm6        ; even-indexed samples, row 0
    vpand       ymm5, ymm1, ymm6        ; even-indexed samples, row 1
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-indexed samples, row 0
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd-indexed samples, row 1
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    ; Second half: same again, reusing the scratch registers.
    vpand       ymm4, ymm2, ymm6
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    ; Combine the two rows, add the bias and divide by four.
    vpaddw      ymm0, ymm0, ymm1
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2
    vpsrlw      ymm2, ymm2, 2

    ; vpackuswb packs within each 128-bit lane, leaving the quadwords in
    ; the order {ymm0.lo, ymm2.lo, ymm0.hi, ymm2.hi}; vpermq 0xd8 swaps the
    ; middle two to restore column order.
    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    ; A full YMMWORD is stored even on the tail path.
    ; NOTE(review): presumably the output rows are padded enough for
    ; this -- confirm against the caller's buffer allocation.
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    sub         ecx, byte SIZEOF_YMMWORD    ; output columns remaining
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         near .cols_full
    test        ecx, ecx
    jnz         near .cols_tail24

    pop         esi
    pop         edi
    pop         ecx

    add         edi, byte 1*SIZEOF_JSAMPROW  ; next output row pointer
    add         esi, byte 2*SIZEOF_JSAMPROW  ; skip the two rows just read
    dec         eax                          ; rowctr
    jg          near .row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32