; jcgryext-sse2.asm (12887B)
;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; The routine returns immediately, writing nothing, when img_width is zero
; or when num_rows is zero or negative.
;
; Registers as used by the code below (after COLLECT_ARGS 5, a macro defined
; outside this file that is expected to leave the five arguments in r10-r14):

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows
;
; Inside the row loop:
;   rax = rows remaining
;   rcx = columns (pixels) remaining in the current row
;   rsi = input row-pointer cursor, then inptr within a row
;   rdi = output row-pointer cursor (component 0), then outptr0 within a row
;   r15 = upper end of the 16-byte-aligned wk[] scratch area on the stack
;
; xmmA-xmmH, rsip/rdip and the PW_*/PD_* constants are defined outside this
; file (presumably via jcolsamp.inc / jsimdext.inc).  NOTE(review): the
; mapping of xmmA-xmmH onto xmm0-xmm7 depends on those definitions.

; wk(i) addresses the i-th 16-byte slot of the scratch array that lives in
; [r15 - WK_NUM*SIZEOF_XMMWORD, r15).
%define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    push        r15                     ; saved at [rbp-8]; see epilogue
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    ; Allocate stack space for wk array.  r15 is used to access it.
    mov         r15, rsp
    sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
    COLLECT_ARGS 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return            ; nothing to do for an empty row

    push        rcx                     ; keep img_width while rcx is reused

    ; rdi = &output_buf[0][output_row]
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11                ; rsi = input_buf
    mov         eax, r14d               ; rax = num_rows (zero-extended)
    ; num_rows is a signed int.  The move above clears the upper half of
    ; rax, so the sign has to be tested on eax; testing rax would make the
    ; signed branch below fire only for zero and let a negative count run
    ; the row loop.
    test        eax, eax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop        ; a full group of pixels is available

    ; Fewer than SIZEOF_XMMWORD pixels remain.  The .column_ld* ladder below
    ; assembles the tail from progressively larger loads (testing one bit of
    ; the remaining size at each step), then forces rcx to SIZEOF_XMMWORD so
    ; that the column loop terminates after this one conversion pass.

%if RGB_PIXELSIZE == 3  ; ---------------

.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .rgb_gray_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_gray_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; De-interleave the packed 3-byte pixels into per-component vectors.
    ; In the lane diagrams, the first digit is the component index within a
    ; pixel and the second (hex) digit is the pixel index.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    ; Widen every component byte to a 16-bit word by unpacking against zero.
    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_gray_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; De-interleave the packed 4-byte pixels into per-component vectors.
    ; In the lane diagrams, the first digit is the component index within a
    ; pixel and the second (hex) digit is the pixel index.
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    ; Widen every component byte to a 16-bit word by unpacking against zero.
    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF  ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF  ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; No zero register is left for the last pair: the data bytes are
    ; unpacked into the high byte of each word and then shifted down.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; NOTE(review): the G coefficient is presumably split so that each
    ; fixed-point constant fits the signed 16-bit operands of pmaddwd;
    ; SCALEBITS and FIX() are defined outside this file -- confirm there.

    ; Odd pixels: R*0.299 + G*0.337 for the low (L) and high (H) halves.
    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Even pixels: same partial sum, spilled to wk[] to free registers.
    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      xmm0, xmm5              ; xmm0=BO
    movdqa      xmm6, xmm4              ; xmm6=BE

    ; Odd pixels: B*0.114 + G*0.250, then add the partial sum and round.
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa      xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    paddd       xmm0, xmm1
    paddd       xmm4, xmm7
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    ; Even pixels: B*0.114 + G*0.250, then add the spilled sum and round.
    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa      xmm2, [rel PD_ONEHALF]  ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(0)]
    paddd       xmm4, XMMWORD [wk(1)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    ; Re-interleave: odd samples go to the high byte of each word, even
    ; samples stay in the low byte.
    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y (aligned store to outptr0)

    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1        ; partial group left: take the tail path

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    UNCOLLECT_ARGS 5
    lea         rsp, [rbp-8]            ; undo alignment; rsp -> saved r15
    pop         r15
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32