jdcolext-mmx.asm (15431B)
;
; jdcolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                           JDIMENSION input_row, JSAMPARRAY output_buf,
;                           int num_rows)
;
; All five arguments are read from the stack (32-bit code).
;
; Behavior visible in this file:
;   * Returns immediately, before any MMX instruction is executed, when
;     out_width == 0 or num_rows <= 0.
;   * For each of num_rows rows, reads one row pointer from each of the
;     three component arrays input_buf[0..2] (starting at index input_row)
;     and one from output_buf, then converts the row one MMWORD of samples
;     (8, per the lane comments below) per column-loop iteration.
;   * The input rows are always loaded a full MMWORD at a time, even when
;     fewer columns remain; only the stores are trimmed to the remaining
;     width.  NOTE(review): this presumably relies on the caller padding the
;     input row buffers -- confirm against the allocator.
;   * ebx, esi and edi are saved and restored; eax, ecx, edx and mm0-mm7 are
;     clobbered.  emms is executed before returning whenever the row loop ran.
;
; The output pixel layout is chosen at assembly time through RGB_PIXELSIZE,
; RGBX_FILLER_0XFF and the register aliases mmA-mmH.  None of these are
; defined in this file (presumably they come from jcolsamp.inc and the file
; that includes this one -- confirm).
;

; Stack-argument offsets.  (b) is the frame base: the value of esp right
; after "push ebp", so (b)+0 is the saved ebp, (b)+4 the return address and
; the first argument sits at (b)+8.
%define out_width(b)   (b) + 8          ; JDIMENSION out_width
%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
%define input_row(b)   (b) + 16         ; JDIMENSION input_row
%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
%define num_rows(b)    (b) + 24         ; int num_rows

; Locals, addressed from the realigned ebp: [ebp] holds the frame base,
; wk(0)..wk(WK_NUM-1) are MMWORD scratch slots just below it, and gotptr is
; one pointer-sized slot just below wk(0).
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM         2
%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)

EXTN(jsimd_ycc_rgb_convert_mmx):
    ; Prologue: build an MMWORD-aligned frame.  The unaligned frame base is
    ; kept in eax (for argument access) and stored at [ebp] so the epilogue
    ; can restore esp with "pop esp".
    push        ebp
    mov         eax, esp                    ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[WK_NUM]
    PUSHPIC     eax                     ; make room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address
    MOVPIC      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
    test        ecx, ecx
    jz          near .return            ; nothing to do for zero-width rows

    push        ecx                     ; ecx is reused for input_row below

    ; esi/ebx/edx = address of the input_row-th row pointer in component
    ; arrays 0, 1 and 2 of input_buf.
    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [input_row(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]

    pop         ecx                     ; ecx = num_cols again

    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         eax, INT [num_rows(eax)]  ; eax = row counter from here on
    test        eax, eax
    jle         near .return            ; signed test: num_rows <= 0 -> done
    ALIGNX      16, 7
.rowloop:
    ; Save the per-row state (row counter, the four row-pointer cursors and
    ; the column count); the column loop below overwrites all of them.
    ; .nextrow pops them in reverse order.
    push        eax
    push        edi
    push        edx
    push        ebx
    push        esi
    push        ecx                     ; col

    mov         esi, JSAMPROW [esi]     ; inptr0
    mov         ebx, JSAMPROW [ebx]     ; inptr1
    mov         edx, JSAMPROW [edx]     ; inptr2
    mov         edi, JSAMPROW [edi]     ; outptr
    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)
    ALIGNX      16, 7
.columnloop:
    ; One iteration converts one MMWORD of samples.  Even- and odd-indexed
    ; samples are widened to 16-bit words in separate registers ("E"/"O").

    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)

    pcmpeqw     mm4, mm4
    pcmpeqw     mm7, mm7
    psrlw       mm4, BYTE_BIT
    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}

    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO

    ; Adding 0xFF80 (-128 as a signed word) recenters Cb/Cr around zero.
    paddw       mm4, mm7
    paddw       mm5, mm7
    paddw       mm0, mm7
    paddw       mm1, mm7

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movq        mm2, mm4                ; mm2=CbE
    movq        mm3, mm5                ; mm3=CbO
    paddw       mm4, mm4                ; mm4=2*CbE
    paddw       mm5, mm5                ; mm5=2*CbO
    movq        mm6, mm0                ; mm6=CrE
    movq        mm7, mm1                ; mm7=CrO
    paddw       mm0, mm0                ; mm0=2*CrE
    paddw       mm1, mm1                ; mm1=2*CrO

    ; The operands were doubled above; after pmulhw the extra bit is used
    ; for rounding via (x + PW_ONE) >> 1 below.  The PW_* constants are
    ; defined outside this file.
    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))

    paddw       mm4, [GOTOFF(eax,PW_ONE)]
    paddw       mm5, [GOTOFF(eax,PW_ONE)]
    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
    paddw       mm0, [GOTOFF(eax,PW_ONE)]
    paddw       mm1, [GOTOFF(eax,PW_ONE)]
    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))

    paddw       mm4, mm2
    paddw       mm5, mm3
    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O

    ; Spill (B-Y) to the stack scratch slots: mm4/mm5 are needed as
    ; temporaries for the G computation and for Y.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O

    ; G term: interleave (Cb, Cr) word pairs so that one pmaddwd yields the
    ; two-term sum per sample as a dword, then round and scale back down.
    movq        mm4, mm2
    movq        mm5, mm3
    punpcklwd   mm2, mm6
    punpckhwd   mm4, mm6
    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   mm3, mm7
    punpckhwd   mm5, mm7
    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]

    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm2, SCALEBITS
    psrad       mm4, SCALEBITS
    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm3, SCALEBITS
    psrad       mm5, SCALEBITS

    packssdw    mm2, mm4        ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    packssdw    mm3, mm5        ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    psubw       mm2, mm6        ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    psubw       mm3, mm7        ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)

    pcmpeqw     mm4, mm4
    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
    pand        mm4, mm5                ; mm4=Y(0246)=YE
    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO

    ; Add Y to each color difference; packuswb saturates the signed words
    ; to unsigned bytes.
    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)

    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)

    paddw       mm4, MMWORD [wk(0)]     ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
    paddw       mm5, MMWORD [wk(1)]     ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)

    ; From here on the code uses the aliases mmA-mmH instead of mm0-mm7.
    ; In the lane comments the first digit is the byte position within the
    ; output pixel and the second digit is the pixel index.

%if RGB_PIXELSIZE == 3  ; ---------------

    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)

    movq        mmG, mmA
    movq        mmH, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)

    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)

    movq        mmC, mmD
    movq        mmB, mmD
    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)

    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)

    movq        mmF, mmE
    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)

    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)

    ; Fewer than a full MMWORD of columns left -> trimmed tail store.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_MMWORD  ; inptr0
    add         ebx, byte SIZEOF_MMWORD  ; inptr1
    add         edx, byte SIZEOF_MMWORD  ; inptr2
    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    jmp         near .columnloop
    ALIGNX      16, 7

    ; Tail store: ecx (remaining columns, below SIZEOF_MMWORD) is converted
    ; to a byte count, then the pending bytes are written in pieces of
    ; 2 MMWORDs, 1 MMWORD, a dword, a word and a byte.  mmA always holds the
    ; next unwritten data.
.column_st16:
    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_MMWORD
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        mmA, mmC
    sub         ecx, byte 2*SIZEOF_MMWORD
    add         edi, byte 2*SIZEOF_MMWORD
    jmp         short .column_st4       ; skips the 1-MMWORD step
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmE
    sub         ecx, byte SIZEOF_MMWORD
    add         edi, byte SIZEOF_MMWORD
.column_st4:
    ; eax is loaded before the size test so that the word/byte steps below
    ; can use ax/al even when the dword store is skipped.  This overwrites
    ; the GOT address held in eax; the column loop is not re-entered from
    ; here, and .rowloop reloads it for the next row.
    movd        eax, mmA
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st2
    mov         dword [edi+0*SIZEOF_DWORD], eax
    psrlq       mmA, DWORD_BIT
    movd        eax, mmA
    sub         ecx, byte SIZEOF_DWORD
    add         edi, byte SIZEOF_DWORD
.column_st2:
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi+0*SIZEOF_WORD], ax
    shr         eax, WORD_BIT
    sub         ecx, byte SIZEOF_WORD
    add         edi, byte SIZEOF_WORD
.column_st1:
    cmp         ecx, byte SIZEOF_BYTE
    jb          short .nextrow
    mov         byte [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The fourth (filler) byte of every pixel is either all ones or zero,
    ; selected at assembly time.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)

    movq        mmC, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    movq        mmG, mmB
    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)

    movq        mmD, mmA
    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    movq        mmH, mmC
    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)

    ; Fewer than a full MMWORD of columns left -> trimmed tail store.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_MMWORD  ; inptr0
    add         ebx, byte SIZEOF_MMWORD  ; inptr1
    add         edx, byte SIZEOF_MMWORD  ; inptr2
    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    jmp         near .columnloop
    ALIGNX      16, 7

    ; Tail store: here ecx stays a column count.  Each step compares it
    ; against SIZEOF_MMWORD/2, /4 and /8 columns and writes 2 MMWORDs,
    ; 1 MMWORD and 1 dword respectively; mmA (then mmD) holds the next
    ; unwritten data.
.column_st16:
    cmp         ecx, byte SIZEOF_MMWORD/2
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        mmA, mmC
    movq        mmD, mmH
    sub         ecx, byte SIZEOF_MMWORD/2
    add         edi, byte 2*SIZEOF_MMWORD
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD/4
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmD
    sub         ecx, byte SIZEOF_MMWORD/4
    add         edi, byte 1*SIZEOF_MMWORD
.column_st4:
    cmp         ecx, byte SIZEOF_MMWORD/8
    jb          short .nextrow
    movd        dword [edi+0*SIZEOF_DWORD], mmA

%endif  ; RGB_PIXELSIZE ; ---------------

    ALIGNX      16, 7

.nextrow:
    ; Restore the per-row state saved at .rowloop, then step every
    ; row-pointer cursor to the next row.
    pop         ecx
    pop         esi
    pop         ebx
    pop         edx
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    add         ebx, byte SIZEOF_JSAMPROW
    add         edx, byte SIZEOF_JSAMPROW
    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         eax                        ; num_rows
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    ; Epilogue: [ebp] holds the frame base stored by the prologue, so
    ; "pop esp" undoes the alignment and "pop ebp" restores the caller's ebp.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32