; jccolext-mmx.asm (16997B)
;
; jccolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
;                           int num_rows);
;
; Target: 32-bit x86, all arguments passed on the stack, plain "ret" (the
; caller removes the arguments).
;
; In:    img_width   number of pixel columns per row; 0 returns immediately
;        input_buf   array of input row pointers, RGB_PIXELSIZE bytes/pixel
;        output_buf  array of three row-pointer arrays; plane 0 receives Y,
;                    plane 1 receives Cb, plane 2 receives Cr
;        output_row  index of the first row to write in each output plane
;        num_rows    number of rows to convert; <= 0 returns immediately
; Out:   nothing (void)
; Saved: ebx, esi, edi, ebp
; Clobb: eax, ecx, edx, mm0-mm7, flags.  emms is executed before returning
;        on every path that has executed MMX instructions.
;
; Output is produced SIZEOF_MMWORD columns at a time and every store writes
; a whole MMWORD, including the final partial group of a row.  Each output
; row therefore needs room for the column count rounded up to a multiple of
; SIZEOF_MMWORD.  NOTE(review): confirm that callers guarantee this padding.
;
; The following names are not defined in this file and are presumably
; provided through jcolsamp.inc: the GLOBAL_FUNCTION / EXTN / PUSHPIC /
; POPPIC / MOVPIC / GET_GOT / GOTOFF / ALIGNX macros, the SIZEOF_* / *_BIT /
; SCALEBITS / RGB_PIXELSIZE constants, the PW_* / PD_* constant tables and
; the mmA..mmH register aliases.
;
; Stack frame after the prologue (ebp is aligned to SIZEOF_MMWORD):
;   [ebp]                   address that esp held right after "push ebp"
;   wk(0) .. wk(WK_NUM-1)   MMWORD scratch slots directly below ebp
;   gotptr                  pointer-sized slot directly below wk(0)
;

; Argument offsets.  (b) is the value esp held right after "push ebp":
; [b+0] = saved ebp, [b+4] = return address, [b+8] = first argument.
%define img_width(b)   (b) + 8          ; JDIMENSION img_width
%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
%define output_row(b)  (b) + 20         ; JDIMENSION output_row
%define num_rows(b)    (b) + 24         ; int num_rows

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM        8
%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)

EXTN(jsimd_rgb_ycc_convert_mmx):
    ; ---- Prologue: build an MMWORD-aligned frame ----
    push        ebp
    mov         eax, esp                    ; eax = argument base: [eax] = saved
                                            ; ebp, [eax+8] = first argument
    sub         esp, byte 4                 ; room for the pointer stored below
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; remembered for "pop esp" at exit
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; allocate wk[WK_NUM]
    PUSHPIC     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    GET_GOT     ebx                     ; get GOT address
    MOVPIC      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
    test        ecx, ecx
    jz          near .return            ; no MMX state touched yet, so
                                        ; skipping emms is safe

    push        ecx                     ; ecx is reused for output_row below

    ; edi/ebx/edx = address of the output_row-th row pointer in output
    ; planes 0/1/2.
    mov         esi, JSAMPIMAGE [output_buf(eax)]
    mov         ecx, JDIMENSION [output_row(eax)]
    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]

    pop         ecx                     ; ecx = num_cols again

    mov         esi, JSAMPARRAY [input_buf(eax)]
    mov         eax, INT [num_rows(eax)]    ; last use of the argument base
    test        eax, eax
    jle         near .return            ; signed test: num_rows is an int
    ALIGNX      16, 7

    ; ---- Row loop ----
    ; On entry to each iteration:
    ;   eax = rows remaining (> 0)      ecx = num_cols
    ;   esi = &input_buf[row]           edi = &output_buf[0][row]
    ;   ebx = &output_buf[1][row]       edx = &output_buf[2][row]
.rowloop:
    PUSHPIC     eax                     ; eax is reused for the GOT address
    push        edx
    push        ebx
    push        edi
    push        esi
    push        ecx                     ; col

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr0
    mov         ebx, JSAMPROW [ebx]     ; outptr1
    mov         edx, JSAMPROW [edx]     ; outptr2
    MOVPIC      eax, POINTER [gotptr]   ; load GOT address (eax)

    ; A row with at least SIZEOF_MMWORD columns starts in the full-group
    ; loop; a shorter row falls through into the partial loader.
    cmp         ecx, byte SIZEOF_MMWORD
    jae         short .columnloop
    ALIGNX      16, 7

    ; In the lane diagrams below, "XY" is byte X of pixel Y within the
    ; current group of pixels, listed from the lowest byte of the register
    ; to the highest.

%if RGB_PIXELSIZE == 3  ; ---------------

    ; ---- Partial group, 3 bytes per pixel ----
    ; Reached with 0 < ecx < SIZEOF_MMWORD columns left.  ecx is converted
    ; to a byte count and the tail of the row is gathered from its end
    ; towards its start, one chunk per set bit of that count (byte, word,
    ; dword, MMWORD, 2 MMWORDs).  Lanes beyond the last valid pixel keep
    ; whatever the registers held before, so the matching output bytes are
    ; unspecified.
.column_ld1:
    push        eax                     ; eax and edx serve as scratch here
    push        edx
    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         ecx, byte SIZEOF_BYTE
    xor         eax, eax
    mov         al, byte [esi+ecx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         ecx, byte SIZEOF_WORD
    xor         edx, edx
    mov         dx, word [esi+ecx]
    shl         eax, WORD_BIT           ; earlier byte moves above the word
    or          eax, edx
.column_ld4:
    movd        mmA, eax
    pop         edx
    pop         eax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         ecx, byte SIZEOF_DWORD
    movd        mmG, dword [esi+ecx]
    psllq       mmA, DWORD_BIT          ; earlier bytes move above the dword
    por         mmA, mmG
.column_ld8:
    ; mmA now holds the gathered tail.  Place it after the whole MMWORDs
    ; that precede it in the row.
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    movq        mmG, mmA                ; the tail is the second MMWORD
    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    mov         ecx, SIZEOF_MMWORD      ; makes the loop-end "sub" reach 0
    jmp         short .rgb_ycc_cnv
.column_ld16:
    test        cl, 2*SIZEOF_MMWORD
    mov         ecx, SIZEOF_MMWORD      ; mov keeps the flags set by "test"
    jz          short .rgb_ycc_cnv      ; the tail is the only MMWORD
    movq        mmF, mmA                ; the tail is the third MMWORD
    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    jmp         short .rgb_ycc_cnv
    ALIGNX      16, 7

    ; ---- Full group: SIZEOF_MMWORD pixels = 3 MMWORDs of input ----
.columnloop:
    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]

    ; De-interleave the packed pixels into one register per component and
    ; column parity, then zero-extend every byte to a word.
.rgb_ycc_cnv:
    ; mmA=(00 10 20 01 11 21 02 12)
    ; mmG=(22 03 13 23 04 14 24 05)
    ; mmF=(15 25 06 16 26 07 17 27)

    movq        mmD, mmA
    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)

    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)

    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)

    movq        mmE, mmA
    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)

    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)

    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)

    pxor        mmH, mmH                ; zero, used to widen bytes to words

    movq        mmC, mmA
    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)

    movq        mmB, mmE
    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)

    movq        mmF, mmD
    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; ---- Partial group, 4 bytes per pixel ----
    ; Reached with 0 < ecx < SIZEOF_MMWORD columns left.  The bits of the
    ; column count select chunks of 1, 2 and 4 pixels, gathered from the
    ; end of the row towards its start.  Lanes beyond the last valid pixel
    ; keep whatever the registers held before, so the matching output
    ; bytes are unspecified.
.column_ld1:
    test        cl, SIZEOF_MMWORD/8
    jz          short .column_ld2
    sub         ecx, byte SIZEOF_MMWORD/8
    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_MMWORD/4
    jz          short .column_ld4
    sub         ecx, byte SIZEOF_MMWORD/4
    movq        mmF, mmA
    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld4:
    test        cl, SIZEOF_MMWORD/2
    mov         ecx, SIZEOF_MMWORD      ; makes the loop-end "sub" reach 0;
                                        ; mov keeps the flags set by "test"
    jz          short .rgb_ycc_cnv
    movq        mmD, mmA
    movq        mmC, mmF
    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    jmp         short .rgb_ycc_cnv
    ALIGNX      16, 7

    ; ---- Full group: SIZEOF_MMWORD pixels = 4 MMWORDs of input ----
.columnloop:
    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]

    ; De-interleave the packed pixels into one register per component and
    ; column parity, then zero-extend every byte to a word.
.rgb_ycc_cnv:
    ; mmA=(00 10 20 30 01 11 21 31)
    ; mmF=(02 12 22 32 03 13 23 33)
    ; mmD=(04 14 24 34 05 15 25 35)
    ; mmC=(06 16 26 36 07 17 27 37)

    movq        mmB, mmA
    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)

    movq        mmG, mmD
    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)

    movq        mmE, mmA
    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)

    movq        mmH, mmB
    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)

    pxor        mmF, mmF                ; zero, used to widen bytes to words

    movq        mmC, mmA
    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)

    movq        mmD, mmB
    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)

    movq        mmG, mmE
    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)

    ; mmF is zero here and serves as both source of zeros and destination,
    ; so the data bytes land in the upper byte of each word and are then
    ; shifted down.
    punpcklbw   mmF, mmH
    punpckhbw   mmH, mmH
    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; From here on the code uses physical register names.  Which of
    ; mmA..mmH corresponds to which of mm0..mm7 is decided outside this
    ; file; the net effect is the following layout.

    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; The G term of Y is split in two so that Y is the sum of two pmaddwd
    ; products, (R,G) and (B,G).  NOTE(review): presumably this also keeps
    ; every coefficient within the signed 16-bit range pmaddwd needs;
    ; confirm against the definitions of SCALEBITS and the PW_* tables.
    ;
    ; Suffix legend: E/O = even/odd columns; L/H = low/high pair of words
    ; of a register after widening to dwords.

    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO

    ; ---- Odd columns: (R,G) products for Y and Cb ----
    movq        mm6, mm1
    punpcklwd   mm1, mm3
    punpckhwd   mm6, mm3
    movq        mm7, mm1
    movq        mm4, mm6
    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Interleaving B above a zero word gives B << 16 in each dword, and the
    ; shift by 1 leaves B << 15.  That equals B*FIX(0.5) provided SCALEBITS
    ; is 16 (SCALEBITS is defined outside this file).
    pxor        mm1, mm1
    pxor        mm6, mm6
    punpcklwd   mm1, mm5                ; mm1=BOL
    punpckhwd   mm6, mm5                ; mm6=BOH
    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)

    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]

    paddd       mm7, mm1
    paddd       mm4, mm6
    paddd       mm7, mm5
    paddd       mm4, mm5
    psrld       mm7, SCALEBITS          ; mm7=CbOL
    psrld       mm4, SCALEBITS          ; mm4=CbOH
    packssdw    mm7, mm4                ; mm7=CbO

    movq        mm1, MMWORD [wk(2)]     ; mm1=BE

    ; ---- Even columns: (R,G) products for Y and Cb ----
    movq        mm6, mm0
    punpcklwd   mm0, mm2
    punpckhwd   mm6, mm2
    movq        mm5, mm0
    movq        mm4, mm6
    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    pxor        mm0, mm0
    pxor        mm6, mm6
    punpcklwd   mm0, mm1                ; mm0=BEL
    punpckhwd   mm6, mm1                ; mm6=BEH
    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)

    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]

    paddd       mm5, mm0
    paddd       mm4, mm6
    paddd       mm5, mm1
    paddd       mm4, mm1
    psrld       mm5, SCALEBITS          ; mm5=CbEL
    psrld       mm4, SCALEBITS          ; mm4=CbEH
    packssdw    mm5, mm4                ; mm5=CbE

    ; Re-interleave: odd-column results go to the upper byte of each word,
    ; even-column results stay in the lower byte.
    psllw       mm7, BYTE_BIT
    por         mm5, mm7                ; mm5=Cb
    movq        MMWORD [ebx], mm5       ; Save Cb

    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
    movq        mm1, MMWORD [wk(1)]     ; mm1=RO

    ; ---- Odd columns: (B,G) products for Y and Cr ----
    movq        mm4, mm0
    punpcklwd   mm0, mm3
    punpckhwd   mm4, mm3
    movq        mm7, mm0
    movq        mm5, mm4
    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]

    ; Y = (R,G) partial sum saved in wk(4)/wk(5) + (B,G) partial sum.
    paddd       mm0, MMWORD [wk(4)]
    paddd       mm4, MMWORD [wk(5)]
    paddd       mm0, mm3
    paddd       mm4, mm3
    psrld       mm0, SCALEBITS          ; mm0=YOL
    psrld       mm4, SCALEBITS          ; mm4=YOH
    packssdw    mm0, mm4                ; mm0=YO

    pxor        mm3, mm3
    pxor        mm4, mm4
    punpcklwd   mm3, mm1                ; mm3=ROL
    punpckhwd   mm4, mm1                ; mm4=ROH
    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)

    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]

    paddd       mm7, mm3
    paddd       mm5, mm4
    paddd       mm7, mm1
    paddd       mm5, mm1
    psrld       mm7, SCALEBITS          ; mm7=CrOL
    psrld       mm5, SCALEBITS          ; mm5=CrOH
    packssdw    mm7, mm5                ; mm7=CrO

    movq        mm3, MMWORD [wk(0)]     ; mm3=RE

    ; ---- Even columns: (B,G) products for Y and Cr ----
    movq        mm4, mm6
    punpcklwd   mm6, mm2
    punpckhwd   mm4, mm2
    movq        mm1, mm6
    movq        mm5, mm4
    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]

    ; Y = (R,G) partial sum saved in wk(6)/wk(7) + (B,G) partial sum.
    paddd       mm6, MMWORD [wk(6)]
    paddd       mm4, MMWORD [wk(7)]
    paddd       mm6, mm2
    paddd       mm4, mm2
    psrld       mm6, SCALEBITS          ; mm6=YEL
    psrld       mm4, SCALEBITS          ; mm4=YEH
    packssdw    mm6, mm4                ; mm6=YE

    psllw       mm0, BYTE_BIT
    por         mm6, mm0                ; mm6=Y
    movq        MMWORD [edi], mm6       ; Save Y

    pxor        mm2, mm2
    pxor        mm4, mm4
    punpcklwd   mm2, mm3                ; mm2=REL
    punpckhwd   mm4, mm3                ; mm4=REH
    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)

    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]

    paddd       mm1, mm2
    paddd       mm5, mm4
    paddd       mm1, mm0
    paddd       mm5, mm0
    psrld       mm1, SCALEBITS          ; mm1=CrEL
    psrld       mm5, SCALEBITS          ; mm5=CrEH
    packssdw    mm1, mm5                ; mm1=CrE

    psllw       mm7, BYTE_BIT
    por         mm1, mm7                ; mm1=Cr
    movq        MMWORD [edx], mm1       ; Save Cr

    ; ---- Advance to the next group of SIZEOF_MMWORD columns ----
    ; After a partial group ecx was forced to SIZEOF_MMWORD, so the
    ; subtraction leaves 0 and the row ends.
    sub         ecx, byte SIZEOF_MMWORD
    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
    add         edi, byte SIZEOF_MMWORD                ; outptr0
    add         ebx, byte SIZEOF_MMWORD                ; outptr1
    add         edx, byte SIZEOF_MMWORD                ; outptr2
    cmp         ecx, byte SIZEOF_MMWORD
    jae         near .columnloop        ; unsigned: a full group remains
    test        ecx, ecx
    jnz         near .column_ld1        ; 1..SIZEOF_MMWORD-1 columns remain

    ; ---- Next row ----
    pop         ecx                     ; col
    pop         esi
    pop         edi
    pop         ebx
    pop         edx
    POPPIC      eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
    add         edi, byte SIZEOF_JSAMPROW
    add         ebx, byte SIZEOF_JSAMPROW
    add         edx, byte SIZEOF_JSAMPROW
    dec         eax                        ; num_rows
    jg          near .rowloop

    emms                                ; empty MMX state

    ; ---- Epilogue ----
.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32