jdcolext-mmi.c (15201B)
1 /* 2 * Loongson MMI optimizations for libjpeg-turbo 3 * 4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 5 * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved. 6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. 7 * All Rights Reserved. 8 * Authors: ZhuChen <zhuchen@loongson.cn> 9 * SunZhangzhi <sunzhangzhi-cq@loongson.cn> 10 * CaiWanwei <caiwanwei@loongson.cn> 11 * 12 * Based on the x86 SIMD extension for IJG JPEG library 13 * Copyright (C) 1999-2006, MIYASAKA Masaru. 14 * 15 * This software is provided 'as-is', without any express or implied 16 * warranty. In no event will the authors be held liable for any damages 17 * arising from the use of this software. 18 * 19 * Permission is granted to anyone to use this software for any purpose, 20 * including commercial applications, and to alter it and redistribute it 21 * freely, subject to the following restrictions: 22 * 23 * 1. The origin of this software must not be misrepresented; you must not 24 * claim that you wrote the original software. If you use this software 25 * in a product, an acknowledgment in the product documentation would be 26 * appreciated but is not required. 27 * 2. Altered source versions must be plainly marked as such, and must not be 28 * misrepresented as being the original software. 29 * 3. This notice may not be removed or altered from any source distribution. 30 */ 31 32 /* This file is included by jdcolor-mmi.c */ 33 34 35 #if RGB_RED == 0 36 #define mmA re 37 #define mmB ro 38 #elif RGB_GREEN == 0 39 #define mmA ge 40 #define mmB go 41 #elif RGB_BLUE == 0 42 #define mmA be 43 #define mmB bo 44 #else 45 #define mmA xe 46 #define mmB xo 47 #endif 48 49 #if RGB_RED == 1 50 #define mmC re 51 #define mmD ro 52 #elif RGB_GREEN == 1 53 #define mmC ge 54 #define mmD go 55 #elif RGB_BLUE == 1 56 #define mmC be 57 #define mmD bo 58 #else 59 #define mmC xe 60 #define mmD xo 61 #endif 62 63 #if RGB_RED == 2 64 #define mmE re 65 #define mmF ro 66 #elif RGB_GREEN == 2 67 #define mmE ge 68 #define mmF go 69 #elif RGB_BLUE == 2 70 #define mmE be 71 #define mmF bo 72 #else 73 #define mmE xe 74 #define mmF xo 75 #endif 76 77 #if RGB_RED == 3 78 #define mmG re 79 #define mmH ro 80 #elif RGB_GREEN == 3 81 #define mmG ge 82 #define mmH go 83 #elif RGB_BLUE == 3 84 #define mmG be 85 #define mmH bo 86 #else 87 #define mmG xe 88 #define mmH xo 89 #endif 90 91 92 void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, 93 JDIMENSION input_row, JSAMPARRAY output_buf, 94 int num_rows) 95 { 96 JSAMPROW outptr, inptr0, inptr1, inptr2; 97 int num_cols, col; 98 __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr; 99 __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0; 100 __m64 decenter, mask; 101 102 while (--num_rows >= 0) { 103 inptr0 = input_buf[0][input_row]; 104 inptr1 = input_buf[1][input_row]; 105 inptr2 = input_buf[2][input_row]; 106 input_row++; 107 outptr = *output_buf++; 108 109 for (num_cols = out_width; num_cols > 0; num_cols -= 8, 110 inptr0 += 8, inptr1 += 8, inptr2 += 8) { 111 112 cb = _mm_load_si64((__m64 *)inptr1); 113 cr = _mm_load_si64((__m64 *)inptr2); 114 y = _mm_load_si64((__m64 *)inptr0); 115 116 mask = decenter = 0.0; 117 mask = _mm_cmpeq_pi16(mask, mask); 118 decenter = _mm_cmpeq_pi16(decenter, decenter); 119 mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ 120 decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ 121 122 cbe = _mm_and_si64(mask, cb); /* Cb(0246) */ 123 cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */ 124 cre = _mm_and_si64(mask, cr); /* Cr(0246) */ 125 cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */ 126 cbe = _mm_add_pi16(cbe, decenter); 127 cbo = _mm_add_pi16(cbo, decenter); 128 cre = _mm_add_pi16(cre, decenter); 129 cro = _mm_add_pi16(cro, decenter); 130 131 /* (Original) 132 * R = Y + 1.40200 * Cr 133 * G = Y - 0.34414 * Cb - 0.71414 * Cr 134 * B = Y + 1.77200 * Cb 135 * 136 * (This implementation) 137 * R = Y + 0.40200 * Cr + Cr 138 * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr 139 * B = Y - 0.22800 * Cb + Cb + Cb 140 */ 141 142 cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */ 143 cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */ 144 cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */ 145 cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */ 146 147 be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800) */ 148 bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800) */ 149 re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */ 150 ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */ 151 152 be = _mm_add_pi16(be, PW_ONE); 153 bo = _mm_add_pi16(bo, PW_ONE); 154 be = _mm_srai_pi16(be, 1); /* (CbE * -FIX(0.22800)) */ 155 bo = _mm_srai_pi16(bo, 1); /* (CbO * -FIX(0.22800)) */ 156 re = _mm_add_pi16(re, PW_ONE); 157 ro = _mm_add_pi16(ro, PW_ONE); 158 re = _mm_srai_pi16(re, 1); /* (CrE * FIX(0.40200)) */ 159 ro = _mm_srai_pi16(ro, 1); /* (CrO * FIX(0.40200)) */ 160 161 be = _mm_add_pi16(be, cbe); 162 bo = _mm_add_pi16(bo, cbo); 163 be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */ 164 bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */ 165 re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */ 166 ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */ 167 168 gle = _mm_unpacklo_pi16(cbe, cre); 169 ghe = _mm_unpackhi_pi16(cbe, cre); 170 gle = _mm_madd_pi16(gle, PW_MF0344_F0285); 171 ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285); 172 glo = _mm_unpacklo_pi16(cbo, cro); 173 gho = _mm_unpackhi_pi16(cbo, cro); 174 glo = _mm_madd_pi16(glo, PW_MF0344_F0285); 175 gho = _mm_madd_pi16(gho, PW_MF0344_F0285); 176 177 gle = _mm_add_pi32(gle, PD_ONEHALF); 178 ghe = _mm_add_pi32(ghe, PD_ONEHALF); 179 gle = _mm_srai_pi32(gle, SCALEBITS); 180 ghe = _mm_srai_pi32(ghe, SCALEBITS); 181 glo = _mm_add_pi32(glo, PD_ONEHALF); 182 gho = _mm_add_pi32(gho, PD_ONEHALF); 183 glo = _mm_srai_pi32(glo, SCALEBITS); 184 gho = _mm_srai_pi32(gho, SCALEBITS); 185 186 ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */ 187 go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */ 188 ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */ 189 go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */ 190 191 ye = _mm_and_si64(mask, y); /* Y(0246) */ 192 yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */ 193 194 re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */ 195 ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */ 196 re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */ 197 ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */ 198 199 ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */ 200 go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */ 201 ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */ 202 go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */ 203 204 be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */ 205 bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */ 206 be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */ 207 bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */ 208 209 #if RGB_PIXELSIZE == 3 210 211 /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ 212 /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ 213 mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ 214 mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */ 215 mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */ 216 217 mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT); 218 219 mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */ 220 mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */ 221 222 mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); 223 mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */ 224 225 mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */ 226 mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */ 227 228 mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */ 229 mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */ 230 231 mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */ 232 mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */ 233 mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */ 234 235 if (num_cols >= 8) { 236 if (!(((long)outptr) & 7)) { 237 _mm_store_si64((__m64 *)outptr, mmA); 238 _mm_store_si64((__m64 *)(outptr + 8), mmE); 239 _mm_store_si64((__m64 *)(outptr + 16), mmC); 240 } else { 241 _mm_storeu_si64((__m64 *)outptr, mmA); 242 _mm_storeu_si64((__m64 *)(outptr + 8), mmE); 243 _mm_storeu_si64((__m64 *)(outptr + 16), mmC); 244 } 245 outptr += RGB_PIXELSIZE * 8; 246 } else { 247 col = num_cols * 3; 248 asm(".set noreorder\r\n" 249 250 "li $8, 16\r\n" 251 "move $9, %4\r\n" 252 "mov.s $f4, %1\r\n" 253 "mov.s $f6, %3\r\n" 254 "move $10, %5\r\n" 255 "bltu $9, $8, 1f\r\n" 256 "nop \r\n" 257 "gssdlc1 $f4, 7($10)\r\n" 258 "gssdrc1 $f4, 0($10)\r\n" 259 "gssdlc1 $f6, 7+8($10)\r\n" 260 "gssdrc1 $f6, 8($10)\r\n" 261 "mov.s $f4, %2\r\n" 262 "subu $9, $9, 16\r\n" 263 PTR_ADDU "$10, $10, 16\r\n" 264 "b 2f\r\n" 265 "nop \r\n" 266 267 "1: \r\n" 268 "li $8, 8\r\n" /* st8 */ 269 "bltu $9, $8, 2f\r\n" 270 "nop \r\n" 271 "gssdlc1 $f4, 7($10)\r\n" 272 "gssdrc1 $f4, 0($10)\r\n" 273 "mov.s $f4, %3\r\n" 274 "subu $9, $9, 8\r\n" 275 PTR_ADDU "$10, $10, 8\r\n" 276 277 "2: \r\n" 278 "li $8, 4\r\n" /* st4 */ 279 "mfc1 $11, $f4\r\n" 280 "bltu $9, $8, 3f\r\n" 281 "nop \r\n" 282 "swl $11, 3($10)\r\n" 283 "swr $11, 0($10)\r\n" 284 "li $8, 32\r\n" 285 "mtc1 $8, $f6\r\n" 286 "dsrl $f4, $f4, $f6\r\n" 287 "mfc1 $11, $f4\r\n" 288 "subu $9, $9, 4\r\n" 289 PTR_ADDU "$10, $10, 4\r\n" 290 291 "3: \r\n" 292 "li $8, 2\r\n" /* st2 */ 293 "bltu $9, $8, 4f\r\n" 294 "nop \r\n" 295 "ush $11, 0($10)\r\n" 296 "srl $11, 16\r\n" 297 "subu $9, $9, 2\r\n" 298 PTR_ADDU "$10, $10, 2\r\n" 299 300 "4: \r\n" 301 "li $8, 1\r\n" /* st1 */ 302 "bltu $9, $8, 5f\r\n" 303 "nop \r\n" 304 "sb $11, 0($10)\r\n" 305 306 "5: \r\n" 307 "nop \r\n" /* end */ 308 : "=m" (*outptr) 309 : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr) 310 : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory" 311 ); 312 } 313 314 #else /* RGB_PIXELSIZE == 4 */ 315 316 #ifdef RGBX_FILLER_0XFF 317 xe = _mm_cmpeq_pi8(xe, xe); 318 xo = _mm_cmpeq_pi8(xo, xo); 319 #else 320 xe = _mm_xor_si64(xe, xe); 321 xo = _mm_xor_si64(xo, xo); 322 #endif 323 /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ 324 /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ 325 /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */ 326 /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */ 327 328 mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ 329 mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */ 330 mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */ 331 mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */ 332 333 mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */ 334 mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */ 335 mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */ 336 mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */ 337 338 mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */ 339 mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */ 340 mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */ 341 mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */ 342 343 if (num_cols >= 8) { 344 if (!(((long)outptr) & 7)) { 345 _mm_store_si64((__m64 *)outptr, mmA); 346 _mm_store_si64((__m64 *)(outptr + 8), mmD); 347 _mm_store_si64((__m64 *)(outptr + 16), mmC); 348 _mm_store_si64((__m64 *)(outptr + 24), mmH); 349 } else { 350 _mm_storeu_si64((__m64 *)outptr, mmA); 351 _mm_storeu_si64((__m64 *)(outptr + 8), mmD); 352 _mm_storeu_si64((__m64 *)(outptr + 16), mmC); 353 _mm_storeu_si64((__m64 *)(outptr + 24), mmH); 354 } 355 outptr += RGB_PIXELSIZE * 8; 356 } else { 357 col = num_cols; 358 asm(".set noreorder\r\n" /* st16 */ 359 360 "li $8, 4\r\n" 361 "move $9, %6\r\n" 362 "move $10, %7\r\n" 363 "mov.s $f4, %2\r\n" 364 "mov.s $f6, %4\r\n" 365 "bltu $9, $8, 1f\r\n" 366 "nop \r\n" 367 "gssdlc1 $f4, 7($10)\r\n" 368 "gssdrc1 $f4, 0($10)\r\n" 369 "gssdlc1 $f6, 7+8($10)\r\n" 370 "gssdrc1 $f6, 8($10)\r\n" 371 "mov.s $f4, %3\r\n" 372 "mov.s $f6, %5\r\n" 373 "subu $9, $9, 4\r\n" 374 PTR_ADDU "$10, $10, 16\r\n" 375 376 "1: \r\n" 377 "li $8, 2\r\n" /* st8 */ 378 "bltu $9, $8, 2f\r\n" 379 "nop \r\n" 380 "gssdlc1 $f4, 7($10)\r\n" 381 "gssdrc1 $f4, 0($10)\r\n" 382 "mov.s $f4, $f6\r\n" 383 "subu $9, $9, 2\r\n" 384 PTR_ADDU "$10, $10, 8\r\n" 385 386 "2: \r\n" 387 "li $8, 1\r\n" /* st4 */ 388 "bltu $9, $8, 3f\r\n" 389 "nop \r\n" 390 "gsswlc1 $f4, 3($10)\r\n" 391 "gsswrc1 $f4, 0($10)\r\n" 392 393 "3: \r\n" 394 "li %1, 0\r\n" /* end */ 395 : "=m" (*outptr), "=r" (col) 396 : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col), 397 "r" (outptr) 398 : "$f4", "$f6", "$8", "$9", "$10", "memory" 399 ); 400 } 401 402 #endif 403 404 } 405 } 406 } 407 408 #undef mmA 409 #undef mmB 410 #undef mmC 411 #undef mmD 412 #undef mmE 413 #undef mmF 414 #undef mmG 415 #undef mmH