jccolext-mmi.c (14239B)
/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           ZhangLixia  <zhanglixia-hf@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-mmi.c */


/* The mmA..mmH aliases bind the per-channel even/odd registers (re/ro = red,
 * ge/go = green, be/bo = blue, xe/xo = the unused/alpha channel) to the byte
 * position each channel occupies in memory for the current pixel format:
 * mmA/mmB alias the channel stored at byte offset 0, mmC/mmD at offset 1,
 * mmE/mmF at offset 2, and mmG/mmH at offset 3.  This lets the load and
 * deinterleave code below be written once per RGB_PIXELSIZE, independent of
 * the RGB_RED/RGB_GREEN/RGB_BLUE ordering of the including file.
 */
#if RGB_RED == 0
#define mmA  re
#define mmB  ro
#elif RGB_GREEN == 0
#define mmA  ge
#define mmB  go
#elif RGB_BLUE == 0
#define mmA  be
#define mmB  bo
#else
#define mmA  xe
#define mmB  xo
#endif

#if RGB_RED == 1
#define mmC  re
#define mmD  ro
#elif RGB_GREEN == 1
#define mmC  ge
#define mmD  go
#elif RGB_BLUE == 1
#define mmC  be
#define mmD  bo
#else
#define mmC  xe
#define mmD  xo
#endif

#if RGB_RED == 2
#define mmE  re
#define mmF  ro
#elif RGB_GREEN == 2
#define mmE  ge
#define mmF  go
#elif RGB_BLUE == 2
#define mmE  be
#define mmF  bo
#else
#define mmE  xe
#define mmF  xo
#endif

#if RGB_RED == 3
#define mmG  re
#define mmH  ro
#elif RGB_GREEN == 3
#define mmG  ge
#define mmH  go
#elif RGB_BLUE == 3
#define mmG  be
#define mmH  bo
#else
#define mmG  xe
#define mmH  xo
#endif


/*
 * Convert num_rows rows of packed RGB samples to the YCbCr colorspace,
 * 8 pixels per iteration, using Loongson MMI intrinsics.
 *
 * image_width  number of pixels per row
 * input_buf    array of input row pointers (packed RGB, RGB_PIXELSIZE
 *              bytes per pixel); advanced one row per outer iteration
 * output_buf   three component planes (Y, Cb, Cr); row output_row and
 *              following rows are written
 * output_row   first row index to write in each output plane
 * num_rows     number of rows to convert
 *
 * Fixed-point coefficient pairs (PW_F0299_F0337 etc.), rounding constants
 * (PD_ONEHALF, PD_ONEHALFM1_CJ), and SCALEBITS are defined by the including
 * file, jccolor-mmi.c.  NOTE(review): presumably SCALEBITS == 16 and
 * PD_ONEHALFM1_CJ folds in CENTERJSAMPLE << SCALEBITS, matching the other
 * libjpeg-turbo SIMD ports — confirm against jccolor-mmi.c.
 */
void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
                               JSAMPIMAGE output_buf, JDIMENSION output_row,
                               int num_rows)
{
  JSAMPROW inptr, outptr0, outptr1, outptr2;
  int num_cols, col;
  /* re/ge/be = even-numbered samples of each channel, ro/go/bo = odd ones
   * (see the layout comment ahead of the conversion math below).
   */
  __m64 re, ro, ge, go, be, bo, xe;
#if RGB_PIXELSIZE == 4
  __m64 xo;
#endif
  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
  __m64 crle, crhe, cre, crlo, crho, cro, cr;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    /* Process the row in groups of 8 pixels.  The final group may cover
     * fewer than 8 real pixels; the asm blocks below load only the bytes
     * that actually exist, but the stores at the bottom always write 8
     * samples.  NOTE(review): this relies on the output rows being padded
     * to a multiple of 8 samples — confirm with the buffer allocator.
     */
    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
         outptr0 += 8, outptr1 += 8, outptr2 += 8) {

#if RGB_PIXELSIZE == 3

      if (num_cols < 8) {
        /* Partial group: col = number of remaining bytes (3 per pixel,
         * so 3..21).  The asm decomposes col into its 1/2/4/8/16 bits and
         * uses correspondingly sized loads (lbu/lhu/lwu/ldc1) from the end
         * of the data backwards, so it never reads past the end of the
         * input row.  Results land in %0 (mmA), %1 (mmG), %2 (mmF) — the
         * same registers the full-speed path fills below.
         * NOTE(review): input %4 (num_rows) is never referenced inside the
         * asm body; it appears to be a vestigial operand.
         */
        col = num_cols * 3;
        asm(".set noreorder\r\n"

            "li $8, 1\r\n"
            "move $9, %3\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 1f\r\n"
            "nop \r\n"                    /* branch delay slot */
            "subu $9, $9, 1\r\n"
            "xor $12, $12, $12\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $9\r\n"
            "lbu $12, 0($13)\r\n"         /* load trailing single byte */

            "1: \r\n"
            "li $8, 2\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 2f\r\n"
            "nop \r\n"
            "subu $9, $9, 2\r\n"
            "xor $11, $11, $11\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $9\r\n"
            "lhu $11, 0($13)\r\n"         /* load trailing halfword */
            "sll $12, $12, 16\r\n"
            "or $12, $12, $11\r\n"

            "2: \r\n"
            "dmtc1 $12, %0\r\n"
            "li $8, 4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 3f\r\n"
            "nop \r\n"
            "subu $9, $9, 4\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $9\r\n"
            "lwu $14, 0($13)\r\n"         /* load trailing word */
            "dmtc1 $14, %1\r\n"
            "dsll32 $12, $12, 0\r\n"
            "or $12, $12, $14\r\n"
            "dmtc1 $12, %0\r\n"

            "3: \r\n"
            "li $8, 8\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 4f\r\n"
            "nop \r\n"
            "mov.s %1, %0\r\n"
            "ldc1 %0, 0(%5)\r\n"          /* 8 leading bytes */
            "li $9, 8\r\n"
            "j 5f\r\n"
            "nop \r\n"

            "4: \r\n"
            "li $8, 16\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 5f\r\n"
            "nop \r\n"
            "mov.s %2, %0\r\n"
            "ldc1 %0, 0(%5)\r\n"          /* 16 leading bytes */
            "ldc1 %1, 8(%5)\r\n"

            "5: \r\n"
            "nop \r\n"
            ".set reorder\r\n"

            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
            : "r" (col), "r" (num_rows), "r" (inptr)
            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
              "$14", "memory"
           );
      } else {
        /* Full group: 24 bytes = 8 RGB pixels.  Use aligned loads when
         * inptr is 8-byte-aligned, unaligned loads otherwise.
         */
        if (!(((long)inptr) & 7)) {
          mmA = _mm_load_si64((__m64 *)&inptr[0]);
          mmG = _mm_load_si64((__m64 *)&inptr[8]);
          mmF = _mm_load_si64((__m64 *)&inptr[16]);
        } else {
          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
        }
        inptr += RGB_PIXELSIZE * 8;
      }
      /* Transpose the 24 interleaved bytes (R0 G0 B0 R1 ...) into planar
       * channel registers via shift/unpack steps.  The statement order is
       * significant: each step consumes the previous step's layout.
       */
      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);

      mmA = _mm_unpackhi_pi8(mmA, mmG);
      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);

      mmD = _mm_unpacklo_pi8(mmD, mmF);
      mmG = _mm_unpackhi_pi8(mmG, mmF);

      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);

      mmA = _mm_unpackhi_pi8(mmA, mmD);
      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);

      mmE = _mm_unpacklo_pi8(mmE, mmG);
      mmD = _mm_unpackhi_pi8(mmD, mmG);
      /* Widen each channel's bytes to 16-bit words (even/odd split). */
      mmC = _mm_loadhi_pi8_f(mmA);
      mmA = _mm_loadlo_pi8_f(mmA);

      mmB = _mm_loadhi_pi8_f(mmE);
      mmE = _mm_loadlo_pi8_f(mmE);

      mmF = _mm_loadhi_pi8_f(mmD);
      mmD = _mm_loadlo_pi8_f(mmD);

#else /* RGB_PIXELSIZE == 4 */

      if (num_cols < 8) {
        /* Partial group: col = remaining pixel count (1..7), 4 bytes per
         * pixel.  Decompose col into its 1/2/4 bits and load one pixel
         * (lwc1), two pixels (ldc1), or four pixels (2 x ldc1) from the
         * end of the data backwards, filling %0..%3 (mmA, mmF, mmD, mmC).
         */
        col = num_cols;
        asm(".set noreorder\r\n"

            "li $8, 1\r\n"
            "move $9, %4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 1f\r\n"
            "nop \r\n"                    /* branch delay slot */
            "subu $9, $9, 1\r\n"
            PTR_SLL "$11, $9, 2\r\n"      /* byte offset = pixel index * 4 */
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $11\r\n"
            "lwc1 %0, 0($13)\r\n"         /* trailing single pixel */

            "1: \r\n"
            "li $8, 2\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 2f\r\n"
            "nop \r\n"
            "subu $9, $9, 2\r\n"
            PTR_SLL "$11, $9, 2\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $11\r\n"
            "mov.s %1, %0\r\n"
            "ldc1 %0, 0($13)\r\n"         /* trailing pixel pair */

            "2: \r\n"
            "li $8, 4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 3f\r\n"
            "nop \r\n"
            "mov.s %2, %0\r\n"
            "mov.s %3, %1\r\n"
            "ldc1 %0, 0(%5)\r\n"          /* 4 leading pixels */
            "ldc1 %1, 8(%5)\r\n"

            "3: \r\n"
            "nop \r\n"
            ".set reorder\r\n"

            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
            : "r" (col), "r" (inptr)
            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
           );
      } else {
        /* Full group: 32 bytes = 8 RGBX pixels, aligned loads if possible. */
        if (!(((long)inptr) & 7)) {
          mmA = _mm_load_si64((__m64 *)&inptr[0]);
          mmF = _mm_load_si64((__m64 *)&inptr[8]);
          mmD = _mm_load_si64((__m64 *)&inptr[16]);
          mmC = _mm_load_si64((__m64 *)&inptr[24]);
        } else {
          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
        }
        inptr += RGB_PIXELSIZE * 8;
      }
      /* Byte/word unpack cascade: transpose 8 interleaved 4-byte pixels
       * into planar channel registers.  Order-sensitive, as above.
       */
      mmB = _mm_unpackhi_pi8(mmA, mmF);
      mmA = _mm_unpacklo_pi8(mmA, mmF);

      mmG = _mm_unpackhi_pi8(mmD, mmC);
      mmD = _mm_unpacklo_pi8(mmD, mmC);

      mmE = _mm_unpackhi_pi16(mmA, mmD);
      mmA = _mm_unpacklo_pi16(mmA, mmD);

      mmH = _mm_unpackhi_pi16(mmB, mmG);
      mmB = _mm_unpacklo_pi16(mmB, mmG);

      /* Widen each channel's bytes to 16-bit words (even/odd split). */
      mmC = _mm_loadhi_pi8_f(mmA);
      mmA = _mm_loadlo_pi8_f(mmA);

      mmD = _mm_loadhi_pi8_f(mmB);
      mmB = _mm_loadlo_pi8_f(mmB);

      mmG = _mm_loadhi_pi8_f(mmE);
      mmE = _mm_loadlo_pi8_f(mmE);

      /* Fourth channel: duplicate each byte then shift right to
       * zero-extend it into 16-bit words.
       */
      mmF = _mm_unpacklo_pi8(mmH, mmH);
      mmH = _mm_unpackhi_pi8(mmH, mmH);
      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
      mmH = _mm_srli_pi16(mmH, BYTE_BIT);

#endif

      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
       *
       * (Original)
       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       *
       * (This implementation)
       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       */

      /* ---- Odd samples: Cb, plus the R*G partial sums for Y ---- */
      rglo = _mm_unpacklo_pi16(ro, go);
      rgho = _mm_unpackhi_pi16(ro, go);
      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);   /* 0.299R + 0.337G */
      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);     /* -0.168R - 0.331G */
      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);

      /* 0.5 * B term: _mm_load{lo,hi}_pi16_f appears to place each 16-bit
       * value in the upper half of a 32-bit lane (i.e. B << 16), so a
       * right shift by 1 yields 0.5*B in SCALEBITS fixed point —
       * TODO(review): confirm against loongson-mmintrin.h.
       */
      blo = _mm_loadlo_pi16_f(bo);
      bho = _mm_loadhi_pi16_f(bo);
      halfblo = _mm_srli_pi32(blo, 1);
      halfbho = _mm_srli_pi32(bho, 1);

      cblo = _mm_add_pi32(cblo, halfblo);
      cbho = _mm_add_pi32(cbho, halfbho);
      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);     /* round + center */
      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
      cblo = _mm_srli_pi32(cblo, SCALEBITS);
      cbho = _mm_srli_pi32(cbho, SCALEBITS);
      cbo = _mm_packs_pi32(cblo, cbho);

      /* ---- Even samples: Cb, plus the R*G partial sums for Y ---- */
      rgle = _mm_unpacklo_pi16(re, ge);
      rghe = _mm_unpackhi_pi16(re, ge);
      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);

      ble = _mm_loadlo_pi16_f(be);
      bhe = _mm_loadhi_pi16_f(be);
      halfble = _mm_srli_pi32(ble, 1);
      halfbhe = _mm_srli_pi32(bhe, 1);

      cble = _mm_add_pi32(cble, halfble);
      cbhe = _mm_add_pi32(cbhe, halfbhe);
      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
      cble = _mm_srli_pi32(cble, SCALEBITS);
      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
      cbe = _mm_packs_pi32(cble, cbhe);

      /* Re-interleave: odd samples into the high byte of each 16-bit lane,
       * even samples in the low byte, giving Cb0..Cb7 in sample order.
       */
      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
      cb = _mm_or_si64(cbe, cbo);

      /* ---- Odd samples: Y (B*G partial sums) and Cr ---- */
      bglo = _mm_unpacklo_pi16(bo, go);
      bgho = _mm_unpackhi_pi16(bo, go);
      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);   /* 0.114B + 0.250G */
      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);     /* -0.081B - 0.419G */
      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);

      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
      yho = _mm_add_pi32(yho_bg, yho_rg);
      ylo = _mm_add_pi32(ylo, PD_ONEHALF);            /* round */
      yho = _mm_add_pi32(yho, PD_ONEHALF);
      ylo = _mm_srli_pi32(ylo, SCALEBITS);
      yho = _mm_srli_pi32(yho, SCALEBITS);
      yo = _mm_packs_pi32(ylo, yho);

      /* 0.5 * R term for Cr, same shift trick as for B above. */
      rlo = _mm_loadlo_pi16_f(ro);
      rho = _mm_loadhi_pi16_f(ro);
      halfrlo = _mm_srli_pi32(rlo, 1);
      halfrho = _mm_srli_pi32(rho, 1);

      crlo = _mm_add_pi32(crlo, halfrlo);
      crho = _mm_add_pi32(crho, halfrho);
      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
      crlo = _mm_srli_pi32(crlo, SCALEBITS);
      crho = _mm_srli_pi32(crho, SCALEBITS);
      cro = _mm_packs_pi32(crlo, crho);

      /* ---- Even samples: Y and Cr ---- */
      bgle = _mm_unpacklo_pi16(be, ge);
      bghe = _mm_unpackhi_pi16(be, ge);
      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);

      yle = _mm_add_pi32(yle_bg, yle_rg);
      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
      yle = _mm_add_pi32(yle, PD_ONEHALF);
      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
      yle = _mm_srli_pi32(yle, SCALEBITS);
      yhe = _mm_srli_pi32(yhe, SCALEBITS);
      ye = _mm_packs_pi32(yle, yhe);

      /* Re-interleave even/odd Y samples into sample order. */
      yo = _mm_slli_pi16(yo, BYTE_BIT);
      y = _mm_or_si64(ye, yo);

      rle = _mm_loadlo_pi16_f(re);
      rhe = _mm_loadhi_pi16_f(re);
      halfrle = _mm_srli_pi32(rle, 1);
      halfrhe = _mm_srli_pi32(rhe, 1);

      crle = _mm_add_pi32(crle, halfrle);
      crhe = _mm_add_pi32(crhe, halfrhe);
      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
      crle = _mm_srli_pi32(crle, SCALEBITS);
      crhe = _mm_srli_pi32(crhe, SCALEBITS);
      cre = _mm_packs_pi32(crle, crhe);

      /* Re-interleave even/odd Cr samples into sample order. */
      cro = _mm_slli_pi16(cro, BYTE_BIT);
      cr = _mm_or_si64(cre, cro);

      /* Store 8 samples into each component plane (always a full 8 bytes;
       * see the padding note on the loop above).
       */
      _mm_store_si64((__m64 *)&outptr0[0], y);
      _mm_store_si64((__m64 *)&outptr1[0], cb);
      _mm_store_si64((__m64 *)&outptr2[0], cr);
    }
  }
}

#undef mmA
#undef mmB
#undef mmC
#undef mmD
#undef mmE
#undef mmF
#undef mmG
#undef mmH