jcgryext-mmi.c (10948B)
1 /* 2 * Loongson MMI optimizations for libjpeg-turbo 3 * 4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 5 * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved. 6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. 7 * All Rights Reserved. 8 * Authors: ZhangLixia <zhanglixia-hf@loongson.cn> 9 * 10 * Based on the x86 SIMD extension for IJG JPEG library 11 * Copyright (C) 1999-2006, MIYASAKA Masaru. 12 * 13 * This software is provided 'as-is', without any express or implied 14 * warranty. In no event will the authors be held liable for any damages 15 * arising from the use of this software. 16 * 17 * Permission is granted to anyone to use this software for any purpose, 18 * including commercial applications, and to alter it and redistribute it 19 * freely, subject to the following restrictions: 20 * 21 * 1. The origin of this software must not be misrepresented; you must not 22 * claim that you wrote the original software. If you use this software 23 * in a product, an acknowledgment in the product documentation would be 24 * appreciated but is not required. 25 * 2. Altered source versions must be plainly marked as such, and must not be 26 * misrepresented as being the original software. 27 * 3. This notice may not be removed or altered from any source distribution. 28 */ 29 30 /* This file is included by jcgray-mmi.c */ 31 32 33 #if RGB_RED == 0 34 #define mmA re 35 #define mmB ro 36 #elif RGB_GREEN == 0 37 #define mmA ge 38 #define mmB go 39 #elif RGB_BLUE == 0 40 #define mmA be 41 #define mmB bo 42 #else 43 #define mmA xe 44 #define mmB xo 45 #endif 46 47 #if RGB_RED == 1 48 #define mmC re 49 #define mmD ro 50 #elif RGB_GREEN == 1 51 #define mmC ge 52 #define mmD go 53 #elif RGB_BLUE == 1 54 #define mmC be 55 #define mmD bo 56 #else 57 #define mmC xe 58 #define mmD xo 59 #endif 60 61 #if RGB_RED == 2 62 #define mmE re 63 #define mmF ro 64 #elif RGB_GREEN == 2 65 #define mmE ge 66 #define mmF go 67 #elif RGB_BLUE == 2 68 #define mmE be 69 #define mmF bo 70 #else 71 #define mmE xe 72 #define mmF xo 73 #endif 74 75 #if RGB_RED == 3 76 #define mmG re 77 #define mmH ro 78 #elif RGB_GREEN == 3 79 #define mmG ge 80 #define mmH go 81 #elif RGB_BLUE == 3 82 #define mmG be 83 #define mmH bo 84 #else 85 #define mmG xe 86 #define mmH xo 87 #endif 88 89 90 void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, 91 JSAMPIMAGE output_buf, JDIMENSION output_row, 92 int num_rows) 93 { 94 JSAMPROW inptr, outptr; 95 int num_cols, col; 96 __m64 re, ro, ge, go, be, bo, xe; 97 #if RGB_PIXELSIZE == 4 98 __m64 xo; 99 #endif 100 __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho; 101 __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye; 102 __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y; 103 104 while (--num_rows >= 0) { 105 inptr = *input_buf++; 106 outptr = output_buf[0][output_row]; 107 output_row++; 108 109 for (num_cols = image_width; num_cols > 0; num_cols -= 8, 110 outptr += 8) { 111 112 #if RGB_PIXELSIZE == 3 113 114 if (num_cols < 8) { 115 col = num_cols * 3; 116 asm(".set noreorder\r\n" 117 118 "li $8, 1\r\n" 119 "move $9, %3\r\n" 120 "and $10, $9, $8\r\n" 121 "beqz $10, 1f\r\n" 122 "nop \r\n" 123 "subu $9, $9, 1\r\n" 124 "xor $12, $12, $12\r\n" 125 "move $13, %5\r\n" 126 PTR_ADDU "$13, $13, $9\r\n" 127 "lbu $12, 0($13)\r\n" 128 129 "1: \r\n" 130 "li $8, 2\r\n" 131 "and $10, $9, $8\r\n" 132 "beqz $10, 2f\r\n" 133 "nop \r\n" 134 "subu $9, $9, 2\r\n" 135 "xor $11, $11, $11\r\n" 136 "move $13, %5\r\n" 137 PTR_ADDU "$13, $13, $9\r\n" 138 "lhu $11, 0($13)\r\n" 139 "sll $12, $12, 16\r\n" 140 "or $12, $12, $11\r\n" 141 142 "2: \r\n" 143 "dmtc1 $12, %0\r\n" 144 "li $8, 4\r\n" 145 "and $10, $9, $8\r\n" 146 "beqz $10, 3f\r\n" 147 "nop \r\n" 148 "subu $9, $9, 4\r\n" 149 "move $13, %5\r\n" 150 PTR_ADDU "$13, $13, $9\r\n" 151 "lwu $14, 0($13)\r\n" 152 "dmtc1 $14, %1\r\n" 153 "dsll32 $12, $12, 0\r\n" 154 "or $12, $12, $14\r\n" 155 "dmtc1 $12, %0\r\n" 156 157 "3: \r\n" 158 "li $8, 8\r\n" 159 "and $10, $9, $8\r\n" 160 "beqz $10, 4f\r\n" 161 "nop \r\n" 162 "mov.s %1, %0\r\n" 163 "ldc1 %0, 0(%5)\r\n" 164 "li $9, 8\r\n" 165 "j 5f\r\n" 166 "nop \r\n" 167 168 "4: \r\n" 169 "li $8, 16\r\n" 170 "and $10, $9, $8\r\n" 171 "beqz $10, 5f\r\n" 172 "nop \r\n" 173 "mov.s %2, %0\r\n" 174 "ldc1 %0, 0(%5)\r\n" 175 "ldc1 %1, 8(%5)\r\n" 176 177 "5: \r\n" 178 "nop \r\n" 179 ".set reorder\r\n" 180 181 : "=f" (mmA), "=f" (mmG), "=f" (mmF) 182 : "r" (col), "r" (num_rows), "r" (inptr) 183 : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13", 184 "$14", "memory" 185 ); 186 } else { 187 if (!(((long)inptr) & 7)) { 188 mmA = _mm_load_si64((__m64 *)&inptr[0]); 189 mmG = _mm_load_si64((__m64 *)&inptr[8]); 190 mmF = _mm_load_si64((__m64 *)&inptr[16]); 191 } else { 192 mmA = _mm_loadu_si64((__m64 *)&inptr[0]); 193 mmG = _mm_loadu_si64((__m64 *)&inptr[8]); 194 mmF = _mm_loadu_si64((__m64 *)&inptr[16]); 195 } 196 inptr += RGB_PIXELSIZE * 8; 197 } 198 mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT); 199 mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); 200 201 mmA = _mm_unpackhi_pi8(mmA, mmG); 202 mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT); 203 204 mmD = _mm_unpacklo_pi8(mmD, mmF); 205 mmG = _mm_unpackhi_pi8(mmG, mmF); 206 207 mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT); 208 mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); 209 210 mmA = _mm_unpackhi_pi8(mmA, mmD); 211 mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT); 212 213 mmE = _mm_unpacklo_pi8(mmE, mmG); 214 mmD = _mm_unpackhi_pi8(mmD, mmG); 215 mmC = _mm_loadhi_pi8_f(mmA); 216 mmA = _mm_loadlo_pi8_f(mmA); 217 218 mmB = _mm_loadhi_pi8_f(mmE); 219 mmE = _mm_loadlo_pi8_f(mmE); 220 221 mmF = _mm_loadhi_pi8_f(mmD); 222 mmD = _mm_loadlo_pi8_f(mmD); 223 224 #else /* RGB_PIXELSIZE == 4 */ 225 226 if (num_cols < 8) { 227 col = num_cols; 228 asm(".set noreorder\r\n" 229 230 "li $8, 1\r\n" 231 "move $9, %4\r\n" 232 "and $10, $9, $8\r\n" 233 "beqz $10, 1f\r\n" 234 "nop \r\n" 235 "subu $9, $9, 1\r\n" 236 PTR_SLL "$11, $9, 2\r\n" 237 "move $13, %5\r\n" 238 PTR_ADDU "$13, $13, $11\r\n" 239 "lwc1 %0, 0($13)\r\n" 240 241 "1: \r\n" 242 "li $8, 2\r\n" 243 "and $10, $9, $8\r\n" 244 "beqz $10, 2f\r\n" 245 "nop \r\n" 246 "subu $9, $9, 2\r\n" 247 PTR_SLL "$11, $9, 2\r\n" 248 "move $13, %5\r\n" 249 PTR_ADDU "$13, $13, $11\r\n" 250 "mov.s %1, %0\r\n" 251 "ldc1 %0, 0($13)\r\n" 252 253 "2: \r\n" 254 "li $8, 4\r\n" 255 "and $10, $9, $8\r\n" 256 "beqz $10, 3f\r\n" 257 "nop \r\n" 258 "mov.s %2, %0\r\n" 259 "mov.s %3, %1\r\n" 260 "ldc1 %0, 0(%5)\r\n" 261 "ldc1 %1, 8(%5)\r\n" 262 263 "3: \r\n" 264 "nop \r\n" 265 ".set reorder\r\n" 266 267 : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC) 268 : "r" (col), "r" (inptr) 269 : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory" 270 ); 271 } else { 272 if (!(((long)inptr) & 7)) { 273 mmA = _mm_load_si64((__m64 *)&inptr[0]); 274 mmF = _mm_load_si64((__m64 *)&inptr[8]); 275 mmD = _mm_load_si64((__m64 *)&inptr[16]); 276 mmC = _mm_load_si64((__m64 *)&inptr[24]); 277 } else { 278 mmA = _mm_loadu_si64((__m64 *)&inptr[0]); 279 mmF = _mm_loadu_si64((__m64 *)&inptr[8]); 280 mmD = _mm_loadu_si64((__m64 *)&inptr[16]); 281 mmC = _mm_loadu_si64((__m64 *)&inptr[24]); 282 } 283 inptr += RGB_PIXELSIZE * 8; 284 } 285 mmB = _mm_unpackhi_pi8(mmA, mmF); 286 mmA = _mm_unpacklo_pi8(mmA, mmF); 287 288 mmG = _mm_unpackhi_pi8(mmD, mmC); 289 mmD = _mm_unpacklo_pi8(mmD, mmC); 290 291 mmE = _mm_unpackhi_pi16(mmA, mmD); 292 mmA = _mm_unpacklo_pi16(mmA, mmD); 293 294 mmH = _mm_unpackhi_pi16(mmB, mmG); 295 mmB = _mm_unpacklo_pi16(mmB, mmG); 296 297 mmC = _mm_loadhi_pi8_f(mmA); 298 mmA = _mm_loadlo_pi8_f(mmA); 299 300 mmD = _mm_loadhi_pi8_f(mmB); 301 mmB = _mm_loadlo_pi8_f(mmB); 302 303 mmG = _mm_loadhi_pi8_f(mmE); 304 mmE = _mm_loadlo_pi8_f(mmE); 305 306 mmF = _mm_unpacklo_pi8(mmH, mmH); 307 mmH = _mm_unpackhi_pi8(mmH, mmH); 308 mmF = _mm_srli_pi16(mmF, BYTE_BIT); 309 mmH = _mm_srli_pi16(mmH, BYTE_BIT); 310 311 #endif 312 313 /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6) 314 * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7) 315 * 316 * (Original) 317 * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B 318 * 319 * (This implementation) 320 * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G 321 */ 322 323 rglo = _mm_unpacklo_pi16(ro, go); 324 rgho = _mm_unpackhi_pi16(ro, go); 325 ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337); 326 yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337); 327 328 rgle = _mm_unpacklo_pi16(re, ge); 329 rghe = _mm_unpackhi_pi16(re, ge); 330 yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337); 331 yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337); 332 333 bglo = _mm_unpacklo_pi16(bo, go); 334 bgho = _mm_unpackhi_pi16(bo, go); 335 ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250); 336 yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250); 337 338 ylo = _mm_add_pi32(ylo_bg, ylo_rg); 339 yho = _mm_add_pi32(yho_bg, yho_rg); 340 ylo = _mm_add_pi32(ylo, PD_ONEHALF); 341 yho = _mm_add_pi32(yho, PD_ONEHALF); 342 ylo = _mm_srli_pi32(ylo, SCALEBITS); 343 yho = _mm_srli_pi32(yho, SCALEBITS); 344 yo = _mm_packs_pi32(ylo, yho); 345 346 bgle = _mm_unpacklo_pi16(be, ge); 347 bghe = _mm_unpackhi_pi16(be, ge); 348 yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250); 349 yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250); 350 351 yle = _mm_add_pi32(yle_bg, yle_rg); 352 yhe = _mm_add_pi32(yhe_bg, yhe_rg); 353 yle = _mm_add_pi32(yle, PD_ONEHALF); 354 yhe = _mm_add_pi32(yhe, PD_ONEHALF); 355 yle = _mm_srli_pi32(yle, SCALEBITS); 356 yhe = _mm_srli_pi32(yhe, SCALEBITS); 357 ye = _mm_packs_pi32(yle, yhe); 358 359 yo = _mm_slli_pi16(yo, BYTE_BIT); 360 y = _mm_or_si64(ye, yo); 361 362 _mm_store_si64((__m64 *)&outptr[0], y); 363 } 364 } 365 } 366 367 #undef mmA 368 #undef mmB 369 #undef mmC 370 #undef mmD 371 #undef mmE 372 #undef mmF 373 #undef mmG 374 #undef mmH