jdmrgext-mmi.c (23377B)
1 /* 2 * Loongson MMI optimizations for libjpeg-turbo 3 * 4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 5 * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved. 6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. 7 * All Rights Reserved. 8 * Authors: ZhangLixia <zhanglixia-hf@loongson.cn> 9 * 10 * Based on the x86 SIMD extension for IJG JPEG library 11 * Copyright (C) 1999-2006, MIYASAKA Masaru. 12 * 13 * This software is provided 'as-is', without any express or implied 14 * warranty. In no event will the authors be held liable for any damages 15 * arising from the use of this software. 16 * 17 * Permission is granted to anyone to use this software for any purpose, 18 * including commercial applications, and to alter it and redistribute it 19 * freely, subject to the following restrictions: 20 * 21 * 1. The origin of this software must not be misrepresented; you must not 22 * claim that you wrote the original software. If you use this software 23 * in a product, an acknowledgment in the product documentation would be 24 * appreciated but is not required. 25 * 2. Altered source versions must be plainly marked as such, and must not be 26 * misrepresented as being the original software. 27 * 3. This notice may not be removed or altered from any source distribution. 
 */

/* This file is included by jdmerge-mmi.c */


/* Map the working registers mmA..mmH (byte positions 0..3 within an output
 * pixel, even/odd pixel halves) onto the red/green/blue/filler even/odd
 * registers, according to the RGB_RED/RGB_GREEN/RGB_BLUE byte order selected
 * by the including file.
 */
#if RGB_RED == 0
#define mmA  re
#define mmB  ro
#elif RGB_GREEN == 0
#define mmA  ge
#define mmB  go
#elif RGB_BLUE == 0
#define mmA  be
#define mmB  bo
#else
#define mmA  xe
#define mmB  xo
#endif

#if RGB_RED == 1
#define mmC  re
#define mmD  ro
#elif RGB_GREEN == 1
#define mmC  ge
#define mmD  go
#elif RGB_BLUE == 1
#define mmC  be
#define mmD  bo
#else
#define mmC  xe
#define mmD  xo
#endif

#if RGB_RED == 2
#define mmE  re
#define mmF  ro
#elif RGB_GREEN == 2
#define mmE  ge
#define mmF  go
#elif RGB_BLUE == 2
#define mmE  be
#define mmF  bo
#else
#define mmE  xe
#define mmF  xo
#endif

#if RGB_RED == 3
#define mmG  re
#define mmH  ro
#elif RGB_GREEN == 3
#define mmG  ge
#define mmH  go
#elif RGB_BLUE == 3
#define mmG  be
#define mmH  bo
#else
#define mmG  xe
#define mmH  xo
#endif


/*
 * Merged h2v1 (2:1 horizontal chroma subsampling) upsampling and
 * YCbCr -> RGB color conversion.
 *
 * Each main-loop iteration consumes 16 Y samples and 8 Cb/Cr sample pairs
 * (inptr0 += 16, inptr1/inptr2 += 8) and produces 16 RGB pixels of
 * RGB_PIXELSIZE (3 or 4) bytes each.  Full 16-pixel groups are written with
 * 8-byte MMI stores (aligned or unaligned depending on outptr); a final
 * partial group falls through to an inline-assembly tail that stores
 * progressively smaller chunks based on the remaining byte count in `col`,
 * so no byte beyond the requested output width is written.
 */
void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
                                    JSAMPIMAGE input_buf,
                                    JDIMENSION in_row_group_ctr,
                                    JSAMPARRAY output_buf)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int num_cols, col;
  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
  __m64 decenter, mask, zero = 0.0;
#if RGB_PIXELSIZE == 4
  __m64 mm8, mm9;
#endif

  /* inptr0 = luma row, inptr1/inptr2 = Cb/Cr rows of this row group */
  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  /* num_cols counts remaining Cb/Cr sample pairs (output pixel pairs). */
  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
       inptr0 += 16, inptr1 += 8, inptr2 += 8) {

    cb = _mm_load_si64((__m64 *)inptr1);
    cr = _mm_load_si64((__m64 *)inptr2);
    ythis = _mm_load_si64((__m64 *)inptr0);
    ynext = _mm_load_si64((__m64 *)inptr0 + 1);

    /* Build the even-byte mask and the -128 "decenter" constant without
     * memory loads. */
    mask = decenter = 0.0;
    mask = _mm_cmpeq_pi16(mask, mask);
    decenter = _mm_cmpeq_pi16(decenter, decenter);
    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */

    cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
    cbh = _mm_unpackhi_pi8(cb, zero);       /* Cb(4567) */
    crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
    crh = _mm_unpackhi_pi8(cr, zero);       /* Cr(4567) */
    cbl = _mm_add_pi16(cbl, decenter);
    cbh = _mm_add_pi16(cbh, decenter);
    crl = _mm_add_pi16(crl, decenter);
    crh = _mm_add_pi16(crh, decenter);

    /* (Original)
     * R = Y                + 1.40200 * Cr
     * G = Y - 0.34414 * Cb - 0.71414 * Cr
     * B = Y + 1.77200 * Cb
     *
     * (This implementation)
     * R = Y                + 0.40200 * Cr + Cr
     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
     * B = Y - 0.22800 * Cb + Cb + Cb
     */

    cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
    cbh2 = _mm_add_pi16(cbh, cbh);          /* 2*CbH */
    crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
    crh2 = _mm_add_pi16(crh, crh);          /* 2*CrH */

    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800) */
    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);   /* (2*CbH * -FIX(0.22800) */
    rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
    rh = _mm_mulhi_pi16(crh2, PW_F0402);    /* (2*CrH * FIX(0.40200)) */

    /* Round (add one, shift right) to undo the doubling above. */
    bl = _mm_add_pi16(bl, PW_ONE);
    bh = _mm_add_pi16(bh, PW_ONE);
    bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
    bh = _mm_srai_pi16(bh, 1);              /* (CbH * -FIX(0.22800)) */
    rl = _mm_add_pi16(rl, PW_ONE);
    rh = _mm_add_pi16(rh, PW_ONE);
    rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
    rh = _mm_srai_pi16(rh, 1);              /* (CrH * FIX(0.40200)) */

    bl = _mm_add_pi16(bl, cbl);
    bh = _mm_add_pi16(bh, cbh);
    bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
    bh = _mm_add_pi16(bh, cbh);             /* (CbH * FIX(1.77200))=(B-Y)H */
    rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
    rh = _mm_add_pi16(rh, crh);             /* (CrH * FIX(1.40200))=(R-Y)H */

    ga = _mm_unpacklo_pi16(cbl, crl);
    gb = _mm_unpackhi_pi16(cbl, crl);
    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
    gc = _mm_unpacklo_pi16(cbh, crh);
    gd = _mm_unpackhi_pi16(cbh, crh);
    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);

    ga = _mm_add_pi32(ga, PD_ONEHALF);
    gb = _mm_add_pi32(gb, PD_ONEHALF);
    ga = _mm_srai_pi32(ga, SCALEBITS);
    gb = _mm_srai_pi32(gb, SCALEBITS);
    gc = _mm_add_pi32(gc, PD_ONEHALF);
    gd = _mm_add_pi32(gd, PD_ONEHALF);
    gc = _mm_srai_pi32(gc, SCALEBITS);
    gd = _mm_srai_pi32(gd, SCALEBITS);

    gl = _mm_packs_pi32(ga, gb);          /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
    gh = _mm_packs_pi32(gc, gd);          /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
    gl = _mm_sub_pi16(gl, crl);   /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
    gh = _mm_sub_pi16(gh, crh);   /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */

    /* Split the 16 Y samples into even/odd columns; each chroma value is
     * shared by one even/odd pixel pair (h2v1 "fancy-free" replication). */
    ythise = _mm_and_si64(mask, ythis);     /* Y(0246) */
    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
    ynexte = _mm_and_si64(mask, ynext);     /* Y(8ACE) */
    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */

    rle = _mm_add_pi16(rl, ythise);         /* (R0 R2 R4 R6) */
    rlo = _mm_add_pi16(rl, ythiso);         /* (R1 R3 R5 R7) */
    rhe = _mm_add_pi16(rh, ynexte);         /* (R8 RA RC RE) */
    rho = _mm_add_pi16(rh, ynexto);         /* (R9 RB RD RF) */
    re = _mm_packs_pu16(rle, rhe);          /* (R0 R2 R4 R6 R8 RA RC RE) */
    ro = _mm_packs_pu16(rlo, rho);          /* (R1 R3 R5 R7 R9 RB RD RF) */

    gle = _mm_add_pi16(gl, ythise);         /* (G0 G2 G4 G6) */
    glo = _mm_add_pi16(gl, ythiso);         /* (G1 G3 G5 G7) */
    ghe = _mm_add_pi16(gh, ynexte);         /* (G8 GA GC GE) */
    gho = _mm_add_pi16(gh, ynexto);         /* (G9 GB GD GF) */
    ge = _mm_packs_pu16(gle, ghe);          /* (G0 G2 G4 G6 G8 GA GC GE) */
    go = _mm_packs_pu16(glo, gho);          /* (G1 G3 G5 G7 G9 GB GD GF) */

    ble = _mm_add_pi16(bl, ythise);         /* (B0 B2 B4 B6) */
    blo = _mm_add_pi16(bl, ythiso);         /* (B1 B3 B5 B7) */
    bhe = _mm_add_pi16(bh, ynexte);         /* (B8 BA BC BE) */
    bho = _mm_add_pi16(bh, ynexto);         /* (B9 BB BD BF) */
    be = _mm_packs_pu16(ble, bhe);          /* (B0 B2 B4 B6 B8 BA BC BE) */
    bo = _mm_packs_pu16(blo, bho);          /* (B1 B3 B5 B7 B9 BB BD BF) */

#if RGB_PIXELSIZE == 3

    /* Interleave the planar even/odd R/G/B registers into 48 bytes of
     * packed 3-byte pixels (byte-transpose via unpack operations). */
    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
    mmG = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
    mmA = _mm_unpackhi_pi8(mmA, mmC);       /* (08 18 0A 1A 0C 1C 0E 1E) */
    mmH = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
    mmE = _mm_unpackhi_pi8(mmE, mmB);       /* (28 09 2A 0B 2C 0D 2E 0F) */
    mmC = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
    mmD = _mm_unpackhi_pi8(mmD, mmF);       /* (19 29 1B 2B 1D 2D 1F 2F) */

    mmB = _mm_unpacklo_pi16(mmG, mmA);      /* (00 10 08 18 02 12 0A 1A) */
    mmA = _mm_unpackhi_pi16(mmG, mmA);      /* (04 14 0C 1C 06 16 0E 1E) */
    mmF = _mm_unpacklo_pi16(mmH, mmE);      /* (20 01 28 09 22 03 2A 0B) */
    mmE = _mm_unpackhi_pi16(mmH, mmE);      /* (24 05 2C 0D 26 07 2E 0F) */
    mmH = _mm_unpacklo_pi16(mmC, mmD);      /* (11 21 19 29 13 23 1B 2B) */
    mmG = _mm_unpackhi_pi16(mmC, mmD);      /* (15 25 1D 2D 17 27 1F 2F) */

    mmC = _mm_unpacklo_pi16(mmB, mmF);      /* (00 10 20 01 08 18 28 09) */
    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
    mmB = _mm_unpacklo_pi16(mmH, mmB);      /* (11 21 02 12 19 29 0A 1A) */
    mmD = _mm_unpackhi_pi16(mmF, mmH);      /* (22 03 13 23 2A 0B 1B 2B) */
    mmF = _mm_unpacklo_pi16(mmA, mmE);      /* (04 14 24 05 0C 1C 2C 0D) */
    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    mmH = _mm_unpacklo_pi16(mmG, mmA);      /* (15 25 06 16 1D 2D 0E 1E) */
    mmG = _mm_unpackhi_pi16(mmE, mmG);      /* (26 07 17 27 2E 0F 1F 2F) */

    mmA = _mm_unpacklo_pi32(mmC, mmB);      /* (00 10 20 01 11 21 02 12) */
    mmE = _mm_unpackhi_pi32(mmC, mmB);      /* (08 18 28 09 19 29 0A 1A) */
    mmB = _mm_unpacklo_pi32(mmD, mmF);      /* (22 03 13 23 04 14 24 05) */
    mmF = _mm_unpackhi_pi32(mmD, mmF);      /* (2A 0B 1B 2B 0C 1C 2C 0D) */
    mmC = _mm_unpacklo_pi32(mmH, mmG);      /* (15 25 06 16 26 07 17 27) */
    mmG = _mm_unpackhi_pi32(mmH, mmG);      /* (1D 2D 0E 1E 2E 0F 1F 2F) */

    if (num_cols >= 8) {
      /* Full group: 48 bytes (16 RGB pixels) via six 8-byte stores. */
      if (!(((long)outptr) & 7)) {
        _mm_store_si64((__m64 *)outptr, mmA);
        _mm_store_si64((__m64 *)(outptr + 8), mmB);
        _mm_store_si64((__m64 *)(outptr + 16), mmC);
        _mm_store_si64((__m64 *)(outptr + 24), mmE);
        _mm_store_si64((__m64 *)(outptr + 32), mmF);
        _mm_store_si64((__m64 *)(outptr + 40), mmG);
      } else {
        _mm_storeu_si64((__m64 *)outptr, mmA);
        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
      }
      outptr += RGB_PIXELSIZE * 16;
    } else {
      /* Partial group: col = remaining output bytes (3 per pixel; +3 covers
       * the final odd column when output_width is odd). */
      if (output_width & 1)
        col = num_cols * 6 + 3;
      else
        col = num_cols * 6;

      asm(".set noreorder\r\n"              /* st24 */

          "li       $8, 24\r\n"
          "move     $9, %7\r\n"
          "mov.s    $f4, %1\r\n"
          "mov.s    $f6, %2\r\n"
          "mov.s    $f8, %3\r\n"
          "move     $10, %8\r\n"
          "bltu     $9, $8, 1f\r\n"
          "nop      \r\n"
          "gssdlc1  $f4, 7($10)\r\n"
          "gssdrc1  $f4, 0($10)\r\n"
          "gssdlc1  $f6, 7+8($10)\r\n"
          "gssdrc1  $f6, 8($10)\r\n"
          "gssdlc1  $f8, 7+16($10)\r\n"
          "gssdrc1  $f8, 16($10)\r\n"
          "mov.s    $f4, %4\r\n"
          "mov.s    $f6, %5\r\n"
          "mov.s    $f8, %6\r\n"
          "subu     $9, $9, 24\r\n"
          PTR_ADDU  "$10, $10, 24\r\n"

          "1:       \r\n"
          "li       $8, 16\r\n"             /* st16 */
          "bltu     $9, $8, 2f\r\n"
          "nop      \r\n"
          "gssdlc1  $f4, 7($10)\r\n"
          "gssdrc1  $f4, 0($10)\r\n"
          "gssdlc1  $f6, 7+8($10)\r\n"
          "gssdrc1  $f6, 8($10)\r\n"
          "mov.s    $f4, $f8\r\n"
          "subu     $9, $9, 16\r\n"
          PTR_ADDU  "$10, $10, 16\r\n"

          "2:       \r\n"
          "li       $8, 8\r\n"              /* st8 */
          "bltu     $9, $8, 3f\r\n"
          "nop      \r\n"
          "gssdlc1  $f4, 7($10)\r\n"
          "gssdrc1  $f4, 0($10)\r\n"
          "mov.s    $f4, $f6\r\n"
          "subu     $9, $9, 8\r\n"
          PTR_ADDU  "$10, $10, 8\r\n"

          "3:       \r\n"
          "li       $8, 4\r\n"              /* st4 */
          "mfc1     $11, $f4\r\n"
          "bltu     $9, $8, 4f\r\n"
          "nop      \r\n"
          "swl      $11, 3($10)\r\n"
          "swr      $11, 0($10)\r\n"
          "li       $8, 32\r\n"
          "mtc1     $8, $f6\r\n"
          "dsrl     $f4, $f4, $f6\r\n"
          "mfc1     $11, $f4\r\n"
          "subu     $9, $9, 4\r\n"
          PTR_ADDU  "$10, $10, 4\r\n"

          "4:       \r\n"
          "li       $8, 2\r\n"              /* st2 */
          "bltu     $9, $8, 5f\r\n"
          "nop      \r\n"
          "ush      $11, 0($10)\r\n"
          "srl      $11, 16\r\n"
          "subu     $9, $9, 2\r\n"
          PTR_ADDU  "$10, $10, 2\r\n"

          "5:       \r\n"
          "li       $8, 1\r\n"              /* st1 */
          "bltu     $9, $8, 6f\r\n"
          "nop      \r\n"
          "sb       $11, 0($10)\r\n"

          "6:       \r\n"
          "nop      \r\n"                   /* end */
          : "=m" (*outptr)
          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
            "f" (mmG), "r" (col), "r" (outptr)
          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
         );
    }

#else  /* RGB_PIXELSIZE == 4 */

#ifdef RGBX_FILLER_0XFF
    xe = _mm_cmpeq_pi8(xe, xe);
    xo = _mm_cmpeq_pi8(xo, xo);
#else
    xe = _mm_xor_si64(xe, xe);
    xo = _mm_xor_si64(xo, xo);
#endif
    /* Interleave the planar even/odd R/G/B/filler registers into 64 bytes
     * of packed 4-byte pixels (byte-transpose via unpack operations). */
    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */

    mm8 = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
    mm9 = _mm_unpackhi_pi8(mmA, mmC);       /* (08 18 0A 1A 0C 1C 0E 1E) */
    mmA = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
    mmE = _mm_unpackhi_pi8(mmE, mmG);       /* (28 38 2A 3A 2C 3C 2E 3E) */

    mmG = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
    mmB = _mm_unpackhi_pi8(mmB, mmD);       /* (09 19 0B 1B 0D 1D 0F 1F) */
    mmD = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
    mmF = _mm_unpackhi_pi8(mmF, mmH);       /* (29 39 2B 3B 2D 3D 2F 3F) */

    mmH = _mm_unpacklo_pi16(mm8, mmA);      /* (00 10 20 30 02 12 22 32) */
    mm8 = _mm_unpackhi_pi16(mm8, mmA);      /* (04 14 24 34 06 16 26 36) */
    mmA = _mm_unpacklo_pi16(mmG, mmD);      /* (01 11 21 31 03 13 23 33) */
    mmD = _mm_unpackhi_pi16(mmG, mmD);      /* (05 15 25 35 07 17 27 37) */

    mmG = _mm_unpackhi_pi16(mm9, mmE);      /* (0C 1C 2C 3C 0E 1E 2E 3E) */
    mm9 = _mm_unpacklo_pi16(mm9, mmE);      /* (08 18 28 38 0A 1A 2A 3A) */
    mmE = _mm_unpacklo_pi16(mmB, mmF);      /* (09 19 29 39 0B 1B 2B 3B) */
    mmF = _mm_unpackhi_pi16(mmB, mmF);      /* (0D 1D 2D 3D 0F 1F 2F 3F) */

    mmB = _mm_unpackhi_pi32(mmH, mmA);      /* (02 12 22 32 03 13 23 33) */
    mmA = _mm_unpacklo_pi32(mmH, mmA);      /* (00 10 20 30 01 11 21 31) */
    mmC = _mm_unpacklo_pi32(mm8, mmD);      /* (04 14 24 34 05 15 25 35) */
    mmD = _mm_unpackhi_pi32(mm8, mmD);      /* (06 16 26 36 07 17 27 37) */

    mmH = _mm_unpackhi_pi32(mmG, mmF);      /* (0E 1E 2E 3E 0F 1F 2F 3F) */
    mmG = _mm_unpacklo_pi32(mmG, mmF);      /* (0C 1C 2C 3C 0D 1D 2D 3D) */
    mmF = _mm_unpackhi_pi32(mm9, mmE);      /* (0A 1A 2A 3A 0B 1B 2B 3B) */
    mmE = _mm_unpacklo_pi32(mm9, mmE);      /* (08 18 28 38 09 19 29 39) */

    if (num_cols >= 8) {
      /* Full group: 64 bytes (16 RGBX pixels) via eight 8-byte stores. */
      if (!(((long)outptr) & 7)) {
        _mm_store_si64((__m64 *)outptr, mmA);
        _mm_store_si64((__m64 *)(outptr + 8), mmB);
        _mm_store_si64((__m64 *)(outptr + 16), mmC);
        _mm_store_si64((__m64 *)(outptr + 24), mmD);
        _mm_store_si64((__m64 *)(outptr + 32), mmE);
        _mm_store_si64((__m64 *)(outptr + 40), mmF);
        _mm_store_si64((__m64 *)(outptr + 48), mmG);
        _mm_store_si64((__m64 *)(outptr + 56), mmH);
      } else {
        _mm_storeu_si64((__m64 *)outptr, mmA);
        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
      }
      outptr += RGB_PIXELSIZE * 16;
    } else {
      /* Partial group: col = remaining pixels (+1 covers the final odd
       * column when output_width is odd); stores are 4 bytes per pixel. */
      if (output_width & 1)
        col = num_cols * 2 + 1;
      else
        col = num_cols * 2;
      asm(".set noreorder\r\n"              /* st32 */

          "li       $8, 8\r\n"
          "move     $9, %10\r\n"
          "move     $10, %11\r\n"
          "mov.s    $f4, %2\r\n"
          "mov.s    $f6, %3\r\n"
          "mov.s    $f8, %4\r\n"
          "mov.s    $f10, %5\r\n"
          "bltu     $9, $8, 1f\r\n"
          "nop      \r\n"
          "gssdlc1  $f4, 7($10)\r\n"
          "gssdrc1  $f4, 0($10)\r\n"
          "gssdlc1  $f6, 7+8($10)\r\n"
          "gssdrc1  $f6, 8($10)\r\n"
          "gssdlc1  $f8, 7+16($10)\r\n"
          "gssdrc1  $f8, 16($10)\r\n"
          "gssdlc1  $f10, 7+24($10)\r\n"
          "gssdrc1  $f10, 24($10)\r\n"
          "mov.s    $f4, %6\r\n"
          "mov.s    $f6, %7\r\n"
          "mov.s    $f8, %8\r\n"
          "mov.s    $f10, %9\r\n"
          "subu     $9, $9, 8\r\n"
          PTR_ADDU  "$10, $10, 32\r\n"

          "1:       \r\n"
          "li       $8, 4\r\n"              /* st16 */
          "bltu     $9, $8, 2f\r\n"
          "nop      \r\n"
          "gssdlc1  $f4, 7($10)\r\n"
          "gssdrc1  $f4, 0($10)\r\n"
          "gssdlc1  $f6, 7+8($10)\r\n"
          "gssdrc1  $f6, 8($10)\r\n"
          "mov.s    $f4, $f8\r\n"
          "mov.s    $f6, $f10\r\n"
          "subu     $9, $9, 4\r\n"
          PTR_ADDU  "$10, $10, 16\r\n"

          "2:       \r\n"
          "li       $8, 2\r\n"              /* st8 */
          "bltu     $9, $8, 3f\r\n"
          "nop      \r\n"
          "gssdlc1  $f4, 7($10)\r\n"
          "gssdrc1  $f4, 0($10)\r\n"
          "mov.s    $f4, $f6\r\n"
          "subu     $9, $9, 2\r\n"
          PTR_ADDU  "$10, $10, 8\r\n"

          "3:       \r\n"
          "li       $8, 1\r\n"              /* st4 */
          "bltu     $9, $8, 4f\r\n"
          "nop      \r\n"
          "gsswlc1  $f4, 3($10)\r\n"
          "gsswrc1  $f4, 0($10)\r\n"

          "4:       \r\n"
          "li       %1, 0\r\n"              /* end */
          : "=m" (*outptr), "=r" (col)
          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
         );
    }

#endif

  }

  /* If the column loop consisted only of full 8-pair groups (so the in-loop
   * partial-store tail never ran) and output_width is odd, the final odd
   * column has not been emitted yet; convert and store that single pixel
   * here. */
  if (!((output_width >> 1) & 7)) {
    if (output_width & 1) {
      cb = _mm_load_si64((__m64 *)inptr1);
      cr = _mm_load_si64((__m64 *)inptr2);
      y = _mm_load_si64((__m64 *)inptr0);

      decenter = 0.0;
      decenter = _mm_cmpeq_pi16(decenter, decenter);
      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */

      cbl = _mm_unpacklo_pi8(cb, zero);     /* Cb(0123) */
      crl = _mm_unpacklo_pi8(cr, zero);     /* Cr(0123) */
      cbl = _mm_add_pi16(cbl, decenter);
      crl = _mm_add_pi16(crl, decenter);

      cbl2 = _mm_add_pi16(cbl, cbl);        /* 2*CbL */
      crl2 = _mm_add_pi16(crl, crl);        /* 2*CrL */
      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);  /* (2*CbL * -FIX(0.22800) */
      rl = _mm_mulhi_pi16(crl2, PW_F0402);  /* (2*CrL * FIX(0.40200)) */

      bl = _mm_add_pi16(bl, PW_ONE);
      bl = _mm_srai_pi16(bl, 1);            /* (CbL * -FIX(0.22800)) */
      rl = _mm_add_pi16(rl, PW_ONE);
      rl = _mm_srai_pi16(rl, 1);            /* (CrL * FIX(0.40200)) */

      bl = _mm_add_pi16(bl, cbl);
      bl = _mm_add_pi16(bl, cbl);           /* (CbL * FIX(1.77200))=(B-Y)L */
      rl = _mm_add_pi16(rl, crl);           /* (CrL * FIX(1.40200))=(R-Y)L */

      gl = _mm_unpacklo_pi16(cbl, crl);
      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
      gl = _mm_add_pi32(gl, PD_ONEHALF);
      gl = _mm_srai_pi32(gl, SCALEBITS);
      gl = _mm_packs_pi32(gl, zero);      /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */

      yl = _mm_unpacklo_pi8(y, zero);       /* Y(0123) */
      rl = _mm_add_pi16(rl, yl);            /* (R0 R1 R2 R3) */
      gl = _mm_add_pi16(gl, yl);            /* (G0 G1 G2 G3) */
      bl = _mm_add_pi16(bl, yl);            /* (B0 B1 B2 B3) */
      re = _mm_packs_pu16(rl, rl);
      ge = _mm_packs_pu16(gl, gl);
      be = _mm_packs_pu16(bl, bl);
#if RGB_PIXELSIZE == 3
      /* Pack pixel 0's three components into the low bytes of mmA, then
       * store exactly 3 bytes (2-byte unaligned halfword + 1 byte). */
      mmA = _mm_unpacklo_pi8(mmA, mmC);
      mmA = _mm_unpacklo_pi16(mmA, mmE);
      asm(".set noreorder\r\n"

          "move     $8, %2\r\n"
          "mov.s    $f4, %1\r\n"
          "mfc1     $9, $f4\r\n"
          "ush      $9, 0($8)\r\n"
          "srl      $9, 16\r\n"
          "sb       $9, 2($8)\r\n"
          : "=m" (*outptr)
          : "f" (mmA), "r" (outptr)
          : "$f4", "$8", "$9", "memory"
         );
#else  /* RGB_PIXELSIZE == 4 */

#ifdef RGBX_FILLER_0XFF
      xe = _mm_cmpeq_pi8(xe, xe);
#else
      xe = _mm_xor_si64(xe, xe);
#endif
      /* Pack pixel 0's four components into the low word of mmA, then
       * store exactly 4 bytes with an unaligned word store. */
      mmA = _mm_unpacklo_pi8(mmA, mmC);
      mmE = _mm_unpacklo_pi8(mmE, mmG);
      mmA = _mm_unpacklo_pi16(mmA, mmE);
      asm(".set noreorder\r\n"

          "move     $8, %2\r\n"
          "mov.s    $f4, %1\r\n"
          "gsswlc1  $f4, 3($8)\r\n"
          "gsswrc1  $f4, 0($8)\r\n"
          : "=m" (*outptr)
          : "f" (mmA), "r" (outptr)
          : "$f4", "$8", "memory"
         );
#endif
    }
  }
}


/*
 * Merged h2v2 upsampling and YCbCr -> RGB color conversion.  Produces two
 * output rows by running the h2v1 routine once per row: the luma row
 * pointer in input_buf[0] is temporarily redirected to each of the two luma
 * rows of the row group, and the saved pointers are restored before
 * returning.
 */
void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
                                    JSAMPIMAGE input_buf,
                                    JDIMENSION in_row_group_ctr,
                                    JSAMPARRAY output_buf)
{
  JSAMPROW inptr, outptr;

  /* Save the pointers that are about to be overwritten. */
  inptr = input_buf[0][in_row_group_ctr];
  outptr = output_buf[0];

  /* First output row: luma row 2 * in_row_group_ctr. */
  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
                                 output_buf);

  /* Second output row: luma row 2 * in_row_group_ctr + 1. */
  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
  output_buf[0] = output_buf[1];
  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
                                 output_buf);

  /* Restore the callers' pointers. */
  input_buf[0][in_row_group_ctr] = inptr;
  output_buf[0] = outptr;
}


#undef mmA
#undef mmB
#undef mmC
#undef mmD
#undef mmE
#undef mmF
#undef mmG
#undef mmH