jdsample-altivec.c (14201B)
1 /* 2 * AltiVec optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2015, D. R. Commander. All Rights Reserved. 5 * 6 * This software is provided 'as-is', without any express or implied 7 * warranty. In no event will the authors be held liable for any damages 8 * arising from the use of this software. 9 * 10 * Permission is granted to anyone to use this software for any purpose, 11 * including commercial applications, and to alter it and redistribute it 12 * freely, subject to the following restrictions: 13 * 14 * 1. The origin of this software must not be misrepresented; you must not 15 * claim that you wrote the original software. If you use this software 16 * in a product, an acknowledgment in the product documentation would be 17 * appreciated but is not required. 18 * 2. Altered source versions must be plainly marked as such, and must not be 19 * misrepresented as being the original software. 20 * 3. This notice may not be removed or altered from any source distribution. 21 */ 22 23 /* CHROMA UPSAMPLING */ 24 25 #include "jsimd_altivec.h" 26 27 28 void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor, 29 JDIMENSION downsampled_width, 30 JSAMPARRAY input_data, 31 JSAMPARRAY *output_data_ptr) 32 { 33 JSAMPARRAY output_data = *output_data_ptr; 34 JSAMPROW inptr, outptr; 35 int inrow, incol; 36 37 __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0, 38 out; 39 __vector short this0e, this0o, this0l, this0h, last0l, last0h, 40 next0l, next0h, outle, outhe, outlo, outho; 41 42 /* Constants */ 43 __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) }, 44 last_index_col0 = 45 { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, 46 last_index = 47 { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }, 48 next_index = 49 { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }, 50 next_index_lastcol = 51 { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 }, 52 #if __BIG_ENDIAN__ 53 merge_pack_index = 54 { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; 55 #else 56 merge_pack_index = 57 { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; 58 #endif 59 __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) }; 60 61 for (inrow = 0; inrow < max_v_samp_factor; inrow++) { 62 inptr = input_data[inrow]; 63 outptr = output_data[inrow]; 64 65 if (downsampled_width & 15) 66 inptr[downsampled_width] = inptr[downsampled_width - 1]; 67 68 this0 = vec_ld(0, inptr); 69 p_last0 = vec_perm(this0, this0, last_index_col0); 70 last0 = this0; 71 72 for (incol = downsampled_width; incol > 0; 73 incol -= 16, inptr += 16, outptr += 32) { 74 75 if (downsampled_width - incol > 0) { 76 p_last0 = vec_perm(last0, this0, last_index); 77 last0 = this0; 78 } 79 80 if (incol <= 16) 81 p_next0 = vec_perm(this0, this0, next_index_lastcol); 82 else { 83 next0 = vec_ld(16, inptr); 84 p_next0 = vec_perm(this0, next0, next_index); 85 } 86 87 this0e = (__vector short)vec_mule(this0, pb_three); 88 this0o = (__vector short)vec_mulo(this0, pb_three); 89 this0l = vec_mergeh(this0e, this0o); 90 this0h = vec_mergel(this0e, this0o); 91 92 last0l = (__vector short)VEC_UNPACKHU(p_last0); 93 last0h = (__vector short)VEC_UNPACKLU(p_last0); 94 last0l = vec_add(last0l, pw_one); 95 96 next0l = (__vector short)VEC_UNPACKHU(p_next0); 97 next0h = (__vector short)VEC_UNPACKLU(p_next0); 98 next0l = vec_add(next0l, pw_two); 99 100 outle = vec_add(this0l, last0l); 101 outlo = vec_add(this0l, next0l); 102 outle = vec_sr(outle, (__vector unsigned short)pw_two); 103 outlo = vec_sr(outlo, (__vector unsigned short)pw_two); 104 105 out = vec_perm((__vector unsigned char)outle, 106 (__vector unsigned char)outlo, merge_pack_index); 107 vec_st(out, 0, outptr); 108 109 if (incol > 8) { 110 last0h = vec_add(last0h, pw_one); 111 next0h = vec_add(next0h, pw_two); 112 113 outhe = vec_add(this0h, last0h); 114 outho = vec_add(this0h, next0h); 115 outhe = vec_sr(outhe, (__vector unsigned short)pw_two); 116 outho = vec_sr(outho, (__vector unsigned short)pw_two); 117 118 out = vec_perm((__vector unsigned char)outhe, 119 (__vector unsigned char)outho, merge_pack_index); 120 vec_st(out, 16, outptr); 121 } 122 123 this0 = next0; 124 } 125 } 126 } 127 128 129 void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor, 130 JDIMENSION downsampled_width, 131 JSAMPARRAY input_data, 132 JSAMPARRAY *output_data_ptr) 133 { 134 JSAMPARRAY output_data = *output_data_ptr; 135 JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; 136 int inrow, outrow, incol; 137 138 __vector unsigned char this_1, this0, this1, out; 139 __vector short this_1l, this_1h, this0l, this0h, this1l, this1h, 140 lastcolsum_1h, lastcolsum1h, 141 p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h, 142 thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h, 143 nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 }, 144 nextcolsum1l = { 0 }, nextcolsum1h = { 0 }, 145 p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h, 146 tmpl, tmph, outle, outhe, outlo, outho; 147 148 /* Constants */ 149 __vector unsigned char pb_zero = { __16X(0) }, 150 last_index_col0 = 151 { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, 152 last_index = 153 { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, 154 next_index = 155 { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }, 156 next_index_lastcol = 157 { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 }, 158 #if __BIG_ENDIAN__ 159 merge_pack_index = 160 { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 }; 161 #else 162 merge_pack_index = 163 { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 }; 164 #endif 165 __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) }, 166 pw_seven = { __8X(7) }, pw_eight = { __8X(8) }; 167 __vector unsigned short pw_four = { __8X(4) }; 168 169 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { 170 171 inptr_1 = input_data[inrow - 1]; 172 inptr0 = input_data[inrow]; 173 inptr1 = input_data[inrow + 1]; 174 outptr0 = output_data[outrow++]; 175 outptr1 = output_data[outrow++]; 176 177 if (downsampled_width & 15) { 178 inptr_1[downsampled_width] = inptr_1[downsampled_width - 1]; 179 inptr0[downsampled_width] = inptr0[downsampled_width - 1]; 180 inptr1[downsampled_width] = inptr1[downsampled_width - 1]; 181 } 182 183 this0 = vec_ld(0, inptr0); 184 this0l = (__vector short)VEC_UNPACKHU(this0); 185 this0h = (__vector short)VEC_UNPACKLU(this0); 186 this0l = vec_mladd(this0l, pw_three, pw_zero); 187 this0h = vec_mladd(this0h, pw_three, pw_zero); 188 189 this_1 = vec_ld(0, inptr_1); 190 this_1l = (__vector short)VEC_UNPACKHU(this_1); 191 this_1h = (__vector short)VEC_UNPACKLU(this_1); 192 thiscolsum_1l = vec_add(this0l, this_1l); 193 thiscolsum_1h = vec_add(this0h, this_1h); 194 lastcolsum_1h = thiscolsum_1h; 195 p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0); 196 p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); 197 198 this1 = vec_ld(0, inptr1); 199 this1l = (__vector short)VEC_UNPACKHU(this1); 200 this1h = (__vector short)VEC_UNPACKLU(this1); 201 thiscolsum1l = vec_add(this0l, this1l); 202 thiscolsum1h = vec_add(this0h, this1h); 203 lastcolsum1h = thiscolsum1h; 204 p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0); 205 p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); 206 207 for (incol = downsampled_width; incol > 0; 208 incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16, 209 outptr0 += 32, outptr1 += 32) { 210 211 if (downsampled_width - incol > 0) { 212 p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index); 213 p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index); 214 p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index); 215 p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index); 216 lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h; 217 } 218 219 if (incol <= 16) { 220 p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); 221 p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h, 222 next_index_lastcol); 223 p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); 224 p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h, 225 next_index_lastcol); 226 } else { 227 this0 = vec_ld(16, inptr0); 228 this0l = (__vector short)VEC_UNPACKHU(this0); 229 this0h = (__vector short)VEC_UNPACKLU(this0); 230 this0l = vec_mladd(this0l, pw_three, pw_zero); 231 this0h = vec_mladd(this0h, pw_three, pw_zero); 232 233 this_1 = vec_ld(16, inptr_1); 234 this_1l = (__vector short)VEC_UNPACKHU(this_1); 235 this_1h = (__vector short)VEC_UNPACKLU(this_1); 236 nextcolsum_1l = vec_add(this0l, this_1l); 237 nextcolsum_1h = vec_add(this0h, this_1h); 238 p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index); 239 p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index); 240 241 this1 = vec_ld(16, inptr1); 242 this1l = (__vector short)VEC_UNPACKHU(this1); 243 this1h = (__vector short)VEC_UNPACKLU(this1); 244 nextcolsum1l = vec_add(this0l, this1l); 245 nextcolsum1h = vec_add(this0h, this1h); 246 p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index); 247 p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index); 248 } 249 250 /* Process the upper row */ 251 252 tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero); 253 outle = vec_add(tmpl, p_lastcolsum_1l); 254 outle = vec_add(outle, pw_eight); 255 outle = vec_sr(outle, pw_four); 256 257 outlo = vec_add(tmpl, p_nextcolsum_1l); 258 outlo = vec_add(outlo, pw_seven); 259 outlo = vec_sr(outlo, pw_four); 260 261 out = vec_perm((__vector unsigned char)outle, 262 (__vector unsigned char)outlo, merge_pack_index); 263 vec_st(out, 0, outptr0); 264 265 if (incol > 8) { 266 tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero); 267 outhe = vec_add(tmph, p_lastcolsum_1h); 268 outhe = vec_add(outhe, pw_eight); 269 outhe = vec_sr(outhe, pw_four); 270 271 outho = vec_add(tmph, p_nextcolsum_1h); 272 outho = vec_add(outho, pw_seven); 273 outho = vec_sr(outho, pw_four); 274 275 out = vec_perm((__vector unsigned char)outhe, 276 (__vector unsigned char)outho, merge_pack_index); 277 vec_st(out, 16, outptr0); 278 } 279 280 /* Process the lower row */ 281 282 tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero); 283 outle = vec_add(tmpl, p_lastcolsum1l); 284 outle = vec_add(outle, pw_eight); 285 outle = vec_sr(outle, pw_four); 286 287 outlo = vec_add(tmpl, p_nextcolsum1l); 288 outlo = vec_add(outlo, pw_seven); 289 outlo = vec_sr(outlo, pw_four); 290 291 out = vec_perm((__vector unsigned char)outle, 292 (__vector unsigned char)outlo, merge_pack_index); 293 vec_st(out, 0, outptr1); 294 295 if (incol > 8) { 296 tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero); 297 outhe = vec_add(tmph, p_lastcolsum1h); 298 outhe = vec_add(outhe, pw_eight); 299 outhe = vec_sr(outhe, pw_four); 300 301 outho = vec_add(tmph, p_nextcolsum1h); 302 outho = vec_add(outho, pw_seven); 303 outho = vec_sr(outho, pw_four); 304 305 out = vec_perm((__vector unsigned char)outhe, 306 (__vector unsigned char)outho, merge_pack_index); 307 vec_st(out, 16, outptr1); 308 } 309 310 thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h; 311 thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h; 312 } 313 } 314 } 315 316 317 /* These are rarely used (mainly just for decompressing YCCK images) */ 318 319 void jsimd_h2v1_upsample_altivec(int max_v_samp_factor, 320 JDIMENSION output_width, 321 JSAMPARRAY input_data, 322 JSAMPARRAY *output_data_ptr) 323 { 324 JSAMPARRAY output_data = *output_data_ptr; 325 JSAMPROW inptr, outptr; 326 int inrow, incol; 327 328 __vector unsigned char in, inl, inh; 329 330 for (inrow = 0; inrow < max_v_samp_factor; inrow++) { 331 inptr = input_data[inrow]; 332 outptr = output_data[inrow]; 333 334 for (incol = (output_width + 31) & (~31); incol > 0; 335 incol -= 64, inptr += 32, outptr += 64) { 336 337 in = vec_ld(0, inptr); 338 inl = vec_mergeh(in, in); 339 inh = vec_mergel(in, in); 340 341 vec_st(inl, 0, outptr); 342 vec_st(inh, 16, outptr); 343 344 if (incol > 32) { 345 in = vec_ld(16, inptr); 346 inl = vec_mergeh(in, in); 347 inh = vec_mergel(in, in); 348 349 vec_st(inl, 32, outptr); 350 vec_st(inh, 48, outptr); 351 } 352 } 353 } 354 } 355 356 357 void jsimd_h2v2_upsample_altivec(int max_v_samp_factor, 358 JDIMENSION output_width, 359 JSAMPARRAY input_data, 360 JSAMPARRAY *output_data_ptr) 361 { 362 JSAMPARRAY output_data = *output_data_ptr; 363 JSAMPROW inptr, outptr0, outptr1; 364 int inrow, outrow, incol; 365 366 __vector unsigned char in, inl, inh; 367 368 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { 369 370 inptr = input_data[inrow]; 371 outptr0 = output_data[outrow++]; 372 outptr1 = output_data[outrow++]; 373 374 for (incol = (output_width + 31) & (~31); incol > 0; 375 incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) { 376 377 in = vec_ld(0, inptr); 378 inl = vec_mergeh(in, in); 379 inh = vec_mergel(in, in); 380 381 vec_st(inl, 0, outptr0); 382 vec_st(inl, 0, outptr1); 383 384 vec_st(inh, 16, outptr0); 385 vec_st(inh, 16, outptr1); 386 387 if (incol > 32) { 388 in = vec_ld(16, inptr); 389 inl = vec_mergeh(in, in); 390 inh = vec_mergel(in, in); 391 392 vec_st(inl, 32, outptr0); 393 vec_st(inl, 32, outptr1); 394 395 vec_st(inh, 48, outptr0); 396 vec_st(inh, 48, outptr1); 397 } 398 } 399 } 400 }