rescaler_msa.c (17225B)
1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MSA version of rescaling functions 11 // 12 // Author: Prashant Patil (prashant.patil@imgtec.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE) 17 18 #include <assert.h> 19 20 #include "src/utils/rescaler_utils.h" 21 #include "src/dsp/msa_macro.h" 22 23 #define ROUNDER (WEBP_RESCALER_ONE >> 1) 24 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) 25 #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) 26 27 #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \ 28 v4u32 tmp0, tmp1, tmp2, tmp3; \ 29 v16u8 t0, t1, t2, t3, t4, t5; \ 30 v2u64 out0, out1, out2, out3; \ 31 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 32 ILVRL_W2_UW(zero, in1, tmp2, tmp3); \ 33 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 34 DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \ 35 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 36 PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \ 37 ILVRL_W2_UW(zero, in2, tmp0, tmp1); \ 38 ILVRL_W2_UW(zero, in3, tmp2, tmp3); \ 39 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 40 DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \ 41 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 42 PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \ 43 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \ 44 dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \ 45 } while (0) 46 47 #define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \ 48 v4u32 tmp0, tmp1; \ 49 v16i8 t0, t1; \ 50 v2u64 out0, out1; \ 51 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 52 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 53 SRAR_D2_UD(out0, out1, shift); \ 54 t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \ 55 t1 = __msa_pckev_b(t0, t0); \ 56 t0 = __msa_pckev_b(t1, t1); \ 57 dst = __msa_copy_s_w((v4i32)t0, 0); \ 58 } while (0) 59 60 #define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \ 61 dst0, dst1, dst2, dst3) do { \ 62 v4u32 tmp0, tmp1, tmp2, tmp3; \ 63 v2u64 out0, out1, out2, out3; \ 64 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 65 ILVRL_W2_UW(zero, in1, tmp2, tmp3); \ 66 DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \ 67 DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \ 68 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 69 PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \ 70 ILVRL_W2_UW(zero, in2, tmp0, tmp1); \ 71 ILVRL_W2_UW(zero, in3, tmp2, tmp3); \ 72 DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \ 73 DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \ 74 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 75 PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \ 76 } while (0) 77 78 #define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \ 79 v4u32 tmp0, tmp1; \ 80 v2u64 out0, out1; \ 81 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 82 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 83 SRAR_D2_UD(out0, out1, shift); \ 84 dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \ 85 } while (0) 86 87 #define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \ 88 dst0, dst1) do { \ 89 v4u32 tmp0, tmp1, tmp2, tmp3; \ 90 v2u64 out0, out1, out2, out3; \ 91 ILVRL_W2_UW(in0, in2, tmp0, tmp1); \ 92 ILVRL_W2_UW(in1, in3, tmp2, tmp3); \ 93 DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \ 94 DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \ 95 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 96 DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \ 97 DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \ 98 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 99 PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \ 100 } while (0) 101 102 #define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \ 103 v4u32 tmp0, tmp1; \ 104 v2u64 out0, out1; \ 105 v16i8 t0, t1; \ 106 ILVRL_W2_UW(in0, in1, tmp0, tmp1); \ 107 DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \ 108 SRAR_D2_UD(out0, out1, shift); \ 109 DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \ 110 SRAR_D2_UD(out0, out1, shift); \ 111 t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \ 112 t1 = __msa_pckev_b(t0, t0); \ 113 t0 = __msa_pckev_b(t1, t1); \ 114 dst = __msa_copy_s_w((v4i32)t0, 0); \ 115 } while (0) 116 117 static WEBP_INLINE void ExportRowExpand_0( 118 const uint32_t* WEBP_RESTRICT frow, uint8_t* WEBP_RESTRICT dst, int length, 119 WebPRescaler* WEBP_RESTRICT const wrk) { 120 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); 121 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 122 const v4i32 zero = { 0 }; 123 124 while (length >= 16) { 125 v4u32 src0, src1, src2, src3; 126 v16u8 out; 127 LD_UW4(frow, 4, src0, src1, src2, src3); 128 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out); 129 ST_UB(out, dst); 130 length -= 16; 131 frow += 16; 132 dst += 16; 133 } 134 if (length > 0) { 135 int x_out; 136 if (length >= 12) { 137 uint32_t val0_m, val1_m, val2_m; 138 v4u32 src0, src1, src2; 139 LD_UW3(frow, 4, src0, src1, src2); 140 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 141 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 142 CALC_MULT_FIX_4(src2, scale, shift, val2_m); 143 SW3(val0_m, val1_m, val2_m, dst, 4); 144 length -= 12; 145 frow += 12; 146 dst += 12; 147 } else if (length >= 8) { 148 uint32_t val0_m, val1_m; 149 v4u32 src0, src1; 150 LD_UW2(frow, 4, src0, src1); 151 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 152 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 153 SW2(val0_m, val1_m, dst, 4); 154 length -= 8; 155 frow += 8; 156 dst += 8; 157 } else if (length >= 4) { 158 uint32_t val0_m; 159 const v4u32 src0 = LD_UW(frow); 160 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 161 SW(val0_m, dst); 162 length -= 4; 163 frow += 4; 164 dst += 4; 165 } 166 for (x_out = 0; x_out < length; ++x_out) { 167 const uint32_t J = frow[x_out]; 168 const int v = (int)MULT_FIX(J, wrk->fy_scale); 169 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 170 } 171 } 172 } 173 174 static WEBP_INLINE void ExportRowExpand_1( 175 const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow, 176 uint8_t* WEBP_RESTRICT dst, int length, 177 WebPRescaler* WEBP_RESTRICT const wrk) { 178 const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); 179 const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); 180 const v4i32 B1 = __msa_fill_w(B); 181 const v4i32 A1 = __msa_fill_w(A); 182 const v4i32 AB = __msa_ilvr_w(A1, B1); 183 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); 184 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 185 186 while (length >= 16) { 187 v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3; 188 v16u8 t0, t1, t2, t3, t4, t5; 189 LD_UW4(frow, 4, frow0, frow1, frow2, frow3); 190 LD_UW4(irow, 4, irow0, irow1, irow2, irow3); 191 CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1); 192 CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3); 193 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); 194 t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); 195 ST_UB(t0, dst); 196 frow += 16; 197 irow += 16; 198 dst += 16; 199 length -= 16; 200 } 201 if (length > 0) { 202 int x_out; 203 if (length >= 12) { 204 uint32_t val0_m, val1_m, val2_m; 205 v4u32 frow0, frow1, frow2, irow0, irow1, irow2; 206 LD_UW3(frow, 4, frow0, frow1, frow2); 207 LD_UW3(irow, 4, irow0, irow1, irow2); 208 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 209 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); 210 CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m); 211 SW3(val0_m, val1_m, val2_m, dst, 4); 212 frow += 12; 213 irow += 12; 214 dst += 12; 215 length -= 12; 216 } else if (length >= 8) { 217 uint32_t val0_m, val1_m; 218 v4u32 frow0, frow1, irow0, irow1; 219 LD_UW2(frow, 4, frow0, frow1); 220 LD_UW2(irow, 4, irow0, irow1); 221 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 222 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); 223 SW2(val0_m, val1_m, dst, 4); 224 frow += 4; 225 irow += 4; 226 dst += 4; 227 length -= 4; 228 } else if (length >= 4) { 229 uint32_t val0_m; 230 const v4u32 frow0 = LD_UW(frow + 0); 231 const v4u32 irow0 = LD_UW(irow + 0); 232 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 233 SW(val0_m, dst); 234 frow += 4; 235 irow += 4; 236 dst += 4; 237 length -= 4; 238 } 239 for (x_out = 0; x_out < length; ++x_out) { 240 const uint64_t I = (uint64_t)A * frow[x_out] 241 + (uint64_t)B * irow[x_out]; 242 const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); 243 const int v = (int)MULT_FIX(J, wrk->fy_scale); 244 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 245 } 246 } 247 } 248 249 static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { 250 uint8_t* dst = wrk->dst; 251 rescaler_t* irow = wrk->irow; 252 const int x_out_max = wrk->dst_width * wrk->num_channels; 253 const rescaler_t* frow = wrk->frow; 254 assert(!WebPRescalerOutputDone(wrk)); 255 assert(wrk->y_accum <= 0); 256 assert(wrk->y_expand); 257 assert(wrk->y_sub != 0); 258 if (wrk->y_accum == 0) { 259 ExportRowExpand_0(frow, dst, x_out_max, wrk); 260 } else { 261 ExportRowExpand_1(frow, irow, dst, x_out_max, wrk); 262 } 263 } 264 265 #if 0 // disabled for now. TODO(skal): make match the C-code 266 static WEBP_INLINE void ExportRowShrink_0( 267 const uint32_t* WEBP_RESTRICT frow, uint32_t* WEBP_RESTRICT irow, 268 uint8_t* WEBP_RESTRICT dst, int length, const uint32_t yscale, 269 WebPRescaler* WEBP_RESTRICT const wrk) { 270 const v4u32 y_scale = (v4u32)__msa_fill_w(yscale); 271 const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale); 272 const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 273 const v4i32 zero = { 0 }; 274 275 while (length >= 16) { 276 v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3; 277 v16u8 out; 278 LD_UW4(frow, 4, src0, src1, src2, src3); 279 CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval, 280 frac0, frac1, frac2, frac3); 281 LD_UW4(irow, 4, src0, src1, src2, src3); 282 SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3, 283 src0, src1, src2, src3); 284 CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out); 285 ST_UB(out, dst); 286 ST_UW4(frac0, frac1, frac2, frac3, irow, 4); 287 frow += 16; 288 irow += 16; 289 dst += 16; 290 length -= 16; 291 } 292 if (length > 0) { 293 int x_out; 294 if (length >= 12) { 295 uint32_t val0_m, val1_m, val2_m; 296 v4u32 src0, src1, src2, frac0, frac1, frac2; 297 LD_UW3(frow, 4, src0, src1, src2); 298 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 299 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); 300 CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2); 301 LD_UW3(irow, 4, src0, src1, src2); 302 SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2); 303 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 304 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); 305 CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m); 306 SW3(val0_m, val1_m, val2_m, dst, 4); 307 ST_UW3(frac0, frac1, frac2, irow, 4); 308 frow += 12; 309 irow += 12; 310 dst += 12; 311 length -= 12; 312 } else if (length >= 8) { 313 uint32_t val0_m, val1_m; 314 v4u32 src0, src1, frac0, frac1; 315 LD_UW2(frow, 4, src0, src1); 316 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 317 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); 318 LD_UW2(irow, 4, src0, src1); 319 SUB2(src0, frac0, src1, frac1, src0, src1); 320 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 321 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); 322 SW2(val0_m, val1_m, dst, 4); 323 ST_UW2(frac0, frac1, irow, 4); 324 frow += 8; 325 irow += 8; 326 dst += 8; 327 length -= 8; 328 } else if (length >= 4) { 329 uint32_t val0_m; 330 v4u32 frac0; 331 v4u32 src0 = LD_UW(frow); 332 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 333 src0 = LD_UW(irow); 334 src0 = src0 - frac0; 335 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 336 SW(val0_m, dst); 337 ST_UW(frac0, irow); 338 frow += 4; 339 irow += 4; 340 dst += 4; 341 length -= 4; 342 } 343 for (x_out = 0; x_out < length; ++x_out) { 344 const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale); 345 const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); 346 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 347 irow[x_out] = frac; 348 } 349 } 350 } 351 352 static WEBP_INLINE void ExportRowShrink_1( 353 uint32_t* WEBP_RESTRICT irow, uint8_t* WEBP_RESTRICT dst, int length, 354 WebPRescaler* WEBP_RESTRICT const wrk) { 355 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale); 356 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 357 const v4i32 zero = { 0 }; 358 359 while (length >= 16) { 360 v4u32 src0, src1, src2, src3; 361 v16u8 dst0; 362 LD_UW4(irow, 4, src0, src1, src2, src3); 363 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0); 364 ST_UB(dst0, dst); 365 ST_SW4(zero, zero, zero, zero, irow, 4); 366 length -= 16; 367 irow += 16; 368 dst += 16; 369 } 370 if (length > 0) { 371 int x_out; 372 if (length >= 12) { 373 uint32_t val0_m, val1_m, val2_m; 374 v4u32 src0, src1, src2; 375 LD_UW3(irow, 4, src0, src1, src2); 376 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 377 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 378 CALC_MULT_FIX_4(src2, scale, shift, val2_m); 379 SW3(val0_m, val1_m, val2_m, dst, 4); 380 ST_SW3(zero, zero, zero, irow, 4); 381 length -= 12; 382 irow += 12; 383 dst += 12; 384 } else if (length >= 8) { 385 uint32_t val0_m, val1_m; 386 v4u32 src0, src1; 387 LD_UW2(irow, 4, src0, src1); 388 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 389 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 390 SW2(val0_m, val1_m, dst, 4); 391 ST_SW2(zero, zero, irow, 4); 392 length -= 8; 393 irow += 8; 394 dst += 8; 395 } else if (length >= 4) { 396 uint32_t val0_m; 397 const v4u32 src0 = LD_UW(irow + 0); 398 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 399 SW(val0_m, dst); 400 ST_SW(zero, irow); 401 length -= 4; 402 irow += 4; 403 dst += 4; 404 } 405 for (x_out = 0; x_out < length; ++x_out) { 406 const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); 407 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 408 irow[x_out] = 0; 409 } 410 } 411 } 412 413 static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { 414 uint8_t* dst = wrk->dst; 415 rescaler_t* irow = wrk->irow; 416 const int x_out_max = wrk->dst_width * wrk->num_channels; 417 const rescaler_t* frow = wrk->frow; 418 const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum); 419 assert(!WebPRescalerOutputDone(wrk)); 420 assert(wrk->y_accum <= 0); 421 assert(!wrk->y_expand); 422 if (yscale) { 423 ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk); 424 } else { 425 ExportRowShrink_1(irow, dst, x_out_max, wrk); 426 } 427 } 428 #endif // 0 429 430 //------------------------------------------------------------------------------ 431 // Entry point 432 433 extern void WebPRescalerDspInitMSA(void); 434 435 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { 436 WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2; 437 // WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; 438 } 439 440 #else // !WEBP_USE_MSA 441 442 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA) 443 444 #endif // WEBP_USE_MSA