// enc_msa.c (34626B)
1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MSA version of encoder dsp functions. 11 // 12 // Author: Prashant Patil (prashant.patil@imgtec.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_MSA) 17 18 #include <stdlib.h> 19 #include "src/dsp/msa_macro.h" 20 #include "src/enc/vp8i_enc.h" 21 22 //------------------------------------------------------------------------------ 23 // Transforms 24 25 #define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \ 26 v4i32 a1_m, b1_m, c1_m, d1_m; \ 27 const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \ 28 const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \ 29 v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \ 30 v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \ 31 v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \ 32 v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \ 33 \ 34 ADDSUB2(in0, in2, a1_m, b1_m); \ 35 SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \ 36 c_tmp2_m = c_tmp2_m + in3; \ 37 c1_m = c_tmp1_m - c_tmp2_m; \ 38 SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \ 39 d_tmp1_m = d_tmp1_m + in1; \ 40 d1_m = d_tmp1_m + d_tmp2_m; \ 41 BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ 42 } while (0) 43 44 static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref, 45 const int16_t* WEBP_RESTRICT in, 46 uint8_t* WEBP_RESTRICT dst) { 47 v8i16 input0, input1; 48 v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; 49 v4i32 res0, res1, res2, res3; 50 v16i8 dest0, dest1, dest2, dest3; 51 const v16i8 zero = { 0 }; 52 53 LD_SH2(in, 8, input0, input1); 54 UNPCK_SH_SW(input0, in0, in1); 55 
UNPCK_SH_SW(input1, in2, in3); 56 IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); 57 TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); 58 IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); 59 SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); 60 TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); 61 LD_SB4(ref, BPS, dest0, dest1, dest2, dest3); 62 ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, 63 res0, res1, res2, res3); 64 ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, 65 res0, res1, res2, res3); 66 ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); 67 CLIP_SW4_0_255(res0, res1, res2, res3); 68 PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); 69 res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); 70 ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); 71 } 72 73 static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref, 74 const int16_t* WEBP_RESTRICT in, 75 uint8_t* WEBP_RESTRICT dst, int do_two) { 76 ITransformOne(ref, in, dst); 77 if (do_two) { 78 ITransformOne(ref + 4, in + 16, dst + 4); 79 } 80 } 81 82 static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src, 83 const uint8_t* WEBP_RESTRICT ref, 84 int16_t* WEBP_RESTRICT out) { 85 uint64_t out0, out1, out2, out3; 86 uint32_t in0, in1, in2, in3; 87 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; 88 v8i16 t0, t1, t2, t3; 89 v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 }; 90 const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 }; 91 const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 }; 92 const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 }; 93 const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 }; 94 const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 }; 95 const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 }; 96 97 LW4(src, BPS, in0, in1, in2, in3); 98 INSERT_W4_UB(in0, in1, in2, in3, src0); 99 LW4(ref, BPS, in0, in1, in2, in3); 100 INSERT_W4_UB(in0, in1, in2, in3, src1); 101 ILVRL_B2_UB(src0, src1, srcl0, srcl1); 102 HSUB_UB2_SH(srcl0, srcl1, t0, 
t1); 103 VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3); 104 ADDSUB2(t2, t3, t0, t1); 105 t0 = SRLI_H(t0, 3); 106 VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2); 107 tmp0 = __msa_hadd_s_w(t3, t3); 108 tmp2 = __msa_hsub_s_w(t3, t3); 109 FILL_W2_SW(1812, 937, tmp1, tmp3); 110 DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1); 111 SRAI_W2_SW(tmp1, tmp3, 9); 112 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1); 113 VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3); 114 ADDSUB2(t2, t3, t0, t1); 115 VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2); 116 tmp0 = __msa_hadd_s_w(t3, t3); 117 tmp2 = __msa_hsub_s_w(t3, t3); 118 ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2); 119 SRAI_W2_SW(tmp0, tmp2, 4); 120 FILL_W2_SW(12000, 51000, tmp1, tmp3); 121 DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1); 122 SRAI_W2_SW(tmp1, tmp3, 16); 123 UNPCK_R_SH_SW(t1, tmp4); 124 tmp5 = __msa_ceqi_w(tmp4, 0); 125 tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5); 126 tmp5 = __msa_fill_w(1); 127 tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4); 128 tmp1 += tmp5; 129 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1); 130 out0 = __msa_copy_s_d((v2i64)t0, 0); 131 out1 = __msa_copy_s_d((v2i64)t0, 1); 132 out2 = __msa_copy_s_d((v2i64)t1, 0); 133 out3 = __msa_copy_s_d((v2i64)t1, 1); 134 SD4(out0, out1, out2, out3, out, 8); 135 } 136 137 static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in, 138 int16_t* WEBP_RESTRICT out) { 139 v8i16 in0 = { 0 }; 140 v8i16 in1 = { 0 }; 141 v8i16 tmp0, tmp1, tmp2, tmp3; 142 v8i16 out0, out1; 143 const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; 144 const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; 145 const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; 146 const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; 147 148 in0 = __msa_insert_h(in0, 0, in[ 0]); 149 in0 = __msa_insert_h(in0, 1, in[ 64]); 150 in0 = __msa_insert_h(in0, 2, in[128]); 151 in0 = __msa_insert_h(in0, 3, in[192]); 152 in0 = __msa_insert_h(in0, 4, in[ 16]); 153 in0 = __msa_insert_h(in0, 5, in[ 80]); 154 in0 = 
__msa_insert_h(in0, 6, in[144]); 155 in0 = __msa_insert_h(in0, 7, in[208]); 156 in1 = __msa_insert_h(in1, 0, in[ 48]); 157 in1 = __msa_insert_h(in1, 1, in[112]); 158 in1 = __msa_insert_h(in1, 2, in[176]); 159 in1 = __msa_insert_h(in1, 3, in[240]); 160 in1 = __msa_insert_h(in1, 4, in[ 32]); 161 in1 = __msa_insert_h(in1, 5, in[ 96]); 162 in1 = __msa_insert_h(in1, 6, in[160]); 163 in1 = __msa_insert_h(in1, 7, in[224]); 164 ADDSUB2(in0, in1, tmp0, tmp1); 165 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); 166 ADDSUB2(tmp2, tmp3, tmp0, tmp1); 167 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); 168 ADDSUB2(in0, in1, tmp0, tmp1); 169 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); 170 ADDSUB2(tmp2, tmp3, out0, out1); 171 SRAI_H2_SH(out0, out1, 1); 172 ST_SH2(out0, out1, out, 8); 173 } 174 175 static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in, 176 const uint16_t* WEBP_RESTRICT w) { 177 int sum; 178 uint32_t in0_m, in1_m, in2_m, in3_m; 179 v16i8 src0 = { 0 }; 180 v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3; 181 v4i32 dst0, dst1; 182 const v16i8 zero = { 0 }; 183 const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; 184 const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; 185 const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; 186 const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; 187 188 LW4(in, BPS, in0_m, in1_m, in2_m, in3_m); 189 INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0); 190 ILVRL_B2_SH(zero, src0, tmp0, tmp1); 191 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); 192 ADDSUB2(in0, in1, tmp0, tmp1); 193 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); 194 ADDSUB2(tmp2, tmp3, tmp0, tmp1); 195 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); 196 ADDSUB2(in0, in1, tmp0, tmp1); 197 VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); 198 ADDSUB2(tmp2, tmp3, tmp0, tmp1); 199 tmp0 = __msa_add_a_h(tmp0, (v8i16)zero); 200 tmp1 = __msa_add_a_h(tmp1, (v8i16)zero); 201 LD_SH2(w, 8, tmp2, tmp3); 202 
DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1); 203 dst0 = dst0 + dst1; 204 sum = HADD_SW_S32(dst0); 205 return sum; 206 } 207 208 static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a, 209 const uint8_t* WEBP_RESTRICT const b, 210 const uint16_t* WEBP_RESTRICT const w) { 211 const int sum1 = TTransform_MSA(a, w); 212 const int sum2 = TTransform_MSA(b, w); 213 return abs(sum2 - sum1) >> 5; 214 } 215 216 static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a, 217 const uint8_t* WEBP_RESTRICT const b, 218 const uint16_t* WEBP_RESTRICT const w) { 219 int D = 0; 220 int x, y; 221 for (y = 0; y < 16 * BPS; y += 4 * BPS) { 222 for (x = 0; x < 16; x += 4) { 223 D += Disto4x4_MSA(a + x + y, b + x + y, w); 224 } 225 } 226 return D; 227 } 228 229 //------------------------------------------------------------------------------ 230 // Histogram 231 232 static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred, 233 int start_block, int end_block, 234 VP8Histogram* const histo) { 235 int j; 236 int distribution[MAX_COEFF_THRESH + 1] = { 0 }; 237 for (j = start_block; j < end_block; ++j) { 238 int16_t out[16]; 239 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 240 { 241 int k; 242 v8i16 coeff0, coeff1; 243 const v8i16 zero = { 0 }; 244 const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH); 245 LD_SH2(&out[0], 8, coeff0, coeff1); 246 coeff0 = __msa_add_a_h(coeff0, zero); 247 coeff1 = __msa_add_a_h(coeff1, zero); 248 SRAI_H2_SH(coeff0, coeff1, 3); 249 coeff0 = __msa_min_s_h(coeff0, max_coeff_thr); 250 coeff1 = __msa_min_s_h(coeff1, max_coeff_thr); 251 ST_SH2(coeff0, coeff1, &out[0], 8); 252 for (k = 0; k < 16; ++k) { 253 ++distribution[out[k]]; 254 } 255 } 256 } 257 VP8SetHistogramData(distribution, histo); 258 } 259 260 //------------------------------------------------------------------------------ 261 // Intra predictions 262 263 // luma 4x4 prediction 264 265 #define DST(x, y) dst[(x) + (y) * BPS] 266 #define AVG3(a, b, c) 
(((a) + 2 * (b) + (c) + 2) >> 2) 267 #define AVG2(a, b) (((a) + (b) + 1) >> 1) 268 269 // vertical 270 static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst, 271 const uint8_t* WEBP_RESTRICT top) { 272 const v16u8 A1 = { 0 }; 273 const uint64_t val_m = LD(top - 1); 274 const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); 275 const v16u8 B = SLDI_UB(A, A, 1); 276 const v16u8 C = SLDI_UB(A, A, 2); 277 const v16u8 AC = __msa_ave_u_b(A, C); 278 const v16u8 B2 = __msa_ave_u_b(B, B); 279 const v16u8 R = __msa_aver_u_b(AC, B2); 280 const uint32_t out = __msa_copy_s_w((v4i32)R, 0); 281 SW4(out, out, out, out, dst, BPS); 282 } 283 284 // horizontal 285 static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst, 286 const uint8_t* WEBP_RESTRICT top) { 287 const int X = top[-1]; 288 const int I = top[-2]; 289 const int J = top[-3]; 290 const int K = top[-4]; 291 const int L = top[-5]; 292 WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); 293 WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); 294 WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); 295 WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); 296 } 297 298 static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst, 299 const uint8_t* WEBP_RESTRICT top) { 300 uint32_t dc = 4; 301 int i; 302 for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; 303 dc >>= 3; 304 dc = dc | (dc << 8) | (dc << 16) | (dc << 24); 305 SW4(dc, dc, dc, dc, dst, BPS); 306 } 307 308 static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst, 309 const uint8_t* WEBP_RESTRICT top) { 310 const v16u8 A2 = { 0 }; 311 const uint64_t val_m = LD(top - 5); 312 const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m); 313 const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]); 314 const v16u8 B = SLDI_UB(A, A, 1); 315 const v16u8 C = SLDI_UB(A, A, 2); 316 const v16u8 AC = __msa_ave_u_b(A, C); 317 const v16u8 B2 = __msa_ave_u_b(B, B); 318 const v16u8 R0 = __msa_aver_u_b(AC, B2); 319 const v16u8 R1 = SLDI_UB(R0, R0, 
1); 320 const v16u8 R2 = SLDI_UB(R1, R1, 1); 321 const v16u8 R3 = SLDI_UB(R2, R2, 1); 322 const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0); 323 const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0); 324 const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0); 325 const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0); 326 SW4(val3, val2, val1, val0, dst, BPS); 327 } 328 329 static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst, 330 const uint8_t* WEBP_RESTRICT top) { 331 const v16u8 A1 = { 0 }; 332 const uint64_t val_m = LD(top); 333 const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m); 334 const v16u8 B = SLDI_UB(A, A, 1); 335 const v16u8 C1 = SLDI_UB(A, A, 2); 336 const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]); 337 const v16u8 AC = __msa_ave_u_b(A, C); 338 const v16u8 B2 = __msa_ave_u_b(B, B); 339 const v16u8 R0 = __msa_aver_u_b(AC, B2); 340 const v16u8 R1 = SLDI_UB(R0, R0, 1); 341 const v16u8 R2 = SLDI_UB(R1, R1, 1); 342 const v16u8 R3 = SLDI_UB(R2, R2, 1); 343 const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0); 344 const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0); 345 const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0); 346 const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0); 347 SW4(val0, val1, val2, val3, dst, BPS); 348 } 349 350 static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst, 351 const uint8_t* WEBP_RESTRICT top) { 352 const int X = top[-1]; 353 const int I = top[-2]; 354 const int J = top[-3]; 355 const int K = top[-4]; 356 const int A = top[0]; 357 const int B = top[1]; 358 const int C = top[2]; 359 const int D = top[3]; 360 DST(0, 0) = DST(1, 2) = AVG2(X, A); 361 DST(1, 0) = DST(2, 2) = AVG2(A, B); 362 DST(2, 0) = DST(3, 2) = AVG2(B, C); 363 DST(3, 0) = AVG2(C, D); 364 DST(0, 3) = AVG3(K, J, I); 365 DST(0, 2) = AVG3(J, I, X); 366 DST(0, 1) = DST(1, 3) = AVG3(I, X, A); 367 DST(1, 1) = DST(2, 3) = AVG3(X, A, B); 368 DST(2, 1) = DST(3, 3) = AVG3(A, B, C); 369 DST(3, 1) = AVG3(B, C, D); 370 } 371 372 static WEBP_INLINE void VL4(uint8_t* 
WEBP_RESTRICT dst, 373 const uint8_t* WEBP_RESTRICT top) { 374 const int A = top[0]; 375 const int B = top[1]; 376 const int C = top[2]; 377 const int D = top[3]; 378 const int E = top[4]; 379 const int F = top[5]; 380 const int G = top[6]; 381 const int H = top[7]; 382 DST(0, 0) = AVG2(A, B); 383 DST(1, 0) = DST(0, 2) = AVG2(B, C); 384 DST(2, 0) = DST(1, 2) = AVG2(C, D); 385 DST(3, 0) = DST(2, 2) = AVG2(D, E); 386 DST(0, 1) = AVG3(A, B, C); 387 DST(1, 1) = DST(0, 3) = AVG3(B, C, D); 388 DST(2, 1) = DST(1, 3) = AVG3(C, D, E); 389 DST(3, 1) = DST(2, 3) = AVG3(D, E, F); 390 DST(3, 2) = AVG3(E, F, G); 391 DST(3, 3) = AVG3(F, G, H); 392 } 393 394 static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst, 395 const uint8_t* WEBP_RESTRICT top) { 396 const int I = top[-2]; 397 const int J = top[-3]; 398 const int K = top[-4]; 399 const int L = top[-5]; 400 DST(0, 0) = AVG2(I, J); 401 DST(2, 0) = DST(0, 1) = AVG2(J, K); 402 DST(2, 1) = DST(0, 2) = AVG2(K, L); 403 DST(1, 0) = AVG3(I, J, K); 404 DST(3, 0) = DST(1, 1) = AVG3(J, K, L); 405 DST(3, 1) = DST(1, 2) = AVG3(K, L, L); 406 DST(3, 2) = DST(2, 2) = 407 DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; 408 } 409 410 static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst, 411 const uint8_t* WEBP_RESTRICT top) { 412 const int X = top[-1]; 413 const int I = top[-2]; 414 const int J = top[-3]; 415 const int K = top[-4]; 416 const int L = top[-5]; 417 const int A = top[0]; 418 const int B = top[1]; 419 const int C = top[2]; 420 DST(0, 0) = DST(2, 1) = AVG2(I, X); 421 DST(0, 1) = DST(2, 2) = AVG2(J, I); 422 DST(0, 2) = DST(2, 3) = AVG2(K, J); 423 DST(0, 3) = AVG2(L, K); 424 DST(3, 0) = AVG3(A, B, C); 425 DST(2, 0) = AVG3(X, A, B); 426 DST(1, 0) = DST(3, 1) = AVG3(I, X, A); 427 DST(1, 1) = DST(3, 2) = AVG3(J, I, X); 428 DST(1, 2) = DST(3, 3) = AVG3(K, J, I); 429 DST(1, 3) = AVG3(L, K, J); 430 } 431 432 static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst, 433 const uint8_t* WEBP_RESTRICT top) { 434 const v16i8 zero = { 0 }; 435 
const v8i16 TL = (v8i16)__msa_fill_h(top[-1]); 436 const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]); 437 const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]); 438 const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]); 439 const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]); 440 const v16u8 T1 = LD_UB(top); 441 const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1); 442 const v8i16 d = T - TL; 443 v8i16 r0, r1, r2, r3; 444 ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3); 445 CLIP_SH4_0_255(r0, r1, r2, r3); 446 PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS); 447 } 448 449 #undef DST 450 #undef AVG3 451 #undef AVG2 452 453 static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst, 454 const uint8_t* WEBP_RESTRICT top) { 455 DC4(I4DC4 + dst, top); 456 TM4(I4TM4 + dst, top); 457 VE4(I4VE4 + dst, top); 458 HE4(I4HE4 + dst, top); 459 RD4(I4RD4 + dst, top); 460 VR4(I4VR4 + dst, top); 461 LD4(I4LD4 + dst, top); 462 VL4(I4VL4 + dst, top); 463 HD4(I4HD4 + dst, top); 464 HU4(I4HU4 + dst, top); 465 } 466 467 // luma 16x16 prediction 468 469 #define STORE16x16(out, dst) do { \ 470 ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \ 471 ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \ 472 } while (0) 473 474 static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst, 475 const uint8_t* WEBP_RESTRICT top) { 476 if (top != NULL) { 477 const v16u8 out = LD_UB(top); 478 STORE16x16(out, dst); 479 } else { 480 const v16u8 out = (v16u8)__msa_fill_b(0x7f); 481 STORE16x16(out, dst); 482 } 483 } 484 485 static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst, 486 const uint8_t* WEBP_RESTRICT left) { 487 if (left != NULL) { 488 int j; 489 for (j = 0; j < 16; j += 4) { 490 const v16u8 L0 = (v16u8)__msa_fill_b(left[0]); 491 const v16u8 L1 = (v16u8)__msa_fill_b(left[1]); 492 const v16u8 L2 = (v16u8)__msa_fill_b(left[2]); 493 const v16u8 L3 = (v16u8)__msa_fill_b(left[3]); 494 ST_UB4(L0, L1, L2, L3, dst, BPS); 495 dst += 4 * BPS; 496 left += 4; 497 } 498 } else { 
499 const v16u8 out = (v16u8)__msa_fill_b(0x81); 500 STORE16x16(out, dst); 501 } 502 } 503 504 static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst, 505 const uint8_t* WEBP_RESTRICT left, 506 const uint8_t* WEBP_RESTRICT top) { 507 if (left != NULL) { 508 if (top != NULL) { 509 int j; 510 v8i16 d1, d2; 511 const v16i8 zero = { 0 }; 512 const v8i16 TL = (v8i16)__msa_fill_h(left[-1]); 513 const v16u8 T = LD_UB(top); 514 ILVRL_B2_SH(zero, T, d1, d2); 515 SUB2(d1, TL, d2, TL, d1, d2); 516 for (j = 0; j < 16; j += 4) { 517 v16i8 t0, t1, t2, t3; 518 v8i16 r0, r1, r2, r3, r4, r5, r6, r7; 519 const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]); 520 const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]); 521 const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]); 522 const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]); 523 ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3); 524 ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7); 525 CLIP_SH4_0_255(r0, r1, r2, r3); 526 CLIP_SH4_0_255(r4, r5, r6, r7); 527 PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3); 528 ST_SB4(t0, t1, t2, t3, dst, BPS); 529 dst += 4 * BPS; 530 } 531 } else { 532 HorizontalPred16x16(dst, left); 533 } 534 } else { 535 if (top != NULL) { 536 VerticalPred16x16(dst, top); 537 } else { 538 const v16u8 out = (v16u8)__msa_fill_b(0x81); 539 STORE16x16(out, dst); 540 } 541 } 542 } 543 544 static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst, 545 const uint8_t* WEBP_RESTRICT left, 546 const uint8_t* WEBP_RESTRICT top) { 547 int DC; 548 v16u8 out; 549 if (top != NULL && left != NULL) { 550 const v16u8 rtop = LD_UB(top); 551 const v8u16 dctop = __msa_hadd_u_h(rtop, rtop); 552 const v16u8 rleft = LD_UB(left); 553 const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft); 554 const v8u16 dctemp = dctop + dcleft; 555 DC = HADD_UH_U32(dctemp); 556 DC = (DC + 16) >> 5; 557 } else if (left != NULL) { // left but no top 558 const v16u8 rleft = LD_UB(left); 559 const v8u16 dcleft = __msa_hadd_u_h(rleft, 
rleft); 560 DC = HADD_UH_U32(dcleft); 561 DC = (DC + DC + 16) >> 5; 562 } else if (top != NULL) { // top but no left 563 const v16u8 rtop = LD_UB(top); 564 const v8u16 dctop = __msa_hadd_u_h(rtop, rtop); 565 DC = HADD_UH_U32(dctop); 566 DC = (DC + DC + 16) >> 5; 567 } else { // no top, no left, nothing. 568 DC = 0x80; 569 } 570 out = (v16u8)__msa_fill_b(DC); 571 STORE16x16(out, dst); 572 } 573 574 static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst, 575 const uint8_t* WEBP_RESTRICT left, 576 const uint8_t* WEBP_RESTRICT top) { 577 DCMode16x16(I16DC16 + dst, left, top); 578 VerticalPred16x16(I16VE16 + dst, top); 579 HorizontalPred16x16(I16HE16 + dst, left); 580 TrueMotion16x16(I16TM16 + dst, left, top); 581 } 582 583 // Chroma 8x8 prediction 584 585 #define CALC_DC8(in, out) do { \ 586 const v8u16 temp0 = __msa_hadd_u_h(in, in); \ 587 const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \ 588 const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \ 589 const v2i64 temp3 = __msa_splati_d(temp2, 1); \ 590 const v2i64 temp4 = temp3 + temp2; \ 591 const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \ 592 const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \ 593 out = __msa_copy_s_d(temp6, 0); \ 594 } while (0) 595 596 #define STORE8x8(out, dst) do { \ 597 SD4(out, out, out, out, dst + 0 * BPS, BPS); \ 598 SD4(out, out, out, out, dst + 4 * BPS, BPS); \ 599 } while (0) 600 601 static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst, 602 const uint8_t* WEBP_RESTRICT top) { 603 if (top != NULL) { 604 const uint64_t out = LD(top); 605 STORE8x8(out, dst); 606 } else { 607 const uint64_t out = 0x7f7f7f7f7f7f7f7fULL; 608 STORE8x8(out, dst); 609 } 610 } 611 612 static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst, 613 const uint8_t* WEBP_RESTRICT left) { 614 if (left != NULL) { 615 int j; 616 for (j = 0; j < 8; j += 4) { 617 const v16u8 L0 = (v16u8)__msa_fill_b(left[0]); 618 const v16u8 L1 = (v16u8)__msa_fill_b(left[1]); 619 const v16u8 L2 = 
(v16u8)__msa_fill_b(left[2]); 620 const v16u8 L3 = (v16u8)__msa_fill_b(left[3]); 621 const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0); 622 const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0); 623 const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0); 624 const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0); 625 SD4(out0, out1, out2, out3, dst, BPS); 626 dst += 4 * BPS; 627 left += 4; 628 } 629 } else { 630 const uint64_t out = 0x8181818181818181ULL; 631 STORE8x8(out, dst); 632 } 633 } 634 635 static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst, 636 const uint8_t* WEBP_RESTRICT left, 637 const uint8_t* WEBP_RESTRICT top) { 638 if (left != NULL) { 639 if (top != NULL) { 640 int j; 641 const v8i16 TL = (v8i16)__msa_fill_h(left[-1]); 642 const v16u8 T1 = LD_UB(top); 643 const v16i8 zero = { 0 }; 644 const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1); 645 const v8i16 d = T - TL; 646 for (j = 0; j < 8; j += 4) { 647 uint64_t out0, out1, out2, out3; 648 v16i8 t0, t1; 649 v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]); 650 v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]); 651 v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]); 652 v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]); 653 ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3); 654 CLIP_SH4_0_255(r0, r1, r2, r3); 655 PCKEV_B2_SB(r1, r0, r3, r2, t0, t1); 656 out0 = __msa_copy_s_d((v2i64)t0, 0); 657 out1 = __msa_copy_s_d((v2i64)t0, 1); 658 out2 = __msa_copy_s_d((v2i64)t1, 0); 659 out3 = __msa_copy_s_d((v2i64)t1, 1); 660 SD4(out0, out1, out2, out3, dst, BPS); 661 dst += 4 * BPS; 662 } 663 } else { 664 HorizontalPred8x8(dst, left); 665 } 666 } else { 667 if (top != NULL) { 668 VerticalPred8x8(dst, top); 669 } else { 670 const uint64_t out = 0x8181818181818181ULL; 671 STORE8x8(out, dst); 672 } 673 } 674 } 675 676 static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst, 677 const uint8_t* WEBP_RESTRICT left, 678 const uint8_t* WEBP_RESTRICT top) { 679 uint64_t out; 680 v16u8 src = { 0 }; 681 if (top != NULL && left != NULL) { 
682 const uint64_t left_m = LD(left); 683 const uint64_t top_m = LD(top); 684 INSERT_D2_UB(left_m, top_m, src); 685 CALC_DC8(src, out); 686 } else if (left != NULL) { // left but no top 687 const uint64_t left_m = LD(left); 688 INSERT_D2_UB(left_m, left_m, src); 689 CALC_DC8(src, out); 690 } else if (top != NULL) { // top but no left 691 const uint64_t top_m = LD(top); 692 INSERT_D2_UB(top_m, top_m, src); 693 CALC_DC8(src, out); 694 } else { // no top, no left, nothing. 695 src = (v16u8)__msa_fill_b(0x80); 696 out = __msa_copy_s_d((v2i64)src, 0); 697 } 698 STORE8x8(out, dst); 699 } 700 701 static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst, 702 const uint8_t* WEBP_RESTRICT left, 703 const uint8_t* WEBP_RESTRICT top) { 704 // U block 705 DCMode8x8(C8DC8 + dst, left, top); 706 VerticalPred8x8(C8VE8 + dst, top); 707 HorizontalPred8x8(C8HE8 + dst, left); 708 TrueMotion8x8(C8TM8 + dst, left, top); 709 // V block 710 dst += 8; 711 if (top != NULL) top += 8; 712 if (left != NULL) left += 16; 713 DCMode8x8(C8DC8 + dst, left, top); 714 VerticalPred8x8(C8VE8 + dst, top); 715 HorizontalPred8x8(C8HE8 + dst, left); 716 TrueMotion8x8(C8TM8 + dst, left, top); 717 } 718 719 //------------------------------------------------------------------------------ 720 // Metric 721 722 #define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \ 723 v16u8 tmp0, tmp1; \ 724 v8i16 tmp2, tmp3; \ 725 ILVRL_B2_UB(in0, in1, tmp0, tmp1); \ 726 HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \ 727 DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \ 728 ILVRL_B2_UB(in2, in3, tmp0, tmp1); \ 729 HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \ 730 DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \ 731 } while (0) 732 733 #define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \ 734 v16u8 tmp0, tmp1; \ 735 v8i16 tmp2, tmp3; \ 736 ILVRL_B2_UB(in0, in1, tmp0, tmp1); \ 737 HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \ 738 DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \ 739 
ILVRL_B2_UB(in2, in3, tmp0, tmp1); \ 740 HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \ 741 DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \ 742 } while (0) 743 744 static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a, 745 const uint8_t* WEBP_RESTRICT b) { 746 uint32_t sum; 747 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 748 v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; 749 v4i32 out0, out1, out2, out3; 750 751 LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7); 752 LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 753 PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3); 754 PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3); 755 PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3); 756 PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3); 757 a += 8 * BPS; 758 b += 8 * BPS; 759 LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7); 760 LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 761 PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3); 762 PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3); 763 PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3); 764 PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3); 765 out0 += out1; 766 out2 += out3; 767 out0 += out2; 768 sum = HADD_SW_S32(out0); 769 return sum; 770 } 771 772 static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a, 773 const uint8_t* WEBP_RESTRICT b) { 774 uint32_t sum; 775 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 776 v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; 777 v4i32 out0, out1, out2, out3; 778 779 LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7); 780 LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 781 PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3); 782 PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3); 783 PACK_DPADD_UB4_SW(src4, ref4, src5, 
ref5, out0, out1, out2, out3); 784 PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3); 785 out0 += out1; 786 out2 += out3; 787 out0 += out2; 788 sum = HADD_SW_S32(out0); 789 return sum; 790 } 791 792 static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a, 793 const uint8_t* WEBP_RESTRICT b) { 794 uint32_t sum; 795 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 796 v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; 797 v16u8 t0, t1, t2, t3; 798 v4i32 out0, out1, out2, out3; 799 800 LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7); 801 LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); 802 ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3); 803 PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3); 804 ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3); 805 PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3); 806 out0 += out1; 807 out2 += out3; 808 out0 += out2; 809 sum = HADD_SW_S32(out0); 810 return sum; 811 } 812 813 static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a, 814 const uint8_t* WEBP_RESTRICT b) { 815 uint32_t sum = 0; 816 uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; 817 v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1; 818 v8i16 diff0, diff1; 819 v4i32 out0, out1; 820 821 LW4(a, BPS, src0, src1, src2, src3); 822 LW4(b, BPS, ref0, ref1, ref2, ref3); 823 INSERT_W4_UB(src0, src1, src2, src3, src); 824 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); 825 ILVRL_B2_UB(src, ref, tmp0, tmp1); 826 HSUB_UB2_SH(tmp0, tmp1, diff0, diff1); 827 DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1); 828 out0 += out1; 829 sum = HADD_SW_S32(out0); 830 return sum; 831 } 832 833 //------------------------------------------------------------------------------ 834 // Quantization 835 836 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16], 837 const VP8Matrix* WEBP_RESTRICT const mtx) { 838 int sum; 839 v8i16 in0, in1, sh0, sh1, out0, out1; 840 v8i16 tmp0, tmp1, 
tmp2, tmp3, tmp4, tmp5, sign0, sign1; 841 v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3; 842 const v8i16 zero = { 0 }; 843 const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 }; 844 const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 }; 845 const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL); 846 847 LD_SH2(&in[0], 8, in0, in1); 848 LD_SH2(&mtx->sharpen[0], 8, sh0, sh1); 849 tmp4 = __msa_add_a_h(in0, zero); 850 tmp5 = __msa_add_a_h(in1, zero); 851 ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1); 852 ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3); 853 HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3); 854 sign0 = (in0 < zero); 855 sign1 = (in1 < zero); // sign 856 LD_SH2(&mtx->iq[0], 8, tmp0, tmp1); // iq 857 ILVRL_H2_SW(zero, tmp0, t0, t1); 858 ILVRL_H2_SW(zero, tmp1, t2, t3); 859 LD_SW4(&mtx->bias[0], 4, b0, b1, b2, b3); // bias 860 MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3); 861 ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3); 862 SRAI_W4_SW(b0, b1, b2, b3, 17); 863 PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3); 864 tmp0 = (tmp2 > maxlevel); 865 tmp1 = (tmp3 > maxlevel); 866 tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0); 867 tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1); 868 SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1); 869 tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0); 870 tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1); 871 LD_SW4(&mtx->zthresh[0], 4, t0, t1, t2, t3); // zthresh 872 t0 = (s0 > t0); 873 t1 = (s1 > t1); 874 t2 = (s2 > t2); 875 t3 = (s3 > t3); 876 PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1); 877 tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0); 878 tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1); 879 LD_SH2(&mtx->q[0], 8, tmp0, tmp1); 880 MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1); 881 VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1); 882 ST_SH2(in0, in1, &in[0], 8); 883 ST_SH2(out0, out1, &out[0], 8); 884 out0 = __msa_add_a_h(out0, 
out1); 885 sum = HADD_SH_S32(out0); 886 return (sum > 0); 887 } 888 889 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32], 890 const VP8Matrix* WEBP_RESTRICT const mtx) { 891 int nz; 892 nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0; 893 nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1; 894 return nz; 895 } 896 897 //------------------------------------------------------------------------------ 898 // Entry point 899 900 extern void VP8EncDspInitMSA(void); 901 902 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) { 903 VP8ITransform = ITransform_MSA; 904 VP8FTransform = FTransform_MSA; 905 VP8FTransformWHT = FTransformWHT_MSA; 906 907 VP8TDisto4x4 = Disto4x4_MSA; 908 VP8TDisto16x16 = Disto16x16_MSA; 909 VP8CollectHistogram = CollectHistogram_MSA; 910 911 VP8EncPredLuma4 = Intra4Preds_MSA; 912 VP8EncPredLuma16 = Intra16Preds_MSA; 913 VP8EncPredChroma8 = IntraChromaPreds_MSA; 914 915 VP8SSE16x16 = SSE16x16_MSA; 916 VP8SSE16x8 = SSE16x8_MSA; 917 VP8SSE8x8 = SSE8x8_MSA; 918 VP8SSE4x4 = SSE4x4_MSA; 919 920 VP8EncQuantizeBlock = QuantizeBlock_MSA; 921 VP8EncQuantize2Blocks = Quantize2Blocks_MSA; 922 VP8EncQuantizeBlockWHT = QuantizeBlock_MSA; 923 } 924 925 #else // !WEBP_USE_MSA 926 927 WEBP_DSP_INIT_STUB(VP8EncDspInitMSA) 928 929 #endif // WEBP_USE_MSA