tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

enc_msa.c (34626B)


      1 // Copyright 2016 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MSA version of encoder dsp functions.
     11 //
     12 // Author:  Prashant Patil   (prashant.patil@imgtec.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_MSA)
     17 
     18 #include <stdlib.h>
     19 #include "src/dsp/msa_macro.h"
     20 #include "src/enc/vp8i_enc.h"
     21 
     22 //------------------------------------------------------------------------------
     23 // Transforms
     24 
     25 #define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
     26  v4i32 a1_m, b1_m, c1_m, d1_m;                                     \
     27  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);              \
     28  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                    \
     29  v4i32 c_tmp1_m = in1 * sinpi8sqrt2;                               \
     30  v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1;                         \
     31  v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1;                         \
     32  v4i32 d_tmp2_m = in3 * sinpi8sqrt2;                               \
     33                                                                    \
     34  ADDSUB2(in0, in2, a1_m, b1_m);                                    \
     35  SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16);                               \
     36  c_tmp2_m = c_tmp2_m + in3;                                        \
     37  c1_m = c_tmp1_m - c_tmp2_m;                                       \
     38  SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16);                               \
     39  d_tmp1_m = d_tmp1_m + in1;                                        \
     40  d1_m = d_tmp1_m + d_tmp2_m;                                       \
     41  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
     42 } while (0)
     43 
     44 static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
     45                                      const int16_t* WEBP_RESTRICT in,
     46                                      uint8_t* WEBP_RESTRICT dst) {
     47  v8i16 input0, input1;
     48  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
     49  v4i32 res0, res1, res2, res3;
     50  v16i8 dest0, dest1, dest2, dest3;
     51  const v16i8 zero = { 0 };
     52 
     53  LD_SH2(in, 8, input0, input1);
     54  UNPCK_SH_SW(input0, in0, in1);
     55  UNPCK_SH_SW(input1, in2, in3);
     56  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
     57  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
     58  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
     59  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
     60  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
     61  LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
     62  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
     63             res0, res1, res2, res3);
     64  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
     65             res0, res1, res2, res3);
     66  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
     67  CLIP_SW4_0_255(res0, res1, res2, res3);
     68  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
     69  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
     70  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
     71 }
     72 
     73 static void ITransform_MSA(const uint8_t* WEBP_RESTRICT ref,
     74                           const int16_t* WEBP_RESTRICT in,
     75                           uint8_t* WEBP_RESTRICT dst, int do_two) {
     76  ITransformOne(ref, in, dst);
     77  if (do_two) {
     78    ITransformOne(ref + 4, in + 16, dst + 4);
     79  }
     80 }
     81 
     82 static void FTransform_MSA(const uint8_t* WEBP_RESTRICT src,
     83                           const uint8_t* WEBP_RESTRICT ref,
     84                           int16_t* WEBP_RESTRICT out) {
     85  uint64_t out0, out1, out2, out3;
     86  uint32_t in0, in1, in2, in3;
     87  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
     88  v8i16 t0, t1, t2, t3;
     89  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
     90  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
     91  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
     92  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
     93  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
     94  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
     95  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
     96 
     97  LW4(src, BPS, in0, in1, in2, in3);
     98  INSERT_W4_UB(in0, in1, in2, in3, src0);
     99  LW4(ref, BPS, in0, in1, in2, in3);
    100  INSERT_W4_UB(in0, in1, in2, in3, src1);
    101  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
    102  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
    103  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
    104  ADDSUB2(t2, t3, t0, t1);
    105  t0 = SRLI_H(t0, 3);
    106  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
    107  tmp0 = __msa_hadd_s_w(t3, t3);
    108  tmp2 = __msa_hsub_s_w(t3, t3);
    109  FILL_W2_SW(1812, 937, tmp1, tmp3);
    110  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
    111  SRAI_W2_SW(tmp1, tmp3, 9);
    112  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
    113  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
    114  ADDSUB2(t2, t3, t0, t1);
    115  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
    116  tmp0 = __msa_hadd_s_w(t3, t3);
    117  tmp2 = __msa_hsub_s_w(t3, t3);
    118  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
    119  SRAI_W2_SW(tmp0, tmp2, 4);
    120  FILL_W2_SW(12000, 51000, tmp1, tmp3);
    121  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
    122  SRAI_W2_SW(tmp1, tmp3, 16);
    123  UNPCK_R_SH_SW(t1, tmp4);
    124  tmp5 = __msa_ceqi_w(tmp4, 0);
    125  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
    126  tmp5 = __msa_fill_w(1);
    127  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
    128  tmp1 += tmp5;
    129  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
    130  out0 = __msa_copy_s_d((v2i64)t0, 0);
    131  out1 = __msa_copy_s_d((v2i64)t0, 1);
    132  out2 = __msa_copy_s_d((v2i64)t1, 0);
    133  out3 = __msa_copy_s_d((v2i64)t1, 1);
    134  SD4(out0, out1, out2, out3, out, 8);
    135 }
    136 
    137 static void FTransformWHT_MSA(const int16_t* WEBP_RESTRICT in,
    138                              int16_t* WEBP_RESTRICT out) {
    139  v8i16 in0 = { 0 };
    140  v8i16 in1 = { 0 };
    141  v8i16 tmp0, tmp1, tmp2, tmp3;
    142  v8i16 out0, out1;
    143  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
    144  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
    145  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
    146  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
    147 
    148  in0 = __msa_insert_h(in0, 0, in[  0]);
    149  in0 = __msa_insert_h(in0, 1, in[ 64]);
    150  in0 = __msa_insert_h(in0, 2, in[128]);
    151  in0 = __msa_insert_h(in0, 3, in[192]);
    152  in0 = __msa_insert_h(in0, 4, in[ 16]);
    153  in0 = __msa_insert_h(in0, 5, in[ 80]);
    154  in0 = __msa_insert_h(in0, 6, in[144]);
    155  in0 = __msa_insert_h(in0, 7, in[208]);
    156  in1 = __msa_insert_h(in1, 0, in[ 48]);
    157  in1 = __msa_insert_h(in1, 1, in[112]);
    158  in1 = __msa_insert_h(in1, 2, in[176]);
    159  in1 = __msa_insert_h(in1, 3, in[240]);
    160  in1 = __msa_insert_h(in1, 4, in[ 32]);
    161  in1 = __msa_insert_h(in1, 5, in[ 96]);
    162  in1 = __msa_insert_h(in1, 6, in[160]);
    163  in1 = __msa_insert_h(in1, 7, in[224]);
    164  ADDSUB2(in0, in1, tmp0, tmp1);
    165  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
    166  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
    167  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
    168  ADDSUB2(in0, in1, tmp0, tmp1);
    169  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
    170  ADDSUB2(tmp2, tmp3, out0, out1);
    171  SRAI_H2_SH(out0, out1, 1);
    172  ST_SH2(out0, out1, out, 8);
    173 }
    174 
    175 static int TTransform_MSA(const uint8_t* WEBP_RESTRICT in,
    176                          const uint16_t* WEBP_RESTRICT w) {
    177  int sum;
    178  uint32_t in0_m, in1_m, in2_m, in3_m;
    179  v16i8 src0 = { 0 };
    180  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
    181  v4i32 dst0, dst1;
    182  const v16i8 zero = { 0 };
    183  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
    184  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
    185  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
    186  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
    187 
    188  LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
    189  INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
    190  ILVRL_B2_SH(zero, src0, tmp0, tmp1);
    191  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
    192  ADDSUB2(in0, in1, tmp0, tmp1);
    193  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
    194  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
    195  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
    196  ADDSUB2(in0, in1, tmp0, tmp1);
    197  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
    198  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
    199  tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
    200  tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
    201  LD_SH2(w, 8, tmp2, tmp3);
    202  DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
    203  dst0 = dst0 + dst1;
    204  sum = HADD_SW_S32(dst0);
    205  return sum;
    206 }
    207 
    208 static int Disto4x4_MSA(const uint8_t* WEBP_RESTRICT const a,
    209                        const uint8_t* WEBP_RESTRICT const b,
    210                        const uint16_t* WEBP_RESTRICT const w) {
    211  const int sum1 = TTransform_MSA(a, w);
    212  const int sum2 = TTransform_MSA(b, w);
    213  return abs(sum2 - sum1) >> 5;
    214 }
    215 
    216 static int Disto16x16_MSA(const uint8_t* WEBP_RESTRICT const a,
    217                          const uint8_t* WEBP_RESTRICT const b,
    218                          const uint16_t* WEBP_RESTRICT const w) {
    219  int D = 0;
    220  int x, y;
    221  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    222    for (x = 0; x < 16; x += 4) {
    223      D += Disto4x4_MSA(a + x + y, b + x + y, w);
    224    }
    225  }
    226  return D;
    227 }
    228 
    229 //------------------------------------------------------------------------------
    230 // Histogram
    231 
    232 static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
    233                                 int start_block, int end_block,
    234                                 VP8Histogram* const histo) {
    235  int j;
    236  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
    237  for (j = start_block; j < end_block; ++j) {
    238    int16_t out[16];
    239    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    240    {
    241      int k;
    242      v8i16 coeff0, coeff1;
    243      const v8i16 zero = { 0 };
    244      const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
    245      LD_SH2(&out[0], 8, coeff0, coeff1);
    246      coeff0 = __msa_add_a_h(coeff0, zero);
    247      coeff1 = __msa_add_a_h(coeff1, zero);
    248      SRAI_H2_SH(coeff0, coeff1, 3);
    249      coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
    250      coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
    251      ST_SH2(coeff0, coeff1, &out[0], 8);
    252      for (k = 0; k < 16; ++k) {
    253        ++distribution[out[k]];
    254      }
    255    }
    256  }
    257  VP8SetHistogramData(distribution, histo);
    258 }
    259 
    260 //------------------------------------------------------------------------------
    261 // Intra predictions
    262 
    263 // luma 4x4 prediction
    264 
    265 #define DST(x, y) dst[(x) + (y) * BPS]
    266 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
    267 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
    268 
    269 // vertical
    270 static WEBP_INLINE void VE4(uint8_t* WEBP_RESTRICT dst,
    271                            const uint8_t* WEBP_RESTRICT top) {
    272  const v16u8 A1 = { 0 };
    273  const uint64_t val_m = LD(top - 1);
    274  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
    275  const v16u8 B = SLDI_UB(A, A, 1);
    276  const v16u8 C = SLDI_UB(A, A, 2);
    277  const v16u8 AC = __msa_ave_u_b(A, C);
    278  const v16u8 B2 = __msa_ave_u_b(B, B);
    279  const v16u8 R = __msa_aver_u_b(AC, B2);
    280  const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
    281  SW4(out, out, out, out, dst, BPS);
    282 }
    283 
    284 // horizontal
    285 static WEBP_INLINE void HE4(uint8_t* WEBP_RESTRICT dst,
    286                            const uint8_t* WEBP_RESTRICT top) {
    287  const int X = top[-1];
    288  const int I = top[-2];
    289  const int J = top[-3];
    290  const int K = top[-4];
    291  const int L = top[-5];
    292  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
    293  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
    294  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
    295  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
    296 }
    297 
    298 static WEBP_INLINE void DC4(uint8_t* WEBP_RESTRICT dst,
    299                            const uint8_t* WEBP_RESTRICT top) {
    300  uint32_t dc = 4;
    301  int i;
    302  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
    303  dc >>= 3;
    304  dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
    305  SW4(dc, dc, dc, dc, dst, BPS);
    306 }
    307 
    308 static WEBP_INLINE void RD4(uint8_t* WEBP_RESTRICT dst,
    309                            const uint8_t* WEBP_RESTRICT top) {
    310  const v16u8 A2 = { 0 };
    311  const uint64_t val_m = LD(top - 5);
    312  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
    313  const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
    314  const v16u8 B = SLDI_UB(A, A, 1);
    315  const v16u8 C = SLDI_UB(A, A, 2);
    316  const v16u8 AC = __msa_ave_u_b(A, C);
    317  const v16u8 B2 = __msa_ave_u_b(B, B);
    318  const v16u8 R0 = __msa_aver_u_b(AC, B2);
    319  const v16u8 R1 = SLDI_UB(R0, R0, 1);
    320  const v16u8 R2 = SLDI_UB(R1, R1, 1);
    321  const v16u8 R3 = SLDI_UB(R2, R2, 1);
    322  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
    323  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
    324  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
    325  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
    326  SW4(val3, val2, val1, val0, dst, BPS);
    327 }
    328 
    329 static WEBP_INLINE void LD4(uint8_t* WEBP_RESTRICT dst,
    330                            const uint8_t* WEBP_RESTRICT top) {
    331  const v16u8 A1 = { 0 };
    332  const uint64_t val_m = LD(top);
    333  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
    334  const v16u8 B = SLDI_UB(A, A, 1);
    335  const v16u8 C1 = SLDI_UB(A, A, 2);
    336  const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
    337  const v16u8 AC = __msa_ave_u_b(A, C);
    338  const v16u8 B2 = __msa_ave_u_b(B, B);
    339  const v16u8 R0 = __msa_aver_u_b(AC, B2);
    340  const v16u8 R1 = SLDI_UB(R0, R0, 1);
    341  const v16u8 R2 = SLDI_UB(R1, R1, 1);
    342  const v16u8 R3 = SLDI_UB(R2, R2, 1);
    343  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
    344  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
    345  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
    346  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
    347  SW4(val0, val1, val2, val3, dst, BPS);
    348 }
    349 
    350 static WEBP_INLINE void VR4(uint8_t* WEBP_RESTRICT dst,
    351                            const uint8_t* WEBP_RESTRICT top) {
    352  const int X = top[-1];
    353  const int I = top[-2];
    354  const int J = top[-3];
    355  const int K = top[-4];
    356  const int A = top[0];
    357  const int B = top[1];
    358  const int C = top[2];
    359  const int D = top[3];
    360  DST(0, 0) = DST(1, 2) = AVG2(X, A);
    361  DST(1, 0) = DST(2, 2) = AVG2(A, B);
    362  DST(2, 0) = DST(3, 2) = AVG2(B, C);
    363  DST(3, 0)             = AVG2(C, D);
    364  DST(0, 3) =             AVG3(K, J, I);
    365  DST(0, 2) =             AVG3(J, I, X);
    366  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
    367  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
    368  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
    369  DST(3, 1) =             AVG3(B, C, D);
    370 }
    371 
    372 static WEBP_INLINE void VL4(uint8_t* WEBP_RESTRICT dst,
    373                            const uint8_t* WEBP_RESTRICT top) {
    374  const int A = top[0];
    375  const int B = top[1];
    376  const int C = top[2];
    377  const int D = top[3];
    378  const int E = top[4];
    379  const int F = top[5];
    380  const int G = top[6];
    381  const int H = top[7];
    382  DST(0, 0) =             AVG2(A, B);
    383  DST(1, 0) = DST(0, 2) = AVG2(B, C);
    384  DST(2, 0) = DST(1, 2) = AVG2(C, D);
    385  DST(3, 0) = DST(2, 2) = AVG2(D, E);
    386  DST(0, 1) =             AVG3(A, B, C);
    387  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
    388  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
    389  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
    390              DST(3, 2) = AVG3(E, F, G);
    391              DST(3, 3) = AVG3(F, G, H);
    392 }
    393 
    394 static WEBP_INLINE void HU4(uint8_t* WEBP_RESTRICT dst,
    395                            const uint8_t* WEBP_RESTRICT top) {
    396  const int I = top[-2];
    397  const int J = top[-3];
    398  const int K = top[-4];
    399  const int L = top[-5];
    400  DST(0, 0) =             AVG2(I, J);
    401  DST(2, 0) = DST(0, 1) = AVG2(J, K);
    402  DST(2, 1) = DST(0, 2) = AVG2(K, L);
    403  DST(1, 0) =             AVG3(I, J, K);
    404  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
    405  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
    406  DST(3, 2) = DST(2, 2) =
    407  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
    408 }
    409 
    410 static WEBP_INLINE void HD4(uint8_t* WEBP_RESTRICT dst,
    411                            const uint8_t* WEBP_RESTRICT top) {
    412  const int X = top[-1];
    413  const int I = top[-2];
    414  const int J = top[-3];
    415  const int K = top[-4];
    416  const int L = top[-5];
    417  const int A = top[0];
    418  const int B = top[1];
    419  const int C = top[2];
    420  DST(0, 0) = DST(2, 1) = AVG2(I, X);
    421  DST(0, 1) = DST(2, 2) = AVG2(J, I);
    422  DST(0, 2) = DST(2, 3) = AVG2(K, J);
    423  DST(0, 3)             = AVG2(L, K);
    424  DST(3, 0)             = AVG3(A, B, C);
    425  DST(2, 0)             = AVG3(X, A, B);
    426  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
    427  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
    428  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
    429  DST(1, 3)             = AVG3(L, K, J);
    430 }
    431 
    432 static WEBP_INLINE void TM4(uint8_t* WEBP_RESTRICT dst,
    433                            const uint8_t* WEBP_RESTRICT top) {
    434  const v16i8 zero = { 0 };
    435  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
    436  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
    437  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
    438  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
    439  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
    440  const v16u8 T1 = LD_UB(top);
    441  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
    442  const v8i16 d = T - TL;
    443  v8i16 r0, r1, r2, r3;
    444  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
    445  CLIP_SH4_0_255(r0, r1, r2, r3);
    446  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
    447 }
    448 
    449 #undef DST
    450 #undef AVG3
    451 #undef AVG2
    452 
    453 static void Intra4Preds_MSA(uint8_t* WEBP_RESTRICT dst,
    454                            const uint8_t* WEBP_RESTRICT top) {
    455  DC4(I4DC4 + dst, top);
    456  TM4(I4TM4 + dst, top);
    457  VE4(I4VE4 + dst, top);
    458  HE4(I4HE4 + dst, top);
    459  RD4(I4RD4 + dst, top);
    460  VR4(I4VR4 + dst, top);
    461  LD4(I4LD4 + dst, top);
    462  VL4(I4VL4 + dst, top);
    463  HD4(I4HD4 + dst, top);
    464  HU4(I4HU4 + dst, top);
    465 }
    466 
    467 // luma 16x16 prediction
    468 
    469 #define STORE16x16(out, dst) do {                                        \
    470    ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS);  \
    471    ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);  \
    472 } while (0)
    473 
    474 static WEBP_INLINE void VerticalPred16x16(uint8_t* WEBP_RESTRICT dst,
    475                                          const uint8_t* WEBP_RESTRICT top) {
    476  if (top != NULL) {
    477    const v16u8 out = LD_UB(top);
    478    STORE16x16(out, dst);
    479  } else {
    480    const v16u8 out = (v16u8)__msa_fill_b(0x7f);
    481    STORE16x16(out, dst);
    482  }
    483 }
    484 
    485 static WEBP_INLINE void HorizontalPred16x16(uint8_t* WEBP_RESTRICT dst,
    486                                            const uint8_t* WEBP_RESTRICT left) {
    487  if (left != NULL) {
    488    int j;
    489    for (j = 0; j < 16; j += 4) {
    490      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
    491      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
    492      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
    493      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
    494      ST_UB4(L0, L1, L2, L3, dst, BPS);
    495      dst += 4 * BPS;
    496      left += 4;
    497    }
    498  } else {
    499    const v16u8 out = (v16u8)__msa_fill_b(0x81);
    500    STORE16x16(out, dst);
    501  }
    502 }
    503 
    504 static WEBP_INLINE void TrueMotion16x16(uint8_t* WEBP_RESTRICT dst,
    505                                        const uint8_t* WEBP_RESTRICT left,
    506                                        const uint8_t* WEBP_RESTRICT top) {
    507  if (left != NULL) {
    508    if (top != NULL) {
    509      int j;
    510      v8i16 d1, d2;
    511      const v16i8 zero = { 0 };
    512      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
    513      const v16u8 T = LD_UB(top);
    514      ILVRL_B2_SH(zero, T, d1, d2);
    515      SUB2(d1, TL, d2, TL, d1, d2);
    516      for (j = 0; j < 16; j += 4) {
    517        v16i8 t0, t1, t2, t3;
    518        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
    519        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
    520        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
    521        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
    522        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
    523        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
    524        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
    525        CLIP_SH4_0_255(r0, r1, r2, r3);
    526        CLIP_SH4_0_255(r4, r5, r6, r7);
    527        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
    528        ST_SB4(t0, t1, t2, t3, dst, BPS);
    529        dst += 4 * BPS;
    530      }
    531    } else {
    532      HorizontalPred16x16(dst, left);
    533    }
    534  } else {
    535    if (top != NULL) {
    536      VerticalPred16x16(dst, top);
    537    } else {
    538      const v16u8 out = (v16u8)__msa_fill_b(0x81);
    539      STORE16x16(out, dst);
    540    }
    541  }
    542 }
    543 
    544 static WEBP_INLINE void DCMode16x16(uint8_t* WEBP_RESTRICT dst,
    545                                    const uint8_t* WEBP_RESTRICT left,
    546                                    const uint8_t* WEBP_RESTRICT top) {
    547  int DC;
    548  v16u8 out;
    549  if (top != NULL && left != NULL) {
    550    const v16u8 rtop = LD_UB(top);
    551    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    552    const v16u8 rleft = LD_UB(left);
    553    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    554    const v8u16 dctemp = dctop + dcleft;
    555    DC = HADD_UH_U32(dctemp);
    556    DC = (DC + 16) >> 5;
    557  } else if (left != NULL) {   // left but no top
    558    const v16u8 rleft = LD_UB(left);
    559    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    560    DC = HADD_UH_U32(dcleft);
    561    DC = (DC + DC + 16) >> 5;
    562  } else if (top != NULL) {   // top but no left
    563    const v16u8 rtop = LD_UB(top);
    564    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    565    DC = HADD_UH_U32(dctop);
    566    DC = (DC + DC + 16) >> 5;
    567  } else {   // no top, no left, nothing.
    568    DC = 0x80;
    569  }
    570  out = (v16u8)__msa_fill_b(DC);
    571  STORE16x16(out, dst);
    572 }
    573 
    574 static void Intra16Preds_MSA(uint8_t* WEBP_RESTRICT dst,
    575                             const uint8_t* WEBP_RESTRICT left,
    576                             const uint8_t* WEBP_RESTRICT top) {
    577  DCMode16x16(I16DC16 + dst, left, top);
    578  VerticalPred16x16(I16VE16 + dst, top);
    579  HorizontalPred16x16(I16HE16 + dst, left);
    580  TrueMotion16x16(I16TM16 + dst, left, top);
    581 }
    582 
    583 // Chroma 8x8 prediction
    584 
    585 #define CALC_DC8(in, out) do {                              \
    586  const v8u16 temp0 = __msa_hadd_u_h(in, in);               \
    587  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);         \
    588  const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1);  \
    589  const v2i64 temp3 = __msa_splati_d(temp2, 1);             \
    590  const v2i64 temp4 = temp3 + temp2;                        \
    591  const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4);       \
    592  const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0);      \
    593  out = __msa_copy_s_d(temp6, 0);                           \
    594 } while (0)
    595 
    596 #define STORE8x8(out, dst) do {                 \
    597  SD4(out, out, out, out, dst + 0 * BPS, BPS);  \
    598  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
    599 } while (0)
    600 
    601 static WEBP_INLINE void VerticalPred8x8(uint8_t* WEBP_RESTRICT dst,
    602                                        const uint8_t* WEBP_RESTRICT top) {
    603  if (top != NULL) {
    604    const uint64_t out = LD(top);
    605    STORE8x8(out, dst);
    606  } else {
    607    const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
    608    STORE8x8(out, dst);
    609  }
    610 }
    611 
    612 static WEBP_INLINE void HorizontalPred8x8(uint8_t* WEBP_RESTRICT dst,
    613                                          const uint8_t* WEBP_RESTRICT left) {
    614  if (left != NULL) {
    615    int j;
    616    for (j = 0; j < 8; j += 4) {
    617      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
    618      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
    619      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
    620      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
    621      const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
    622      const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
    623      const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
    624      const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
    625      SD4(out0, out1, out2, out3, dst, BPS);
    626      dst += 4 * BPS;
    627      left += 4;
    628    }
    629  } else {
    630    const uint64_t out = 0x8181818181818181ULL;
    631    STORE8x8(out, dst);
    632  }
    633 }
    634 
    635 static WEBP_INLINE void TrueMotion8x8(uint8_t* WEBP_RESTRICT dst,
    636                                      const uint8_t* WEBP_RESTRICT left,
    637                                      const uint8_t* WEBP_RESTRICT top) {
    638  if (left != NULL) {
    639    if (top != NULL) {
    640      int j;
    641      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
    642      const v16u8 T1 = LD_UB(top);
    643      const v16i8 zero = { 0 };
    644      const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
    645      const v8i16 d = T - TL;
    646      for (j = 0; j < 8; j += 4) {
    647        uint64_t out0, out1, out2, out3;
    648        v16i8 t0, t1;
    649        v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
    650        v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
    651        v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
    652        v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
    653        ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
    654        CLIP_SH4_0_255(r0, r1, r2, r3);
    655        PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
    656        out0 = __msa_copy_s_d((v2i64)t0, 0);
    657        out1 = __msa_copy_s_d((v2i64)t0, 1);
    658        out2 = __msa_copy_s_d((v2i64)t1, 0);
    659        out3 = __msa_copy_s_d((v2i64)t1, 1);
    660        SD4(out0, out1, out2, out3, dst, BPS);
    661        dst += 4 * BPS;
    662      }
    663    } else {
    664      HorizontalPred8x8(dst, left);
    665    }
    666  } else {
    667    if (top != NULL) {
    668      VerticalPred8x8(dst, top);
    669    } else {
    670      const uint64_t out = 0x8181818181818181ULL;
    671      STORE8x8(out, dst);
    672    }
    673  }
    674 }
    675 
    676 static WEBP_INLINE void DCMode8x8(uint8_t* WEBP_RESTRICT dst,
    677                                  const uint8_t* WEBP_RESTRICT left,
    678                                  const uint8_t* WEBP_RESTRICT top) {
    679  uint64_t out;
    680  v16u8 src = { 0 };
    681  if (top != NULL && left != NULL) {
    682    const uint64_t left_m = LD(left);
    683    const uint64_t top_m = LD(top);
    684    INSERT_D2_UB(left_m, top_m, src);
    685    CALC_DC8(src, out);
    686  } else if (left != NULL) {   // left but no top
    687    const uint64_t left_m = LD(left);
    688    INSERT_D2_UB(left_m, left_m, src);
    689    CALC_DC8(src, out);
    690  } else if (top != NULL) {   // top but no left
    691    const uint64_t top_m = LD(top);
    692    INSERT_D2_UB(top_m, top_m, src);
    693    CALC_DC8(src, out);
    694  } else {   // no top, no left, nothing.
    695    src = (v16u8)__msa_fill_b(0x80);
    696    out = __msa_copy_s_d((v2i64)src, 0);
    697  }
    698  STORE8x8(out, dst);
    699 }
    700 
    701 static void IntraChromaPreds_MSA(uint8_t* WEBP_RESTRICT dst,
    702                                 const uint8_t* WEBP_RESTRICT left,
    703                                 const uint8_t* WEBP_RESTRICT top) {
    704  // U block
    705  DCMode8x8(C8DC8 + dst, left, top);
    706  VerticalPred8x8(C8VE8 + dst, top);
    707  HorizontalPred8x8(C8HE8 + dst, left);
    708  TrueMotion8x8(C8TM8 + dst, left, top);
    709  // V block
    710  dst += 8;
    711  if (top != NULL) top += 8;
    712  if (left != NULL) left += 16;
    713  DCMode8x8(C8DC8 + dst, left, top);
    714  VerticalPred8x8(C8VE8 + dst, top);
    715  HorizontalPred8x8(C8HE8 + dst, left);
    716  TrueMotion8x8(C8TM8 + dst, left, top);
    717 }
    718 
    719 //------------------------------------------------------------------------------
    720 // Metric
    721 
    722 #define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
    723  v16u8 tmp0, tmp1;                                                        \
    724  v8i16 tmp2, tmp3;                                                        \
    725  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                       \
    726  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
    727  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
    728  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                       \
    729  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
    730  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
    731 } while (0)
    732 
    733 #define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
    734  v16u8 tmp0, tmp1;                                                         \
    735  v8i16 tmp2, tmp3;                                                         \
    736  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                        \
    737  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                      \
    738  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
    739  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                        \
    740  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                      \
    741  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
    742 } while (0)
    743 
    744 static int SSE16x16_MSA(const uint8_t* WEBP_RESTRICT a,
    745                        const uint8_t* WEBP_RESTRICT b) {
    746  uint32_t sum;
    747  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    748  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    749  v4i32 out0, out1, out2, out3;
    750 
    751  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
    752  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    753  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
    754  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
    755  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
    756  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
    757  a += 8 * BPS;
    758  b += 8 * BPS;
    759  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
    760  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    761  PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
    762  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
    763  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
    764  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
    765  out0 += out1;
    766  out2 += out3;
    767  out0 += out2;
    768  sum = HADD_SW_S32(out0);
    769  return sum;
    770 }
    771 
    772 static int SSE16x8_MSA(const uint8_t* WEBP_RESTRICT a,
    773                       const uint8_t* WEBP_RESTRICT b) {
    774  uint32_t sum;
    775  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    776  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    777  v4i32 out0, out1, out2, out3;
    778 
    779  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
    780  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    781  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
    782  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
    783  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
    784  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
    785  out0 += out1;
    786  out2 += out3;
    787  out0 += out2;
    788  sum = HADD_SW_S32(out0);
    789  return sum;
    790 }
    791 
    792 static int SSE8x8_MSA(const uint8_t* WEBP_RESTRICT a,
    793                      const uint8_t* WEBP_RESTRICT b) {
    794  uint32_t sum;
    795  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    796  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    797  v16u8 t0, t1, t2, t3;
    798  v4i32 out0, out1, out2, out3;
    799 
    800  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
    801  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    802  ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
    803  PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
    804  ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
    805  PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
    806  out0 += out1;
    807  out2 += out3;
    808  out0 += out2;
    809  sum = HADD_SW_S32(out0);
    810  return sum;
    811 }
    812 
    813 static int SSE4x4_MSA(const uint8_t* WEBP_RESTRICT a,
    814                      const uint8_t* WEBP_RESTRICT b) {
    815  uint32_t sum = 0;
    816  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    817  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
    818  v8i16 diff0, diff1;
    819  v4i32 out0, out1;
    820 
    821  LW4(a, BPS, src0, src1, src2, src3);
    822  LW4(b, BPS, ref0, ref1, ref2, ref3);
    823  INSERT_W4_UB(src0, src1, src2, src3, src);
    824  INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    825  ILVRL_B2_UB(src, ref, tmp0, tmp1);
    826  HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
    827  DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
    828  out0 += out1;
    829  sum = HADD_SW_S32(out0);
    830  return sum;
    831 }
    832 
    833 //------------------------------------------------------------------------------
    834 // Quantization
    835 
    836 static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
    837                             const VP8Matrix* WEBP_RESTRICT const mtx) {
    838  int sum;
    839  v8i16 in0, in1, sh0, sh1, out0, out1;
    840  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
    841  v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
    842  const v8i16 zero = { 0 };
    843  const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
    844  const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
    845  const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
    846 
    847  LD_SH2(&in[0], 8, in0, in1);
    848  LD_SH2(&mtx->sharpen[0], 8, sh0, sh1);
    849  tmp4 = __msa_add_a_h(in0, zero);
    850  tmp5 = __msa_add_a_h(in1, zero);
    851  ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
    852  ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
    853  HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
    854  sign0 = (in0 < zero);
    855  sign1 = (in1 < zero);                           // sign
    856  LD_SH2(&mtx->iq[0], 8, tmp0, tmp1);             // iq
    857  ILVRL_H2_SW(zero, tmp0, t0, t1);
    858  ILVRL_H2_SW(zero, tmp1, t2, t3);
    859  LD_SW4(&mtx->bias[0], 4, b0, b1, b2, b3);       // bias
    860  MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
    861  ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
    862  SRAI_W4_SW(b0, b1, b2, b3, 17);
    863  PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
    864  tmp0 = (tmp2 > maxlevel);
    865  tmp1 = (tmp3 > maxlevel);
    866  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
    867  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
    868  SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
    869  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
    870  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
    871  LD_SW4(&mtx->zthresh[0], 4, t0, t1, t2, t3);    // zthresh
    872  t0 = (s0 > t0);
    873  t1 = (s1 > t1);
    874  t2 = (s2 > t2);
    875  t3 = (s3 > t3);
    876  PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
    877  tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
    878  tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
    879  LD_SH2(&mtx->q[0], 8, tmp0, tmp1);
    880  MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
    881  VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
    882  ST_SH2(in0, in1, &in[0], 8);
    883  ST_SH2(out0, out1, &out[0], 8);
    884  out0 = __msa_add_a_h(out0, out1);
    885  sum = HADD_SH_S32(out0);
    886  return (sum > 0);
    887 }
    888 
    889 static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
    890                               const VP8Matrix* WEBP_RESTRICT const mtx) {
    891  int nz;
    892  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
    893  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
    894  return nz;
    895 }
    896 
    897 //------------------------------------------------------------------------------
    898 // Entry point
    899 
    900 extern void VP8EncDspInitMSA(void);
    901 
    902 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
    903  VP8ITransform = ITransform_MSA;
    904  VP8FTransform = FTransform_MSA;
    905  VP8FTransformWHT = FTransformWHT_MSA;
    906 
    907  VP8TDisto4x4 = Disto4x4_MSA;
    908  VP8TDisto16x16 = Disto16x16_MSA;
    909  VP8CollectHistogram = CollectHistogram_MSA;
    910 
    911  VP8EncPredLuma4 = Intra4Preds_MSA;
    912  VP8EncPredLuma16 = Intra16Preds_MSA;
    913  VP8EncPredChroma8 = IntraChromaPreds_MSA;
    914 
    915  VP8SSE16x16 = SSE16x16_MSA;
    916  VP8SSE16x8 = SSE16x8_MSA;
    917  VP8SSE8x8 = SSE8x8_MSA;
    918  VP8SSE4x4 = SSE4x4_MSA;
    919 
    920  VP8EncQuantizeBlock = QuantizeBlock_MSA;
    921  VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
    922  VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
    923 }
    924 
    925 #else  // !WEBP_USE_MSA
    926 
    927 WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
    928 
    929 #endif  // WEBP_USE_MSA