tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

upsampling_msa.c (29529B)


      1 // Copyright 2016 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MSA version of YUV to RGB upsampling functions.
     11 //
     12 // Author: Prashant Patil (prashant.patil@imgtec.com)
     13 
     14 #include <string.h>
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_MSA)
     18 
     19 #include "src/dsp/msa_macro.h"
     20 #include "src/dsp/yuv.h"
     21 
     22 #ifdef FANCY_UPSAMPLING
     23 
     24 #define ILVR_UW2(in, out0, out1) do {                            \
     25  const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in);  \
     26  out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0);                   \
     27  out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0);                   \
     28 } while (0)
     29 
     30 #define ILVRL_UW4(in, out0, out1, out2, out3) do {  \
     31  v16u8 t0, t1;                                     \
     32  ILVRL_B2_UB(zero, in, t0, t1);                    \
     33  ILVRL_H2_UW(zero, t0, out0, out1);                \
     34  ILVRL_H2_UW(zero, t1, out2, out3);                \
     35 } while (0)
     36 
     37 #define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do {   \
     38  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256);        \
     39  v4u32 temp0, temp1, temp2, temp3;                            \
     40  MUL4(in0, const0, in1, const0, in2, const0, in3, const0,     \
     41       temp0, temp1, temp2, temp3);                            \
     42  PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1);         \
     43 } while (0)
     44 
     45 #define MULTHI_8(in0, in1, cnst, out0) do {                 \
     46  const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256);     \
     47  v4u32 temp0, temp1;                                       \
     48  MUL2(in0, const0, in1, const0, temp0, temp1);             \
     49  out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0);  \
     50 } while (0)
     51 
     52 #define CALC_R16(y0, y1, v0, v1, dst) do {                \
     53  const v8i16 const_a = (v8i16)__msa_fill_h(14234);       \
     54  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0);  \
     55  const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1);  \
     56  v8i16 b0 = __msa_subs_s_h(a0, const_a);                 \
     57  v8i16 b1 = __msa_subs_s_h(a1, const_a);                 \
     58  SRAI_H2_SH(b0, b1, 6);                                  \
     59  CLIP_SH2_0_255(b0, b1);                                 \
     60  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0);       \
     61 } while (0)
     62 
     63 #define CALC_R8(y0, v0, dst) do {                         \
     64  const v8i16 const_a = (v8i16)__msa_fill_h(14234);       \
     65  const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0);  \
     66  v8i16 b0 = __msa_subs_s_h(a0, const_a);                 \
     67  b0 = SRAI_H(b0, 6);                                     \
     68  CLIP_SH_0_255(b0);                                      \
     69  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0);       \
     70 } while (0)
     71 
     72 #define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do {   \
     73  const v8i16 const_a = (v8i16)__msa_fill_h(8708);   \
     74  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0);   \
     75  v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1);   \
     76  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0);    \
     77  const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1);    \
     78  a0 = __msa_adds_s_h(b0, const_a);                  \
     79  a1 = __msa_adds_s_h(b1, const_a);                  \
     80  SRAI_H2_SH(a0, a1, 6);                             \
     81  CLIP_SH2_0_255(a0, a1);                            \
     82  dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);  \
     83 } while (0)
     84 
     85 #define CALC_G8(y0, u0, v0, dst) do {                \
     86  const v8i16 const_a = (v8i16)__msa_fill_h(8708);   \
     87  v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0);   \
     88  const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0);    \
     89  a0 = __msa_adds_s_h(b0, const_a);                  \
     90  a0 = SRAI_H(a0, 6);                                \
     91  CLIP_SH_0_255(a0);                                 \
     92  dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0);  \
     93 } while (0)
     94 
     95 #define CALC_B16(y0, y1, u0, u1, dst) do {           \
     96  const v8u16 const_a = (v8u16)__msa_fill_h(17685);  \
     97  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0);    \
     98  const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1);    \
     99  v8u16 b0 = __msa_subs_u_h(a0, const_a);            \
    100  v8u16 b1 = __msa_subs_u_h(a1, const_a);            \
    101  SRAI_H2_UH(b0, b1, 6);                             \
    102  CLIP_UH2_0_255(b0, b1);                            \
    103  dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0);  \
    104 } while (0)
    105 
    106 #define CALC_B8(y0, u0, dst) do {                    \
    107  const v8u16 const_a = (v8u16)__msa_fill_h(17685);  \
    108  const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0);    \
    109  v8u16 b0 = __msa_subs_u_h(a0, const_a);            \
    110  b0 = SRAI_H(b0, 6);                                \
    111  CLIP_UH_0_255(b0);                                 \
    112  dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0);  \
    113 } while (0)
    114 
    115 #define CALC_RGB16(y, u, v, R, G, B) do {    \
    116  const v16u8 zero = { 0 };                  \
    117  v8u16 y0, y1, u0, u1, v0, v1;              \
    118  v4u32 p0, p1, p2, p3;                      \
    119  const v16u8 in_y = LD_UB(y);               \
    120  const v16u8 in_u = LD_UB(u);               \
    121  const v16u8 in_v = LD_UB(v);               \
    122  ILVRL_UW4(in_y, p0, p1, p2, p3);           \
    123  MULTHI_16(p0, p1, p2, p3, 19077, y0, y1);  \
    124  ILVRL_UW4(in_v, p0, p1, p2, p3);           \
    125  MULTHI_16(p0, p1, p2, p3, 26149, v0, v1);  \
    126  CALC_R16(y0, y1, v0, v1, R);               \
    127  MULTHI_16(p0, p1, p2, p3, 13320, v0, v1);  \
    128  ILVRL_UW4(in_u, p0, p1, p2, p3);           \
    129  MULTHI_16(p0, p1, p2, p3, 6419, u0, u1);   \
    130  CALC_G16(y0, y1, u0, u1, v0, v1, G);       \
    131  MULTHI_16(p0, p1, p2, p3, 33050, u0, u1);  \
    132  CALC_B16(y0, y1, u0, u1, B);               \
    133 } while (0)
    134 
    135 #define CALC_RGB8(y, u, v, R, G, B) do {  \
    136  const v16u8 zero = { 0 };               \
    137  v8u16 y0, u0, v0;                       \
    138  v4u32 p0, p1;                           \
    139  const v16u8 in_y = LD_UB(y);            \
    140  const v16u8 in_u = LD_UB(u);            \
    141  const v16u8 in_v = LD_UB(v);            \
    142  ILVR_UW2(in_y, p0, p1);                 \
    143  MULTHI_8(p0, p1, 19077, y0);            \
    144  ILVR_UW2(in_v, p0, p1);                 \
    145  MULTHI_8(p0, p1, 26149, v0);            \
    146  CALC_R8(y0, v0, R);                     \
    147  MULTHI_8(p0, p1, 13320, v0);            \
    148  ILVR_UW2(in_u, p0, p1);                 \
    149  MULTHI_8(p0, p1, 6419, u0);             \
    150  CALC_G8(y0, u0, v0, G);                 \
    151  MULTHI_8(p0, p1, 33050, u0);            \
    152  CALC_B8(y0, u0, B);                     \
    153 } while (0)
    154 
    155 #define STORE16_3(a0, a1, a2, dst) do {                          \
    156  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,  \
    157                        8, 9, 20, 10 };                          \
    158  const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7,  \
    159                        8, 25, 9, 10 };                          \
    160  const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7,  \
    161                        30, 8, 9, 31 };                          \
    162  v16u8 out0, out1, out2, tmp0, tmp1, tmp2;                      \
    163  ILVRL_B2_UB(a1, a0, tmp0, tmp1);                               \
    164  out0 = VSHF_UB(tmp0, a2, mask0);                               \
    165  tmp2 = SLDI_UB(tmp1, tmp0, 11);                                \
    166  out1 = VSHF_UB(tmp2, a2, mask1);                               \
    167  tmp2 = SLDI_UB(tmp1, tmp1, 6);                                 \
    168  out2 = VSHF_UB(tmp2, a2, mask2);                               \
    169  ST_UB(out0, dst +  0);                                         \
    170  ST_UB(out1, dst + 16);                                         \
    171  ST_UB(out2, dst + 32);                                         \
    172 } while (0)
    173 
    174 #define STORE8_3(a0, a1, a2, dst) do {                             \
    175  int64_t out_m;                                                   \
    176  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,    \
    177                        8, 9, 20, 10 };                            \
    178  const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23,            \
    179                        255, 255, 255, 255, 255, 255, 255, 255 };  \
    180  const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);    \
    181  v16u8 out0, out1;                                                \
    182  VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1);        \
    183  ST_UB(out0, dst);                                                \
    184  out_m = __msa_copy_s_d((v2i64)out1, 0);                          \
    185  SD(out_m, dst + 16);                                             \
    186 } while (0)
    187 
    188 #define STORE16_4(a0, a1, a2, a3, dst) do {  \
    189  v16u8 tmp0, tmp1, tmp2, tmp3;              \
    190  v16u8 out0, out1, out2, out3;              \
    191  ILVRL_B2_UB(a1, a0, tmp0, tmp1);           \
    192  ILVRL_B2_UB(a3, a2, tmp2, tmp3);           \
    193  ILVRL_H2_UB(tmp2, tmp0, out0, out1);       \
    194  ILVRL_H2_UB(tmp3, tmp1, out2, out3);       \
    195  ST_UB(out0, dst +  0);                     \
    196  ST_UB(out1, dst + 16);                     \
    197  ST_UB(out2, dst + 32);                     \
    198  ST_UB(out3, dst + 48);                     \
    199 } while (0)
    200 
    201 #define STORE8_4(a0, a1, a2, a3, dst) do {  \
    202  v16u8 tmp0, tmp1, tmp2, tmp3;             \
    203  ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1);   \
    204  ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3);      \
    205  ST_UB(tmp2, dst +  0);                    \
    206  ST_UB(tmp3, dst + 16);                    \
    207 } while (0)
    208 
    209 #define STORE2_16(a0, a1, dst) do {  \
    210  v16u8 out0, out1;                  \
    211  ILVRL_B2_UB(a1, a0, out0, out1);   \
    212  ST_UB(out0, dst +  0);             \
    213  ST_UB(out1, dst + 16);             \
    214 } while (0)
    215 
    216 #define STORE2_8(a0, a1, dst) do {                               \
    217  const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);  \
    218  ST_UB(out0, dst);                                              \
    219 } while (0)
    220 
    221 #define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do {  \
    222  CALC_RGB##N(y, u, v, R, G, B);                         \
    223  tmp0 = ANDI_B(R, 0xf0);                                \
    224  tmp1 = SRAI_B(G, 4);                                   \
    225  RG = tmp0 | tmp1;                                      \
    226  tmp0 = ANDI_B(B, 0xf0);                                \
    227  BA = ORI_B(tmp0, 0x0f);                                \
    228  STORE2_##N(out0, out1, dst);                           \
    229 } while (0)
    230 
    231 #define CALC_RGB565(y, u, v, out0, out1, N, dst) do {  \
    232  CALC_RGB##N(y, u, v, R, G, B);                       \
    233  tmp0 = ANDI_B(R, 0xf8);                              \
    234  tmp1 = SRAI_B(G, 5);                                 \
    235  RG = tmp0 | tmp1;                                    \
    236  tmp0 = SLLI_B(G, 3);                                 \
    237  tmp1 = ANDI_B(tmp0, 0xe0);                           \
    238  tmp0 = SRAI_B(B, 3);                                 \
    239  GB = tmp0 | tmp1;                                    \
    240  STORE2_##N(out0, out1, dst);                         \
    241 } while (0)
    242 
    243 static WEBP_INLINE int Clip8(int v) {
    244  return v < 0 ? 0 : v > 255 ? 255 : v;
    245 }
    246 
    247 static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
    248  const int y1 = MultHi(y, 19077);
    249  const int r1 = y1 + MultHi(v, 26149) - 14234;
    250  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
    251  const int b1 = y1 + MultHi(u, 33050) - 17685;
    252  rgb[0] = Clip8(r1 >> 6);
    253  rgb[1] = Clip8(g1 >> 6);
    254  rgb[2] = Clip8(b1 >> 6);
    255 }
    256 
    257 static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
    258  const int y1 = MultHi(y, 19077);
    259  const int r1 = y1 + MultHi(v, 26149) - 14234;
    260  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
    261  const int b1 = y1 + MultHi(u, 33050) - 17685;
    262  bgr[0] = Clip8(b1 >> 6);
    263  bgr[1] = Clip8(g1 >> 6);
    264  bgr[2] = Clip8(r1 >> 6);
    265 }
    266 
    267 #if !defined(WEBP_REDUCE_CSP)
    268 static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
    269  const int y1 = MultHi(y, 19077);
    270  const int r1 = y1 + MultHi(v, 26149) - 14234;
    271  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
    272  const int b1 = y1 + MultHi(u, 33050) - 17685;
    273  const int r = Clip8(r1 >> 6);
    274  const int g = Clip8(g1 >> 6);
    275  const int b = Clip8(b1 >> 6);
    276  const int rg = (r & 0xf8) | (g >> 5);
    277  const int gb = ((g << 3) & 0xe0) | (b >> 3);
    278 #if (WEBP_SWAP_16BIT_CSP == 1)
    279  rgb[0] = gb;
    280  rgb[1] = rg;
    281 #else
    282  rgb[0] = rg;
    283  rgb[1] = gb;
    284 #endif
    285 }
    286 
    287 static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
    288  const int y1 = MultHi(y, 19077);
    289  const int r1 = y1 + MultHi(v, 26149) - 14234;
    290  const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
    291  const int b1 = y1 + MultHi(u, 33050) - 17685;
    292  const int r = Clip8(r1 >> 6);
    293  const int g = Clip8(g1 >> 6);
    294  const int b = Clip8(b1 >> 6);
    295  const int rg = (r & 0xf0) | (g >> 4);
    296  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
    297 #if (WEBP_SWAP_16BIT_CSP == 1)
    298  argb[0] = ba;
    299  argb[1] = rg;
    300 #else
    301  argb[0] = rg;
    302  argb[1] = ba;
    303 #endif
    304 }
    305 
    306 static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
    307  argb[0] = 0xff;
    308  YuvToRgb(y, u, v, argb + 1);
    309 }
    310 #endif  // WEBP_REDUCE_CSP
    311 
    312 static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
    313  YuvToBgr(y, u, v, bgra);
    314  bgra[3] = 0xff;
    315 }
    316 
    317 static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
    318  YuvToRgb(y, u, v, rgba);
    319  rgba[3] = 0xff;
    320 }
    321 
    322 #if !defined(WEBP_REDUCE_CSP)
    323 static void YuvToRgbLine(const uint8_t* WEBP_RESTRICT y,
    324                         const uint8_t* WEBP_RESTRICT u,
    325                         const uint8_t* WEBP_RESTRICT v,
    326                         uint8_t* WEBP_RESTRICT dst, int length) {
    327  v16u8 R, G, B;
    328  while (length >= 16) {
    329    CALC_RGB16(y, u, v, R, G, B);
    330    STORE16_3(R, G, B, dst);
    331    y      += 16;
    332    u      += 16;
    333    v      += 16;
    334    dst    += 16 * 3;
    335    length -= 16;
    336  }
    337  if (length > 8) {
    338    uint8_t temp[3 * 16] = { 0 };
    339    memcpy(temp, y, length * sizeof(*temp));
    340    CALC_RGB16(temp, u, v, R, G, B);
    341    STORE16_3(R, G, B, temp);
    342    memcpy(dst, temp, length * 3 * sizeof(*dst));
    343  } else if (length > 0) {
    344    uint8_t temp[3 * 8] = { 0 };
    345    memcpy(temp, y, length * sizeof(*temp));
    346    CALC_RGB8(temp, u, v, R, G, B);
    347    STORE8_3(R, G, B, temp);
    348    memcpy(dst, temp, length * 3 * sizeof(*dst));
    349  }
    350 }
    351 
    352 static void YuvToBgrLine(const uint8_t* WEBP_RESTRICT y,
    353                         const uint8_t* WEBP_RESTRICT u,
    354                         const uint8_t* WEBP_RESTRICT v,
    355                         uint8_t* WEBP_RESTRICT dst, int length) {
    356  v16u8 R, G, B;
    357  while (length >= 16) {
    358    CALC_RGB16(y, u, v, R, G, B);
    359    STORE16_3(B, G, R, dst);
    360    y      += 16;
    361    u      += 16;
    362    v      += 16;
    363    dst    += 16 * 3;
    364    length -= 16;
    365  }
    366  if (length > 8) {
    367    uint8_t temp[3 * 16] = { 0 };
    368    memcpy(temp, y, length * sizeof(*temp));
    369    CALC_RGB16(temp, u, v, R, G, B);
    370    STORE16_3(B, G, R, temp);
    371    memcpy(dst, temp, length * 3 * sizeof(*dst));
    372  } else if (length > 0) {
    373    uint8_t temp[3 * 8] = { 0 };
    374    memcpy(temp, y, length * sizeof(*temp));
    375    CALC_RGB8(temp, u, v, R, G, B);
    376    STORE8_3(B, G, R, temp);
    377    memcpy(dst, temp, length * 3 * sizeof(*dst));
    378  }
    379 }
    380 #endif  // WEBP_REDUCE_CSP
    381 
    382 static void YuvToRgbaLine(const uint8_t* WEBP_RESTRICT y,
    383                          const uint8_t* WEBP_RESTRICT u,
    384                          const uint8_t* WEBP_RESTRICT v,
    385                          uint8_t* WEBP_RESTRICT dst, int length) {
    386  v16u8 R, G, B;
    387  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
    388  while (length >= 16) {
    389    CALC_RGB16(y, u, v, R, G, B);
    390    STORE16_4(R, G, B, A, dst);
    391    y      += 16;
    392    u      += 16;
    393    v      += 16;
    394    dst    += 16 * 4;
    395    length -= 16;
    396  }
    397  if (length > 8) {
    398    uint8_t temp[4 * 16] = { 0 };
    399    memcpy(temp, y, length * sizeof(*temp));
    400    CALC_RGB16(&temp[0], u, v, R, G, B);
    401    STORE16_4(R, G, B, A, temp);
    402    memcpy(dst, temp, length * 4 * sizeof(*dst));
    403  } else if (length > 0) {
    404    uint8_t temp[4 * 8] = { 0 };
    405    memcpy(temp, y, length * sizeof(*temp));
    406    CALC_RGB8(temp, u, v, R, G, B);
    407    STORE8_4(R, G, B, A, temp);
    408    memcpy(dst, temp, length * 4 * sizeof(*dst));
    409  }
    410 }
    411 
    412 static void YuvToBgraLine(const uint8_t* WEBP_RESTRICT y,
    413                          const uint8_t* WEBP_RESTRICT u,
    414                          const uint8_t* WEBP_RESTRICT v,
    415                          uint8_t* WEBP_RESTRICT dst, int length) {
    416  v16u8 R, G, B;
    417  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
    418  while (length >= 16) {
    419    CALC_RGB16(y, u, v, R, G, B);
    420    STORE16_4(B, G, R, A, dst);
    421    y      += 16;
    422    u      += 16;
    423    v      += 16;
    424    dst    += 16 * 4;
    425    length -= 16;
    426  }
    427  if (length > 8) {
    428    uint8_t temp[4 * 16] = { 0 };
    429    memcpy(temp, y, length * sizeof(*temp));
    430    CALC_RGB16(temp, u, v, R, G, B);
    431    STORE16_4(B, G, R, A, temp);
    432    memcpy(dst, temp, length * 4 * sizeof(*dst));
    433  } else if (length > 0) {
    434    uint8_t temp[4 * 8] = { 0 };
    435    memcpy(temp, y, length * sizeof(*temp));
    436    CALC_RGB8(temp, u, v, R, G, B);
    437    STORE8_4(B, G, R, A, temp);
    438    memcpy(dst, temp, length * 4 * sizeof(*dst));
    439  }
    440 }
    441 
    442 #if !defined(WEBP_REDUCE_CSP)
    443 static void YuvToArgbLine(const uint8_t* WEBP_RESTRICT y,
    444                          const uint8_t* WEBP_RESTRICT u,
    445                          const uint8_t* WEBP_RESTRICT v,
    446                          uint8_t* WEBP_RESTRICT dst, int length) {
    447  v16u8 R, G, B;
    448  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
    449  while (length >= 16) {
    450    CALC_RGB16(y, u, v, R, G, B);
    451    STORE16_4(A, R, G, B, dst);
    452    y      += 16;
    453    u      += 16;
    454    v      += 16;
    455    dst    += 16 * 4;
    456    length -= 16;
    457  }
    458  if (length > 8) {
    459    uint8_t temp[4 * 16] = { 0 };
    460    memcpy(temp, y, length * sizeof(*temp));
    461    CALC_RGB16(temp, u, v, R, G, B);
    462    STORE16_4(A, R, G, B, temp);
    463    memcpy(dst, temp, length * 4 * sizeof(*dst));
    464  } else if (length > 0) {
    465    uint8_t temp[4 * 8] = { 0 };
    466    memcpy(temp, y, length * sizeof(*temp));
    467    CALC_RGB8(temp, u, v, R, G, B);
    468    STORE8_4(A, R, G, B, temp);
    469    memcpy(dst, temp, length * 4 * sizeof(*dst));
    470  }
    471 }
    472 
    473 static void YuvToRgba4444Line(const uint8_t* WEBP_RESTRICT y,
    474                              const uint8_t* WEBP_RESTRICT u,
    475                              const uint8_t* WEBP_RESTRICT v,
    476                              uint8_t* WEBP_RESTRICT dst, int length) {
    477  v16u8 R, G, B, RG, BA, tmp0, tmp1;
    478  while (length >= 16) {
    479 #if (WEBP_SWAP_16BIT_CSP == 1)
    480    CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
    481 #else
    482    CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
    483 #endif
    484    y      += 16;
    485    u      += 16;
    486    v      += 16;
    487    dst    += 16 * 2;
    488    length -= 16;
    489  }
    490  if (length > 8) {
    491    uint8_t temp[2 * 16] = { 0 };
    492    memcpy(temp, y, length * sizeof(*temp));
    493 #if (WEBP_SWAP_16BIT_CSP == 1)
    494    CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
    495 #else
    496    CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
    497 #endif
    498    memcpy(dst, temp, length * 2 * sizeof(*dst));
    499  } else if (length > 0) {
    500    uint8_t temp[2 * 8] = { 0 };
    501    memcpy(temp, y, length * sizeof(*temp));
    502 #if (WEBP_SWAP_16BIT_CSP == 1)
    503    CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
    504 #else
    505    CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
    506 #endif
    507    memcpy(dst, temp, length * 2 * sizeof(*dst));
    508  }
    509 }
    510 
    511 static void YuvToRgb565Line(const uint8_t* WEBP_RESTRICT y,
    512                            const uint8_t* WEBP_RESTRICT u,
    513                            const uint8_t* WEBP_RESTRICT v,
    514                            uint8_t* WEBP_RESTRICT dst, int length) {
    515  v16u8 R, G, B, RG, GB, tmp0, tmp1;
    516  while (length >= 16) {
    517 #if (WEBP_SWAP_16BIT_CSP == 1)
    518    CALC_RGB565(y, u, v, GB, RG, 16, dst);
    519 #else
    520    CALC_RGB565(y, u, v, RG, GB, 16, dst);
    521 #endif
    522    y      += 16;
    523    u      += 16;
    524    v      += 16;
    525    dst    += 16 * 2;
    526    length -= 16;
    527  }
    528  if (length > 8) {
    529    uint8_t temp[2 * 16] = { 0 };
    530    memcpy(temp, y, length * sizeof(*temp));
    531 #if (WEBP_SWAP_16BIT_CSP == 1)
    532    CALC_RGB565(temp, u, v, GB, RG, 16, temp);
    533 #else
    534    CALC_RGB565(temp, u, v, RG, GB, 16, temp);
    535 #endif
    536    memcpy(dst, temp, length * 2 * sizeof(*dst));
    537  } else if (length > 0) {
    538    uint8_t temp[2 * 8] = { 0 };
    539    memcpy(temp, y, length * sizeof(*temp));
    540 #if (WEBP_SWAP_16BIT_CSP == 1)
    541    CALC_RGB565(temp, u, v, GB, RG, 8, temp);
    542 #else
    543    CALC_RGB565(temp, u, v, RG, GB, 8, temp);
    544 #endif
    545    memcpy(dst, temp, length * 2 * sizeof(*dst));
    546  }
    547 }
    548 #endif  // WEBP_REDUCE_CSP
    549 
    550 #define UPSAMPLE_32PIXELS(a, b, c, d) do {    \
    551  v16u8 s = __msa_aver_u_b(a, d);             \
    552  v16u8 t = __msa_aver_u_b(b, c);             \
    553  const v16u8 st = s ^ t;                     \
    554  v16u8 ad = a ^ d;                           \
    555  v16u8 bc = b ^ c;                           \
    556  v16u8 t0 = ad | bc;                         \
    557  v16u8 t1 = t0 | st;                         \
    558  v16u8 t2 = ANDI_B(t1, 1);                   \
    559  v16u8 t3 = __msa_aver_u_b(s, t);            \
    560  const v16u8 k = t3 - t2;                    \
    561  v16u8 diag1, diag2;                         \
    562  AVER_UB2_UB(t, k, s, k, t0, t1);            \
    563  bc = bc & st;                               \
    564  ad = ad & st;                               \
    565  t = t ^ k;                                  \
    566  s = s ^ k;                                  \
    567  t2 = bc | t;                                \
    568  t3 = ad | s;                                \
    569  t2 = ANDI_B(t2, 1);                         \
    570  t3 = ANDI_B(t3, 1);                         \
    571  SUB2(t0, t2, t1, t3, diag1, diag2);         \
    572  AVER_UB2_UB(a, diag1, b, diag2, t0, t1);    \
    573  ILVRL_B2_UB(t1, t0, a, b);                  \
    574  if (pbot_y != NULL) {                       \
    575    AVER_UB2_UB(c, diag2, d, diag1, t0, t1);  \
    576    ILVRL_B2_UB(t1, t0, c, d);                \
    577  }                                           \
    578 } while (0)
    579 
    580 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
    581 static void FUNC_NAME(const uint8_t* WEBP_RESTRICT top_y,                \
    582                      const uint8_t* WEBP_RESTRICT bot_y,                \
    583                      const uint8_t* WEBP_RESTRICT top_u,                \
    584                      const uint8_t* WEBP_RESTRICT top_v,                \
    585                      const uint8_t* WEBP_RESTRICT cur_u,                \
    586                      const uint8_t* WEBP_RESTRICT cur_v,                \
    587                      uint8_t* WEBP_RESTRICT top_dst,                    \
    588                      uint8_t* WEBP_RESTRICT bot_dst, int len) {         \
    589  int size = (len - 1) >> 1;                                             \
    590  uint8_t temp_u[64];                                                    \
    591  uint8_t temp_v[64];                                                    \
    592  const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16));              \
    593  const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16));               \
    594  const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;            \
    595  const uint8_t* ptop_y = &top_y[1];                                     \
    596  uint8_t* ptop_dst = top_dst + XSTEP;                                   \
    597  const uint8_t* pbot_y = &bot_y[1];                                     \
    598  uint8_t* pbot_dst = bot_dst + XSTEP;                                   \
    599                                                                         \
    600  FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                      \
    601  if (bot_y != NULL) {                                                   \
    602    const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;          \
    603    FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst);                    \
    604  }                                                                      \
    605  while (size >= 16) {                                                   \
    606    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
    607    LD_UB2(top_u, 1, tu0, tu1);                                          \
    608    LD_UB2(cur_u, 1, cu0, cu1);                                          \
    609    LD_UB2(top_v, 1, tv0, tv1);                                          \
    610    LD_UB2(cur_v, 1, cv0, cv1);                                          \
    611    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
    612    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
    613    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
    614    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
    615    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32);           \
    616    if (bot_y != NULL) {                                                 \
    617      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32);        \
    618    }                                                                    \
    619    ptop_y   += 32;                                                      \
    620    pbot_y   += 32;                                                      \
    621    ptop_dst += XSTEP * 32;                                              \
    622    pbot_dst += XSTEP * 32;                                              \
    623    top_u    += 16;                                                      \
    624    top_v    += 16;                                                      \
    625    cur_u    += 16;                                                      \
    626    cur_v    += 16;                                                      \
    627    size     -= 16;                                                      \
    628  }                                                                      \
    629  if (size > 0) {                                                        \
    630    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
    631    memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t));                    \
    632    memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t));                    \
    633    memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t));                    \
    634    memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t));                    \
    635    LD_UB2(&temp_u[ 0], 1, tu0, tu1);                                    \
    636    LD_UB2(&temp_u[32], 1, cu0, cu1);                                    \
    637    LD_UB2(&temp_v[ 0], 1, tv0, tv1);                                    \
    638    LD_UB2(&temp_v[32], 1, cv0, cv1);                                    \
    639    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
    640    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
    641    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
    642    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
    643    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2);     \
    644    if (bot_y != NULL) {                                                 \
    645      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2);  \
    646    }                                                                    \
    647    top_u += size;                                                       \
    648    top_v += size;                                                       \
    649    cur_u += size;                                                       \
    650    cur_v += size;                                                       \
    651  }                                                                      \
    652  if (!(len & 1)) {                                                      \
    653    const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16));               \
    654    const uint32_t c0  = ((cur_u[0]) | ((cur_v[0]) << 16));              \
    655    const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2;              \
    656    FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16),                      \
    657                top_dst + (len - 1) * XSTEP);                            \
    658    if (bot_y != NULL) {                                                 \
    659      const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2;            \
    660      FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16),                    \
    661           bot_dst + (len - 1) * XSTEP);                                 \
    662    }                                                                    \
    663  }                                                                      \
    664 }
    665 
    666 UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
    667 UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
    668 #if !defined(WEBP_REDUCE_CSP)
    669 UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
    670 UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
    671 UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
    672 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
    673 UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
    674 #endif   // WEBP_REDUCE_CSP
    675 
    676 //------------------------------------------------------------------------------
    677 // Entry point
    678 
    679 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
    680 
    681 extern void WebPInitUpsamplersMSA(void);
    682 
    683 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
    684  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
    685  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
    686  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
    687  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
    688 #if !defined(WEBP_REDUCE_CSP)
    689  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
    690  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
    691  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
    692  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
    693  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
    694  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
    695  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
    696 #endif   // WEBP_REDUCE_CSP
    697 }
    698 
    699 #endif  // FANCY_UPSAMPLING
    700 
    701 #endif  // WEBP_USE_MSA
    702 
    703 #if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
    704 WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
    705 #endif