tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dec_mips32.c (26925B)


      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MIPS version of dsp functions
     11 //
     12 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
     13 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
     14 
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_MIPS32)
     18 
     19 #include "src/dsp/mips_macro.h"
     20 
// Multiplier constants handed to TransformOne's inline assembly as the
// register operands %[kC1] and %[kC2]. %[kC2] is used directly there (each
// `mul` by it is followed by an `sra ..., 16`). %[kC1] is not named in the
// assembly text itself, so it is presumably consumed by the MUL_SHIFT_C1*
// macros from mips_macro.h -- TODO confirm.
static const int kC1 = WEBP_TRANSFORM_AC3_C1;
static const int kC2 = WEBP_TRANSFORM_AC3_C2;
     23 
     24 static WEBP_INLINE int abs_mips32(int x) {
     25  const int sign = x >> 31;
     26  return (x ^ sign) - sign;
     27 }
     28 
     29 // 4 pixels in, 2 pixels out
     30 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
     31  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
     32  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
     33  const int a1 = VP8ksclip2[(a + 4) >> 3];
     34  const int a2 = VP8ksclip2[(a + 3) >> 3];
     35  p[-step] = VP8kclip1[p0 + a2];
     36  p[    0] = VP8kclip1[q0 - a1];
     37 }
     38 
     39 // 4 pixels in, 4 pixels out
     40 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
     41  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
     42  const int a = 3 * (q0 - p0);
     43  const int a1 = VP8ksclip2[(a + 4) >> 3];
     44  const int a2 = VP8ksclip2[(a + 3) >> 3];
     45  const int a3 = (a1 + 1) >> 1;
     46  p[-2 * step] = VP8kclip1[p1 + a3];
     47  p[-    step] = VP8kclip1[p0 + a2];
     48  p[        0] = VP8kclip1[q0 - a1];
     49  p[     step] = VP8kclip1[q1 - a3];
     50 }
     51 
     52 // 6 pixels in, 6 pixels out
     53 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
     54  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
     55  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
     56  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
     57  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
     58  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
     59  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
     60  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
     61  p[-3 * step] = VP8kclip1[p2 + a3];
     62  p[-2 * step] = VP8kclip1[p1 + a2];
     63  p[-    step] = VP8kclip1[p0 + a1];
     64  p[        0] = VP8kclip1[q0 - a1];
     65  p[     step] = VP8kclip1[q1 - a2];
     66  p[ 2 * step] = VP8kclip1[q2 - a3];
     67 }
     68 
     69 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
     70  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
     71  return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
     72 }
     73 
     74 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
     75  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
     76  return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
     77 }
     78 
     79 static WEBP_INLINE int needs_filter2(const uint8_t* p,
     80                                     int step, int t, int it) {
     81  const int p3 = p[-4 * step], p2 = p[-3 * step];
     82  const int p1 = p[-2 * step], p0 = p[-step];
     83  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
     84  if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
     85    return 0;
     86  }
     87  return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
     88         abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
     89         abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
     90 }
     91 
     92 static WEBP_INLINE void FilterLoop26(uint8_t* p,
     93                                     int hstride, int vstride, int size,
     94                                     int thresh, int ithresh, int hev_thresh) {
     95  const int thresh2 = 2 * thresh + 1;
     96  while (size-- > 0) {
     97    if (needs_filter2(p, hstride, thresh2, ithresh)) {
     98      if (hev(p, hstride, hev_thresh)) {
     99        do_filter2(p, hstride);
    100      } else {
    101        do_filter6(p, hstride);
    102      }
    103    }
    104    p += vstride;
    105  }
    106 }
    107 
    108 static WEBP_INLINE void FilterLoop24(uint8_t* p,
    109                                     int hstride, int vstride, int size,
    110                                     int thresh, int ithresh, int hev_thresh) {
    111  const int thresh2 = 2 * thresh + 1;
    112  while (size-- > 0) {
    113    if (needs_filter2(p, hstride, thresh2, ithresh)) {
    114      if (hev(p, hstride, hev_thresh)) {
    115        do_filter2(p, hstride);
    116      } else {
    117        do_filter4(p, hstride);
    118      }
    119    }
    120    p += vstride;
    121  }
    122 }
    123 
    124 // on macroblock edges
    125 static void VFilter16(uint8_t* p, int stride,
    126                      int thresh, int ithresh, int hev_thresh) {
    127  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
    128 }
    129 
    130 static void HFilter16(uint8_t* p, int stride,
    131                      int thresh, int ithresh, int hev_thresh) {
    132  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
    133 }
    134 
    135 // 8-pixels wide variant, for chroma filtering
    136 static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    137                     int stride, int thresh, int ithresh, int hev_thresh) {
    138  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
    139  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
    140 }
    141 
    142 static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    143                     int stride, int thresh, int ithresh, int hev_thresh) {
    144  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
    145  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
    146 }
    147 
    148 static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    149                      int stride, int thresh, int ithresh, int hev_thresh) {
    150  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
    151  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
    152 }
    153 
    154 static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    155                      int stride, int thresh, int ithresh, int hev_thresh) {
    156  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
    157  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
    158 }
    159 
    160 // on three inner edges
    161 static void VFilter16i(uint8_t* p, int stride,
    162                       int thresh, int ithresh, int hev_thresh) {
    163  int k;
    164  for (k = 3; k > 0; --k) {
    165    p += 4 * stride;
    166    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
    167  }
    168 }
    169 
    170 static void HFilter16i(uint8_t* p, int stride,
    171                       int thresh, int ithresh, int hev_thresh) {
    172  int k;
    173  for (k = 3; k > 0; --k) {
    174    p += 4;
    175    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
    176  }
    177 }
    178 
    179 //------------------------------------------------------------------------------
    180 // Simple In-loop filtering (Paragraph 15.2)
    181 
    182 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
    183  int i;
    184  const int thresh2 = 2 * thresh + 1;
    185  for (i = 0; i < 16; ++i) {
    186    if (needs_filter(p + i, stride, thresh2)) {
    187      do_filter2(p + i, stride);
    188    }
    189  }
    190 }
    191 
    192 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
    193  int i;
    194  const int thresh2 = 2 * thresh + 1;
    195  for (i = 0; i < 16; ++i) {
    196    if (needs_filter(p + i * stride, 1, thresh2)) {
    197      do_filter2(p + i * stride, 1);
    198    }
    199  }
    200 }
    201 
    202 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
    203  int k;
    204  for (k = 3; k > 0; --k) {
    205    p += 4 * stride;
    206    SimpleVFilter16(p, stride, thresh);
    207  }
    208 }
    209 
    210 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
    211  int k;
    212  for (k = 3; k > 0; --k) {
    213    p += 4;
    214    SimpleHFilter16(p, stride, thresh);
    215  }
    216 }
    217 
// Reads the 16 int16_t values in[0..15], combines them with a fully unrolled
// sequence of add/subtract butterflies and fixed-point multiplies, and adds
// the 16 results to a 4x4 block of bytes at `dst` (rows BPS bytes apart),
// clamping every sum to [0, 255] before storing it back.
// Presumably this is the VP8 4x4 inverse transform -- TODO confirm against
// the portable C implementation.
//
// Register roles visible in the assembly:
//  - tempN for N = 0..15 is first loaded with in[N] and later reused freely.
//  - temp16, temp17 and temp18 are scratch registers.
//  - temp19 only appears as the second argument of MUL_SHIFT_C1_IO
//    (presumably a scratch register for that macro -- confirm in
//    mips_macro.h).
//  - temp6 ends up holding the constant 255 used when clamping.
static void TransformOne(const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14;
  int temp15, temp16, temp17, temp18, temp19;
  // Non-const alias of `in`, passed as the %[in] asm operand; the assembly
  // below only loads (lh) through it.
  int16_t* p_in = (int16_t*)in;

  // loops unrolled and merged to avoid usage of tmp buffer
  // and to reduce number of stalls. MUL macro is written
  // in assembler and inlined
  __asm__ volatile(
    // Load in[0], in[8], in[4], in[12] (byte offsets 0, 16, 8, 24) and start
    // combining them. Loads of the remaining inputs are interleaved with the
    // arithmetic below.
    "lh       %[temp0],  0(%[in])                      \n\t"
    "lh       %[temp8],  16(%[in])                     \n\t"
    "lh       %[temp4],  8(%[in])                      \n\t"
    "lh       %[temp12], 24(%[in])                     \n\t"
    "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
    "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
    MUL_SHIFT_C1(temp17, temp12)
    MUL_SHIFT_C1_IO(temp4, temp19)
    "mul      %[temp12], %[temp12], %[kC2]             \n\t"
    // in[1], in[5], in[9], in[13]
    "lh       %[temp1],  2(%[in])                      \n\t"
    "lh       %[temp5],  10(%[in])                     \n\t"
    "lh       %[temp9],  18(%[in])                     \n\t"
    "lh       %[temp13], 26(%[in])                     \n\t"
    "sra      %[temp8],  %[temp8],  16                 \n\t"
    "sra      %[temp12], %[temp12], 16                 \n\t"
    // in[2], in[6], in[10], in[14]
    "lh       %[temp2],  4(%[in])                      \n\t"
    "lh       %[temp6],  12(%[in])                     \n\t"
    "lh       %[temp10], 20(%[in])                     \n\t"
    "lh       %[temp14], 28(%[in])                     \n\t"
    "subu     %[temp17], %[temp8],  %[temp17]          \n\t"
    "addu     %[temp4],  %[temp4],  %[temp12]          \n\t"
    "addu     %[temp8],  %[temp16], %[temp4]           \n\t"
    "subu     %[temp4],  %[temp16], %[temp4]           \n\t"
    "addu     %[temp16], %[temp1],  %[temp9]           \n\t"
    "subu     %[temp1],  %[temp1],  %[temp9]           \n\t"
    // in[3], in[7], in[11], in[15]
    "lh       %[temp3],  6(%[in])                      \n\t"
    "lh       %[temp7],  14(%[in])                     \n\t"
    "lh       %[temp11], 22(%[in])                     \n\t"
    "lh       %[temp15], 30(%[in])                     \n\t"
    "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
    "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
    MUL_SHIFT_C1(temp17, temp13)
    MUL_SHIFT_C1_IO(temp5, temp19)
    "mul      %[temp13], %[temp13], %[kC2]             \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
    "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
    MUL_SHIFT_C1(temp17, temp14)
    "mul      %[temp14], %[temp14], %[kC2]             \n\t"
    "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
    "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
    "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
    MUL_SHIFT_C1_IO(temp6, temp19)
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
    "addu     %[temp10], %[temp16], %[temp6]           \n\t"
    "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
    "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
    MUL_SHIFT_C1(temp17, temp15)
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
    "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
    MUL_SHIFT_C1_IO(temp7, temp19)
    // Add 4 to four intermediate values ahead of the final ">> 3" shifts
    // (presumably a rounding bias -- confirm).
    "addiu    %[temp8],  %[temp8],  4                  \n\t"
    "addiu    %[temp12], %[temp12], 4                  \n\t"
    "addiu    %[temp0],  %[temp0],  4                  \n\t"
    "addiu    %[temp4],  %[temp4],  4                  \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "subu     %[temp17], %[temp11], %[temp17]          \n\t"
    "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
    "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp17]          \n\t"
    "addu     %[temp11], %[temp16], %[temp7]           \n\t"
    "subu     %[temp7],  %[temp16], %[temp7]           \n\t"
    "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
    MUL_SHIFT_C1(temp17, temp11)
    MUL_SHIFT_C1_IO(temp9, temp19)
    "mul      %[temp11], %[temp11], %[kC2]             \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
    "addu     %[temp10], %[temp12], %[temp14]          \n\t"
    "subu     %[temp12], %[temp12], %[temp14]          \n\t"
    "mul      %[temp14], %[temp13], %[kC2]             \n\t"
    MUL_SHIFT_C1(temp9, temp15)
    MUL_SHIFT_C1_IO(temp13, temp19)
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
    "addu     %[temp15], %[temp13], %[temp15]          \n\t"
    "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
    "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
    MUL_SHIFT_C1(temp13, temp3)
    MUL_SHIFT_C1_IO(temp1, temp19)
    "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
    "sra      %[temp2],  %[temp2],  16                 \n\t"
    "sra      %[temp3],  %[temp3],  16                 \n\t"
    "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
    "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
    "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
    "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
    MUL_SHIFT_C1(temp1, temp7)
    MUL_SHIFT_C1_IO(temp5, temp19)
    "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
    "sra      %[temp6],  %[temp6],  16                 \n\t"
    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
    "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
    // Last add/subtract stage. Each of the 16 results is shifted right by 3
    // and is later added to one dst byte.
    "addu     %[temp5],  %[temp16], %[temp11]          \n\t"
    "subu     %[temp16], %[temp16], %[temp11]          \n\t"
    "addu     %[temp11], %[temp8],  %[temp17]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp17]          \n\t"
    "sra      %[temp5],  %[temp5],  3                  \n\t"
    "sra      %[temp16], %[temp16], 3                  \n\t"
    "sra      %[temp11], %[temp11], 3                  \n\t"
    "sra      %[temp8],  %[temp8],  3                  \n\t"
    "addu     %[temp17], %[temp10], %[temp15]          \n\t"
    "subu     %[temp10], %[temp10], %[temp15]          \n\t"
    "addu     %[temp15], %[temp12], %[temp9]           \n\t"
    "subu     %[temp12], %[temp12], %[temp9]           \n\t"
    "sra      %[temp17], %[temp17], 3                  \n\t"
    "sra      %[temp10], %[temp10], 3                  \n\t"
    "sra      %[temp15], %[temp15], 3                  \n\t"
    "sra      %[temp12], %[temp12], 3                  \n\t"
    "addu     %[temp9],  %[temp14], %[temp3]           \n\t"
    "subu     %[temp14], %[temp14], %[temp3]           \n\t"
    "addu     %[temp3],  %[temp0],  %[temp13]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp13]          \n\t"
    "sra      %[temp9],  %[temp9],  3                  \n\t"
    "sra      %[temp14], %[temp14], 3                  \n\t"
    "sra      %[temp3],  %[temp3],  3                  \n\t"
    "sra      %[temp0],  %[temp0],  3                  \n\t"
    "addu     %[temp13], %[temp2],  %[temp7]           \n\t"
    "subu     %[temp2],  %[temp2],  %[temp7]           \n\t"
    "addu     %[temp7],  %[temp4],  %[temp1]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp1]           \n\t"
    "sra      %[temp13], %[temp13], 3                  \n\t"
    "sra      %[temp2],  %[temp2],  3                  \n\t"
    "sra      %[temp7],  %[temp7],  3                  \n\t"
    "sra      %[temp4],  %[temp4],  3                  \n\t"
    // temp6 = 255, the saturation value used by every clamp below.
    "addiu    %[temp6],  $zero,     255                \n\t"
    // Row 0 (offset 0*BPS), one pixel at a time; the values added to columns
    // 0..3 are temp5, temp11, temp8, temp16.
    // Clamp idiom used for every pixel: if (sum >> 8) is zero the sum is
    // already in [0, 255] and the branch skips the fix-up. Otherwise the sum
    // is zeroed (xor), then replaced by 255 (movz) when its sign word
    // (sum >> 31) is zero, i.e. when it overflowed on the positive side.
    "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
    "sra      %[temp5],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
    "beqz     %[temp5],  1f                            \n\t"
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "1:                                                  \n\t"
    "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
    "sra      %[temp11], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
    "beqz     %[temp11], 2f                            \n\t"
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "2:                                                  \n\t"
    "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
    "sra      %[temp8],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
    "beqz     %[temp8],  3f                            \n\t"
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "3:                                                  \n\t"
    "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
    "sra      %[temp16], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
    "beqz     %[temp16], 4f                            \n\t"
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "4:                                                  \n\t"
    "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
    // Row 1 (offset 1*BPS), four pixels at once; the values added to columns
    // 0..3 are temp17, temp15, temp12, temp10.
    "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
    "addu     %[temp16], %[temp16], %[temp10]          \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "beqz     %[temp18], 5f                            \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "5:                                                  \n\t"
    "sra      %[temp18], %[temp8],  8                  \n\t"
    "sra      %[temp1],  %[temp8],  31                 \n\t"
    "beqz     %[temp18], 6f                            \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp1]           \n\t"
  "6:                                                  \n\t"
    "sra      %[temp18], %[temp11], 8                  \n\t"
    "sra      %[temp1],  %[temp11], 31                 \n\t"
    "sra      %[temp17], %[temp16], 8                  \n\t"
    "sra      %[temp15], %[temp16], 31                 \n\t"
    "beqz     %[temp18], 7f                            \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp1]           \n\t"
  "7:                                                  \n\t"
    "beqz     %[temp17], 8f                            \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
  "8:                                                  \n\t"
    "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    // Row 2 (offset 2*BPS); the values added to columns 0..3 are temp9,
    // temp3, temp0, temp14.
    "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
    "addu     %[temp16], %[temp16], %[temp14]          \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "sra      %[temp17], %[temp8],  8                  \n\t"
    "sra      %[temp15], %[temp8],  31                 \n\t"
    "sra      %[temp12], %[temp11], 8                  \n\t"
    "sra      %[temp10], %[temp11], 31                 \n\t"
    "sra      %[temp9],  %[temp16], 8                  \n\t"
    "sra      %[temp3],  %[temp16], 31                 \n\t"
    "beqz     %[temp18], 9f                            \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "9:                                                  \n\t"
    "beqz     %[temp17], 10f                           \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
  "10:                                                 \n\t"
    "beqz     %[temp12], 11f                           \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
  "11:                                                 \n\t"
    "beqz     %[temp9],  12f                           \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "12:                                                 \n\t"
    "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    // Row 3 (offset 3*BPS); the values added to columns 0..3 are temp13,
    // temp7, temp4, temp2.
    "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
    "addu     %[temp16], %[temp16], %[temp2]           \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "sra      %[temp17], %[temp8],  8                  \n\t"
    "sra      %[temp15], %[temp8],  31                 \n\t"
    "sra      %[temp12], %[temp11], 8                  \n\t"
    "sra      %[temp10], %[temp11], 31                 \n\t"
    "sra      %[temp9],  %[temp16], 8                  \n\t"
    "sra      %[temp3],  %[temp16], 31                 \n\t"
    "beqz     %[temp18], 13f                           \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "13:                                                 \n\t"
    "beqz     %[temp17], 14f                           \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
  "14:                                                 \n\t"
    "beqz     %[temp12], 15f                           \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
  "15:                                                 \n\t"
    "beqz     %[temp9],  16f                           \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "16:                                                 \n\t"
    "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"

    // Outputs: every temporary is an early-clobber register ("=&r"), so none
    // of them may share a register with an input operand.
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
      [temp18]"=&r"(temp18), [temp19]"=&r"(temp19)
    // Inputs. %[kC1] is not named in the text above; presumably the
    // MUL_SHIFT_C1* macros reference it -- confirm in mips_macro.h.
    : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
    // Clobbers: "memory" because dst is written through a pointer; "hi"/"lo"
    // are the multiply result registers.
    : "memory", "hi", "lo"
  );
}
    535 
    536 static void TransformTwo(const int16_t* WEBP_RESTRICT in,
    537                         uint8_t* WEBP_RESTRICT dst, int do_two) {
    538  TransformOne(in, dst);
    539  if (do_two) {
    540    TransformOne(in + 16, dst + 4);
    541  }
    542 }
    543 
    544 //------------------------------------------------------------------------------
    545 // Entry point
    546 
    547 extern void VP8DspInitMIPS32(void);
    548 
    549 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
    550  VP8InitClipTables();
    551 
    552  VP8Transform = TransformTwo;
    553 
    554  VP8VFilter16 = VFilter16;
    555  VP8HFilter16 = HFilter16;
    556  VP8VFilter8 = VFilter8;
    557  VP8HFilter8 = HFilter8;
    558  VP8VFilter16i = VFilter16i;
    559  VP8HFilter16i = HFilter16i;
    560  VP8VFilter8i = VFilter8i;
    561  VP8HFilter8i = HFilter8i;
    562 
    563  VP8SimpleVFilter16 = SimpleVFilter16;
    564  VP8SimpleHFilter16 = SimpleHFilter16;
    565  VP8SimpleVFilter16i = SimpleVFilter16i;
    566  VP8SimpleHFilter16i = SimpleHFilter16i;
    567 }
    568 
    569 #else  // !WEBP_USE_MIPS32
    570 
// Built when WEBP_USE_MIPS32 is not defined. WEBP_DSP_INIT_STUB is declared
// outside this file; presumably it expands to an empty VP8DspInitMIPS32()
// so the symbol still exists for callers -- TODO confirm against its
// definition.
WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
    572 
    573 #endif  // WEBP_USE_MIPS32