tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

enc_mips_dsp_r2.c (83579B)


      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MIPS version of speed-critical encoding functions.
     11 //
     12 // Author(s): Darko Laus (darko.laus@imgtec.com)
     13 //            Mirko Raus (mirko.raus@imgtec.com)
     14 
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_MIPS_DSP_R2)
     18 
     19 #include "src/dsp/mips_macro.h"
     20 #include "src/enc/cost_enc.h"
     21 #include "src/enc/vp8i_enc.h"
     22 
     // Constants passed to the ITransformOne() inline asm as its [kC1]/[kC2]
     // register inputs. The values come from macros defined outside this file
     // (presumably the inverse-transform multipliers -- TODO confirm in dsp.h).
     23 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
     24 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
     25 
     26 // O - output
     27 // I - input (macro doesn't change it)
        //
        // Asm fragment made of four addq.ph/subq.ph pairs on packed halfwords:
        //   O0 = I0 + I1,  O1 = I0 - I1
        //   O2 = I2 + I3,  O3 = I2 - I3
        //   O4 = I4 + I5,  O5 = I4 - I5
        //   O6 = I6 + I7,  O7 = I6 - I7
        // The instructions run in the order listed, so an output may name the
        // same register as an input only if that input is not read by a later
        // line (Disto4x4_MIPSdspR2 relies on this kind of reuse).
     28 #define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
     29                          I0, I1, I2, I3, I4, I5, I6, I7)                      \
     30  "addq.ph          %[" #O0 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
     31  "subq.ph          %[" #O1 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
     32  "addq.ph          %[" #O2 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
     33  "subq.ph          %[" #O3 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
     34  "addq.ph          %[" #O4 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
     35  "subq.ph          %[" #O5 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
     36  "addq.ph          %[" #O6 "],   %[" #I6 "],  %[" #I7 "]     \n\t"            \
     37  "subq.ph          %[" #O7 "],   %[" #I6 "],  %[" #I7 "]     \n\t"
     38 
     39 // IO - input/output
        //
        // Asm fragment applying absq_s.ph in place to each of the eight registers,
        // i.e. each register is replaced by the absolute value of its two packed
        // halfwords (the "_s" form saturates, per the MIPS DSP ASE).
     40 #define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
     41  "absq_s.ph        %[" #IO0 "],   %[" #IO0 "]                \n\t"            \
     42  "absq_s.ph        %[" #IO1 "],   %[" #IO1 "]                \n\t"            \
     43  "absq_s.ph        %[" #IO2 "],   %[" #IO2 "]                \n\t"            \
     44  "absq_s.ph        %[" #IO3 "],   %[" #IO3 "]                \n\t"            \
     45  "absq_s.ph        %[" #IO4 "],   %[" #IO4 "]                \n\t"            \
     46  "absq_s.ph        %[" #IO5 "],   %[" #IO5 "]                \n\t"            \
     47  "absq_s.ph        %[" #IO6 "],   %[" #IO6 "]                \n\t"            \
     48  "absq_s.ph        %[" #IO7 "],   %[" #IO7 "]                \n\t"
     49 
     50 // dpa.w.ph $ac0, temp0, temp1
     51 //  $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
     52 // dpax.w.ph $ac0, temp0, temp1
     53 //  $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
     54 // O - output
     55 // I - input (macro doesn't change it)
        //
        // Asm fragment that clears $ac0 ('mult' of $zero by $zero), accumulates
        // eight halfword-pair products into it and finally copies the low word of
        // the accumulator to O0 ('mflo'). Operand pairing, in execution order:
        //   dpa  (I2,  I0)   dpax (I5,  I6)   dpa  (I8,  I9)   dpax (I11, I4)
        //   dpa  (I12, I7)   dpax (I13, I1)   dpa  (I14, I3)   dpax (I15, I10)
        // O0 is written only by the last instruction, so it may name the same
        // register as one of the inputs.
     56 #define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
     57                 I8, I9, I10, I11, I12, I13, I14, I15)                         \
     58    "mult            $ac0,      $zero,     $zero              \n\t"            \
     59    "dpa.w.ph        $ac0,      %[" #I2 "],  %[" #I0 "]       \n\t"            \
     60    "dpax.w.ph       $ac0,      %[" #I5 "],  %[" #I6 "]       \n\t"            \
     61    "dpa.w.ph        $ac0,      %[" #I8 "],  %[" #I9 "]       \n\t"            \
     62    "dpax.w.ph       $ac0,      %[" #I11 "], %[" #I4 "]       \n\t"            \
     63    "dpa.w.ph        $ac0,      %[" #I12 "], %[" #I7 "]       \n\t"            \
     64    "dpax.w.ph       $ac0,      %[" #I13 "], %[" #I1 "]       \n\t"            \
     65    "dpa.w.ph        $ac0,      %[" #I14 "], %[" #I3 "]       \n\t"            \
     66    "dpax.w.ph       $ac0,      %[" #I15 "], %[" #I10 "]      \n\t"            \
     67    "mflo            %[" #O0 "],  $ac0                        \n\t"
     68 
     // Output-operand list for an asm statement that uses temp1..temp17: extends
     // OUTPUT_EARLY_CLOBBER_REGS_10() (defined outside this file; presumably it
     // opens the output section and declares temp1..temp10 -- TODO confirm in
     // mips_macro.h) with temp11..temp17 as early-clobber ("=&r") outputs.
     // Used by Disto4x4_MIPSdspR2().
     69 #define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
     70  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
     71  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
     72  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
     73  [temp17]"=&r"(temp17)
     74 
     75 // macro for one horizontal pass in FTransform
     76 // temp0..temp15 holds tmp[0]..tmp[15]
     77 // A - row index; the loads use the byte offset BPS * A from src and ref
     78 // TEMP0..TEMP3 - registers for corresponding tmp elements
        //
        // The src and ref pointers are read from 0(%[args]) and 4(%[args]). One
        // 32-bit word is loaded from each with 'lw', the bytes are widened to
        // halfwords and ref is subtracted from src, giving four differences
        // d0..d3, where dk belongs to byte k of the loaded word (k = 0 being the
        // least significant byte). With
        //   a0 = d0 + d3,  a1 = d1 + d2,  a2 = d1 - d2,  a3 = d0 - d3
        // the macro leaves
        //   TEMP0 = (a0 + a1) * 8
        //   TEMP1 = (a2 * c2217 + a3 * c5352 + 1812) >> 9
        //   TEMP2 = (a0 - a1) * 8
        //   TEMP3 = (a3 * c2217 - a2 * c5352 + 937) >> 9
        // temp16..temp19 are used as scratch. The enclosing asm statement must
        // provide the %[args], %[c2217] and %[c5352] operands.
        // NOTE(review): 'lw' is used here while other code in this file uses
        // 'ulw'; presumably src/ref rows are word-aligned -- confirm with callers.
     79 #define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
     80  "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
     81  "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
     82  "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
     83  "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
     84  "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
     85  "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
     86  "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
     87  "preceu.ph.qbr   %[" #TEMP3 "],   %[" #TEMP3 "]                       \n\t"  \
     88  "subq.ph         %[" #TEMP0 "],   %[" #TEMP0 "],   %[" #TEMP1 "]      \n\t"  \
     89  "subq.ph         %[" #TEMP2 "],   %[" #TEMP2 "],   %[" #TEMP3 "]      \n\t"  \
     90  "rotr            %[" #TEMP0 "],   %[" #TEMP0 "],   16                 \n\t"  \
     91  "addq.ph         %[" #TEMP1 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
     92  "subq.ph         %[" #TEMP3 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
     93  "seh             %[" #TEMP0 "],   %[" #TEMP1 "]                       \n\t"  \
     94  "sra             %[temp16],     %[" #TEMP1 "],   16                   \n\t"  \
     95  "seh             %[temp19],     %[" #TEMP3 "]                         \n\t"  \
     96  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   16                 \n\t"  \
     97  "subu            %[" #TEMP2 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
     98  "addu            %[" #TEMP0 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
     99  "mul             %[temp17],     %[temp19],     %[c2217]               \n\t"  \
    100  "mul             %[temp18],     %[" #TEMP3 "],   %[c5352]             \n\t"  \
    101  "mul             %[" #TEMP1 "],   %[temp19],     %[c5352]             \n\t"  \
    102  "mul             %[temp16],     %[" #TEMP3 "],   %[c2217]             \n\t"  \
    103  "sll             %[" #TEMP2 "],   %[" #TEMP2 "],   3                  \n\t"  \
    104  "sll             %[" #TEMP0 "],   %[" #TEMP0 "],   3                  \n\t"  \
    105  "subu            %[" #TEMP3 "],   %[temp17],     %[temp18]            \n\t"  \
    106  "addu            %[" #TEMP1 "],   %[temp16],     %[" #TEMP1 "]        \n\t"  \
    107  "addiu           %[" #TEMP3 "],   %[" #TEMP3 "],   937                \n\t"  \
    108  "addiu           %[" #TEMP1 "],   %[" #TEMP1 "],   1812               \n\t"  \
    109  "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   9                  \n\t"  \
    110  "sra             %[" #TEMP1 "],   %[" #TEMP1 "],   9                  \n\t"
    111 
    112 // macro for one vertical pass in FTransform
    113 // temp0..temp15 holds tmp[0]..tmp[15]
    114 // A..D - offsets in bytes to store to out buffer
    115 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
        //
        // With a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8, a2 = TEMP4 - TEMP8 and
        // a3 = TEMP0 - TEMP12, the macro stores four halfwords ('sh') relative to
        // the pointer held in %[temp20]:
        //   offset A: (a0 + a1 + 7) >> 4
        //   offset B: ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
        //   offset C: (a0 - a1 + 7) >> 4
        //   offset D: (a3 * c2217 - a2 * c5352 + 51000) >> 16
        // The 51000 bias is added in two steps (30000 + 21000), presumably because
        // it does not fit the signed 16-bit immediate of 'addiu'. The
        // '+ (a3 != 0)' term is implemented with 'movn', which replaces TEMP12 by
        // TEMP12 + 1 only when temp19 (a3) is non-zero.
        // TEMP0/TEMP4/TEMP8/TEMP12 are overwritten; temp16..temp19 are scratch.
    116 #define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)                 \
    117  "addu            %[temp16],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
    118  "subu            %[temp19],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
    119  "addu            %[temp17],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
    120  "subu            %[temp18],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
    121  "mul             %[" #TEMP8 "],   %[temp19],     %[c2217]         \n\t"      \
    122  "mul             %[" #TEMP12 "],  %[temp18],     %[c2217]         \n\t"      \
    123  "mul             %[" #TEMP4 "],   %[temp19],     %[c5352]         \n\t"      \
    124  "mul             %[temp18],     %[temp18],     %[c5352]           \n\t"      \
    125  "addiu           %[temp16],     %[temp16],     7                  \n\t"      \
    126  "addu            %[" #TEMP0 "],   %[temp16],     %[temp17]        \n\t"      \
    127  "sra             %[" #TEMP0 "],   %[" #TEMP0 "],   4              \n\t"      \
    128  "addu            %[" #TEMP12 "],  %[" #TEMP12 "],  %[" #TEMP4 "]  \n\t"      \
    129  "subu            %[" #TEMP4 "],   %[temp16],     %[temp17]        \n\t"      \
    130  "sra             %[" #TEMP4 "],   %[" #TEMP4 "],   4              \n\t"      \
    131  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   30000          \n\t"      \
    132  "addiu           %[" #TEMP12 "],  %[" #TEMP12 "],  12000          \n\t"      \
    133  "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   21000          \n\t"      \
    134  "subu            %[" #TEMP8 "],   %[" #TEMP8 "],   %[temp18]      \n\t"      \
    135  "sra             %[" #TEMP12 "],  %[" #TEMP12 "],  16             \n\t"      \
    136  "sra             %[" #TEMP8 "],   %[" #TEMP8 "],   16             \n\t"      \
    137  "addiu           %[temp16],     %[" #TEMP12 "],  1                \n\t"      \
    138  "movn            %[" #TEMP12 "],  %[temp16],     %[temp19]        \n\t"      \
    139  "sh              %[" #TEMP0 "],   " #A "(%[temp20])               \n\t"      \
    140  "sh              %[" #TEMP4 "],   " #C "(%[temp20])               \n\t"      \
    141  "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
    142  "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
    143 
    // Transforms the 4x4 block of differences 'src' - 'ref' (rows BPS bytes
    // apart) into 16 halfwords written to 'out', using a single inline-asm
    // statement.
    //   src, ref: each read as four 32-bit words, one per row (loaded with 'lw').
    //   out:      receives 16 halfwords at byte offsets 0..30.
    // The three pointers reach the asm through the args[] array; the macros
    // reload them with 'lw' from 0/4/8(%[args]), which assumes 4-byte pointers.
    144 static void FTransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT src,
    145                                 const uint8_t* WEBP_RESTRICT ref,
    146                                 int16_t* WEBP_RESTRICT out) {
    147  const int c2217 = 2217;
    148  const int c5352 = 5352;
    149  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    150  int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
    151  int temp17, temp18, temp19, temp20;
    152  const int* const args[3] =
    153      { (const int*)src, (const int*)ref, (const int*)out };
    154
    155  __asm__ volatile (
           // Horizontal passes: row k of the block produces temp(4k)..temp(4k+3).
    156    HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
    157    HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
    158    HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
    159    HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
           // temp20 = args[2], i.e. the 'out' pointer used by the stores below.
    160    "lw            %[temp20],     8(%[args])                  \n\t"
           // Vertical passes: column i combines temp(i), temp(4+i), temp(8+i) and
           // temp(12+i); the four results go to byte offsets 2*i + {0, 8, 16, 24}.
    161    VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    162    VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
    163    VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
    164    VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
           // OUTPUT_EARLY_CLOBBER_REGS_18() comes from outside this file
           // (presumably it declares temp1..temp18); temp0, temp19 and temp20 are
           // added here.
    165    OUTPUT_EARLY_CLOBBER_REGS_18(),
    166      [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
    167    : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
    168    : "memory", "hi", "lo"
    169  );
    170 }
    171 
    172 #undef VERTICAL_PASS
    173 #undef HORIZONTAL_PASS
    174 
    // Processes one block with a single inline-asm statement: reads the 16
    // halfword coefficients at byte offsets 0..30 of 'in', reads 'ref' and writes
    // 'dst'. The helper macros (LOAD_IN_X2, ADD_SUB_HALVES, MUL_SHIFT_SUM,
    // INSERT_HALF_X2, SRA_16, SHIFT_R_SUM_X2, PACK_2_HALVES_TO_WORD,
    // LOAD_WITH_OFFSET_X4, CONVERT_2_BYTES_TO_HALF, STORE_SAT_SUM_X2) are defined
    // outside this file.
    // NOTE(review): judging by the macro names and arguments, the result is
    // presumably 'ref' plus the inverse-transformed coefficients, saturated and
    // stored to four rows of 'dst' that are BPS bytes apart -- confirm against
    // mips_macro.h.
    175 static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
    176                                      const int16_t* WEBP_RESTRICT in,
    177                                      uint8_t* WEBP_RESTRICT dst) {
    178  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
    179  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
    180
    181  __asm__ volatile (
           // First pass over the coefficients (the second pass is marked
           // "horizontal" below).
    182    "ulw              %[temp1],   0(%[in])                 \n\t"
    183    "ulw              %[temp2],   16(%[in])                \n\t"
    184    LOAD_IN_X2(temp5, temp6, 24, 26)
    185    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    186    LOAD_IN_X2(temp1, temp2, 8, 10)
    187    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
    188                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
    189                  temp13, temp11, temp14, temp12)
    190    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    191    "ulw              %[temp17],  4(%[in])                 \n\t"
    192    "ulw              %[temp18],  20(%[in])                \n\t"
    193    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    194    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    195    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    196    LOAD_IN_X2(temp17, temp18, 12, 14)
    197    LOAD_IN_X2(temp9, temp10, 28, 30)
    198    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
    199                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
    200                  temp15, temp4, temp16, temp17)
    201    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    202    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    203    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
    204
    205    // horizontal
    206    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    207    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    208    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
           // temp2 = 4 in both halfwords; it is added to temp1 and temp6 below
           // (presumably the rounding bias for the final shift -- TODO confirm).
    209    "repl.ph          %[temp2],   0x4                      \n\t"
    210    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    211    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    212    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    213    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    214    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    215    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
    216                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
    217                  temp6, temp17, temp8, temp18)
    218    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
    219                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
    220                  temp18, temp12, temp17, temp16)
    221    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    222    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    223    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
    224                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
    225                   temp6)
    226    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
    227                          temp16, temp11, temp10, temp15, temp14)
           // Load 'ref', widen its bytes and store the combined result to 'dst'.
    228    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
    229                        0, 0, 0, 0,
    230                        0, 1, 2, 3,
    231                        BPS)
    232    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
    233                            temp11, temp10, temp11, temp14, temp15)
    234    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
    235                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
    236                     dst, 0, 1, 2, 3, BPS)
    237
    238    OUTPUT_EARLY_CLOBBER_REGS_18()
    239    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
    240    : "memory", "hi", "lo"
    241  );
    242 }
    243 
    244 static void ITransform_MIPSdspR2(const uint8_t* WEBP_RESTRICT ref,
    245                                 const int16_t* WEBP_RESTRICT in,
    246                                 uint8_t* WEBP_RESTRICT dst, int do_two) {
    247  ITransformOne(ref, in, dst);
    248  if (do_two) {
    249    ITransformOne(ref + 4, in + 16, dst + 4);
    250  }
    251 }
    252 
    // Returns abs(sum_b - sum_a) >> 5, where sum_a and sum_b are weighted sums
    // produced by the same instruction sequence from the blocks at 'a' and 'b'.
    // For each block: four words are loaded and widened to halfwords, several
    // ADD_SUB_HALVES_X4 add/subtract stages are applied (with a
    // PACK_2_HALVES_TO_WORD regrouping in between), ABS_X8 takes absolute values,
    // and MUL_HALF accumulates the products with the words loaded from 'w'.
    // NOTE(review): LOAD_WITH_OFFSET_X4 is defined outside this file; presumably
    // the a/b loads read four rows BPS bytes apart and the two 'w' loads read
    // the 16 weights at byte offsets 0..28 -- confirm in mips_macro.h.
    253 static int Disto4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
    254                              const uint8_t* WEBP_RESTRICT const b,
    255                              const uint16_t* WEBP_RESTRICT const w) {
    256  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
    257  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
    258
    259  __asm__ volatile (
           // Block 'a': the weighted sum ends up in temp17.
    260    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
    261                        0, 0, 0, 0,
    262                        0, 1, 2, 3,
    263                        BPS)
    264    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
    265                            temp12, temp1, temp2, temp3, temp4)
    266    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    267                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    268    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
    269                          temp7, temp2, temp4, temp6, temp8)
    270    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
    271                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    272    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
    273                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    274    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
    275                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    276    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    277    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
    278                        0, 4, 8, 12,
    279                        0, 0, 0, 0,
    280                        0)
    281    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
    282                        0, 4, 8, 12,
    283                        1, 1, 1, 1,
    284                        16)
    285    MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    286             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
           // Block 'b': same sequence; temp17 is left untouched and the weighted
           // sum ends up in temp3.
    287    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
    288                        0, 0, 0, 0,
    289                        0, 1, 2, 3,
    290                        BPS)
    291    CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
    292                            temp12, temp1, temp2, temp3, temp4)
    293    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    294                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    295    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
    296                          temp7, temp2, temp4, temp6, temp8)
    297    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
    298                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    299    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
    300                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    301    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
    302                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    303    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    304    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
    305                        0, 4, 8, 12,
    306                        0, 0, 0, 0,
    307                        0)
    308    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
    309                        0, 4, 8, 12,
    310                        1, 1, 1, 1,
    311                        16)
           // temp3 is both the output and the I2 input here; this works because
           // MUL_HALF reads I2 in its first dpa and writes the output last.
    312    MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    313             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
    314    OUTPUT_EARLY_CLOBBER_REGS_17()
    315    : [a]"r"(a), [b]"r"(b), [w]"r"(w)
    316    : "memory", "hi", "lo"
    317  );
    318  return abs(temp3 - temp17) >> 5;
    319 }
    320 
    321 static int Disto16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT const a,
    322                                const uint8_t* WEBP_RESTRICT const b,
    323                                const uint16_t* WEBP_RESTRICT const w) {
    324  int D = 0;
    325  int x, y;
    326  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    327    for (x = 0; x < 16; x += 4) {
    328      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
    329    }
    330  }
    331  return D;
    332 }
    333 
    334 //------------------------------------------------------------------------------
    335 // Intra predictions
    336 
    // Asm fragment storing the word in %[value] across row J of the block at
    // %[dst] (rows are BPS bytes apart): two 'usw' stores cover bytes 0..7 and,
    // when SIZE is 16, two more cover bytes 8..15. The '.if' is an assembler
    // directive applied to the stringified SIZE, so SIZE must be something the
    // assembler can evaluate as a constant.
    337 #define FILL_PART(J, SIZE)                                            \
    338    "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    339    "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    340  ".if " #SIZE " == 16                                     \n\t"      \
    341    "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    342    "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    343  ".endif                                                  \n\t"
    344 
    // Statement macro that fills a block at DST (rows BPS bytes apart) with the
    // byte VALUE. 'replv.qb' first replicates the low byte of 'value' into all
    // four bytes of the register; FILL_PART then writes rows 0..7, plus rows
    // 8..15 (16 bytes wide) when SIZE is 16. Any other SIZE behaves like 8.
    // 'value' is a read-write operand because the replicate happens in place.
    345 #define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
    346  int value = (VALUE);                                              \
    347  __asm__ volatile (                                                \
    348    "replv.qb   %[value],  %[value]                      \n\t"      \
    349    FILL_PART( 0, SIZE)                                             \
    350    FILL_PART( 1, SIZE)                                             \
    351    FILL_PART( 2, SIZE)                                             \
    352    FILL_PART( 3, SIZE)                                             \
    353    FILL_PART( 4, SIZE)                                             \
    354    FILL_PART( 5, SIZE)                                             \
    355    FILL_PART( 6, SIZE)                                             \
    356    FILL_PART( 7, SIZE)                                             \
    357  ".if " #SIZE " == 16                                   \n\t"      \
    358    FILL_PART( 8, 16)                                               \
    359    FILL_PART( 9, 16)                                               \
    360    FILL_PART(10, 16)                                               \
    361    FILL_PART(11, 16)                                               \
    362    FILL_PART(12, 16)                                               \
    363    FILL_PART(13, 16)                                               \
    364    FILL_PART(14, 16)                                               \
    365    FILL_PART(15, 16)                                               \
    366  ".endif                                                \n\t"      \
    367    : [value]"+&r"(value)                                           \
    368    : [dst]"r"((DST))                                               \
    369    : "memory"                                                      \
    370  );                                                                \
    371 } while (0)
    372 
    // Generates VerticalPred<SIZE>(DST, TOP) for a SIZE x SIZE block whose rows
    // are BPS bytes apart: when TOP is non-NULL its first SIZE bytes are copied
    // into each of the SIZE rows of DST; otherwise the block is filled with 127.
    // DST and TOP only name the parameters of the generated function.
    // Instantiated below for SIZE 8 and 16, then undefined.
    373 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
    374 static WEBP_INLINE void VerticalPred##SIZE(                                    \
    375    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (TOP)) {        \
    376  int j;                                                                       \
    377  if ((TOP)) {                                                                 \
    378    for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
    379  } else {                                                                     \
    380    FILL_8_OR_16((DST), 127, (SIZE));                                          \
    381  }                                                                            \
    382 }
    383
    384 VERTICAL_PRED(dst, top, 8)
    385 VERTICAL_PRED(dst, top, 16)
    386
    387 #undef VERTICAL_PRED
    388 
    // Generates HorizontalPred<SIZE>(DST, LEFT) for a SIZE x SIZE block whose
    // rows are BPS bytes apart: when LEFT is non-NULL, row j of DST is filled
    // with the byte LEFT[j]; otherwise the whole block is filled with 129.
    // DST and LEFT only name the parameters of the generated function.
    // Instantiated below for SIZE 8 and 16, then undefined.
    389 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
    390 static WEBP_INLINE void HorizontalPred##SIZE(                                  \
    391    uint8_t* WEBP_RESTRICT (DST), const uint8_t* WEBP_RESTRICT (LEFT)) {       \
    392  if (LEFT) {                                                                  \
    393    int j;                                                                     \
    394    for (j = 0; j < (SIZE); ++j) {                                             \
    395      memset((DST) + j * BPS, (LEFT)[j], (SIZE));                              \
    396    }                                                                          \
    397  } else {                                                                     \
    398    FILL_8_OR_16((DST), 129, (SIZE));                                          \
    399  }                                                                            \
    400 }
    401
    402 HORIZONTAL_PRED(dst, left, 8)
    403 HORIZONTAL_PRED(dst, left, 16)
    404
    405 #undef HORIZONTAL_PRED
    406 
    // Asm fragment used by CLIP_8B_TO_DST. On entry temp0 and temp1 each hold
    // four packed bytes and %[leftY_1] holds a per-halfword delta. The bytes are
    // widened to halfwords (temp2/temp3 receive the upper two bytes of
    // temp0/temp1), the delta is added to every halfword, and the
    // shll_s.ph-by-7 / precrqu_s.qb.ph pair packs the sums back into four bytes
    // in temp0 and four bytes in temp1. temp2 and temp3 are clobbered.
    // NOTE(review): per the DSP ASE, the saturating shift followed by the
    // saturating unsigned pack should amount to clamping each sum to [0, 255]
    // -- confirm against the instruction manual.
    407 #define CLIPPING()                                                             \
    408  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
    409  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
    410  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
    411  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
    412  "addu.ph         %[temp2],   %[temp2],   %[leftY_1]    \n\t"                 \
    413  "addu.ph         %[temp0],   %[temp0],   %[leftY_1]    \n\t"                 \
    414  "addu.ph         %[temp3],   %[temp3],   %[leftY_1]    \n\t"                 \
    415  "addu.ph         %[temp1],   %[temp1],   %[leftY_1]    \n\t"                 \
    416  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
    417  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
    418  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
    419  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
    420  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
    421  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"
    422 
    // Statement macro producing one row of DST from TOP and LEFT[y]: the delta
    // LEFT[y] - LEFT[-1] is added to each TOP byte and the result is packed back
    // to bytes by CLIPPING(). Eight bytes are processed, or sixteen when SIZE
    // is 16 (selected by the assembler '.if').
    // Relies on two names from the enclosing scope: the row index 'y' and
    // 'left_1' (LEFT[-1] in both halfwords), both set up by CLIP_TO_DST.
    // leftY_1 starts with LEFT[y] in both halfwords; 'replv.ph' then replicates
    // its low halfword and 'subu.ph' subtracts left_1 per halfword.
    423 #define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do {                              \
    424  int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y];                            \
    425  int temp0, temp1, temp2, temp3;                                              \
    426  __asm__ volatile (                                                           \
    427    "replv.ph        %[leftY_1], %[leftY_1]              \n\t"                 \
    428    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    429    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    430    "subu.ph         %[leftY_1], %[leftY_1], %[left_1]   \n\t"                 \
    431    CLIPPING()                                                                 \
    432    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    433    "usw             %[temp1],   4(%[dst])               \n\t"                 \
    434  ".if " #SIZE " == 16                                   \n\t"                 \
    435    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    436    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    437    CLIPPING()                                                                 \
    438    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    439    "usw             %[temp1],   12(%[dst])              \n\t"                 \
    440  ".endif                                                \n\t"                 \
    441    : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
    442      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    443    : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST))                    \
    444    : "memory"                                                                 \
    445  );                                                                           \
    446 } while (0)
    447 
    // Statement macro running CLIP_8B_TO_DST for rows y = 0..SIZE-1.
    // 'left_1' holds LEFT[-1] in both halfwords, so LEFT[-1] must be readable.
    // DST is advanced by BPS after every row: it must be a modifiable pointer
    // variable and is left pointing SIZE rows past its initial value.
    448 #define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do {                                 \
    449  int y;                                                                       \
    450  const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1];                     \
    451  for (y = 0; y < (SIZE); ++y) {                                               \
    452    CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE));                              \
    453    (DST) += BPS;                                                              \
    454  }                                                                            \
    455 } while (0)
    456 
    // Generates TrueMotion<SIZE>(DST, LEFT, TOP) for a SIZE x SIZE block. The
    // four cases, by which inputs are non-NULL:
    //   LEFT and TOP: CLIP_TO_DST (TOP[x] + LEFT[y] - LEFT[-1], packed to bytes)
    //   LEFT only:    HorizontalPred<SIZE>(DST, LEFT)
    //   TOP only:     VerticalPred<SIZE>(DST, TOP)
    //   neither:      block filled with 129
    // Instantiated below for SIZE 8 and 16; the helper macros are then undefined.
    457 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
    458 static WEBP_INLINE void TrueMotion##SIZE(uint8_t* WEBP_RESTRICT (DST),         \
    459                                         const uint8_t* WEBP_RESTRICT (LEFT),  \
    460                                         const uint8_t* WEBP_RESTRICT (TOP)) { \
    461  if ((LEFT) != NULL) {                                                        \
    462    if ((TOP) != NULL) {                                                       \
    463      CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
    464    } else {                                                                   \
    465      HorizontalPred##SIZE((DST), (LEFT));                                     \
    466    }                                                                          \
    467  } else {                                                                     \
    468    /* true motion without left samples (hence: with default 129 value)    */  \
    469    /* is equivalent to VE prediction where you just copy the top samples. */  \
    470    /* Note that if top samples are not available, the default value is    */  \
    471    /* then 129, and not 127 as in the VerticalPred case.                  */  \
    472    if ((TOP) != NULL) {                                                       \
    473      VerticalPred##SIZE((DST), (TOP));                                        \
    474    } else {                                                                   \
    475      FILL_8_OR_16((DST), 129, (SIZE));                                        \
    476    }                                                                          \
    477  }                                                                            \
    478 }
    479
    480 TRUE_MOTION(dst, left, top, 8)
    481 TRUE_MOTION(dst, left, top, 16)
    482
    483 #undef TRUE_MOTION
    484 #undef CLIP_TO_DST
    485 #undef CLIP_8B_TO_DST
    486 #undef CLIPPING
    487 
    // DC prediction for a 16x16 block: computes one value from the neighbouring
    // samples and hands it to FILL_8_OR_16(dst, DC, 16).
    // 'left' and/or 'top' may be NULL (samples not available):
    //   top and left : DC = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5
    //   only one     : DC = (2 * sum(available 16 samples) + 16) >> 5
    //   neither      : DC = 0x80
    // 'raddu.w.qb' adds the four unsigned bytes of a word; 'shra_r.w' is a
    // right shift with rounding.
    // NOTE(review): LOAD_WITH_OFFSET_X4 and FILL_8_OR_16 are defined outside
    // this section; presumably they load four words at byte offsets 0/4/8/12
    // and fill the block with the DC byte, respectively -- confirm.
    488 static WEBP_INLINE void DCMode16(uint8_t* WEBP_RESTRICT dst,
    489                                 const uint8_t* WEBP_RESTRICT left,
    490                                 const uint8_t* WEBP_RESTRICT top) {
    491  int DC, DC1;
    492  int temp0, temp1, temp2, temp3;
    493 
    494  __asm__ volatile(
           // No top samples: continue at label 2.
    495    "beqz        %[top],   2f                  \n\t"
    496    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
    497                        0, 4, 8, 12,
    498                        0, 0, 0, 0,
    499                        0)
           // DC = sum of the top bytes.
    500    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    501    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    502    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    503    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    504    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    505    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    506    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
           // DC1 defaults to the top sum, so a missing 'left' doubles it.
    507    "move        %[DC1],   %[DC]               \n\t"
    508    "beqz        %[left],  1f                  \n\t"
    509    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
    510                        0, 4, 8, 12,
    511                        0, 0, 0, 0,
    512                        0)
           // DC1 = sum of the left bytes.
    513    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    514    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    515    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    516    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    517    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    518    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    519    "addu        %[DC1],   %[temp0], %[temp2]  \n\t"
    520  "1:                                          \n\t"
    521    "addu        %[DC],   %[DC],     %[DC1]    \n\t"
    522    "j           3f                            \n\t"
           // Label 2: top is NULL. If left is NULL too, use the default (label 4).
    523  "2:                                          \n\t"
    524    "beqz        %[left],  4f                  \n\t"
    525    LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
    526                        0, 4, 8, 12,
    527                        0, 0, 0, 0,
    528                        0)
    529    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    530    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    531    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    532    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    533    "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    534    "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    535    "addu        %[DC],    %[temp0], %[temp2]  \n\t"
           // Left only: count the left sum twice.
    536    "addu        %[DC],    %[DC],    %[DC]     \n\t"
           // Label 3: rounded division of the accumulated sum by 32.
    537  "3:                                          \n\t"
    538    "shra_r.w    %[DC],    %[DC],    5         \n\t"
    539    "j           5f                            \n\t"
           // Label 4: neither top nor left is available.
    540  "4:                                          \n\t"
    541    "li          %[DC],    0x80                \n\t"
    542  "5:                                          \n\t"
    543    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
    544      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    545    : [left]"r"(left), [top]"r"(top)
    546    : "memory"
    547  );
    548 
    549  FILL_8_OR_16(dst, DC, 16);
    550 }
    551 
    // DC prediction for an 8x8 block: computes one value from the neighbouring
    // samples and hands it to FILL_8_OR_16(dst, DC, 8).
    // 'left' and/or 'top' may be NULL (samples not available):
    //   top and left : DC = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4
    //   only one     : DC = (2 * sum(available 8 samples) + 8) >> 4
    //   neither      : DC = 0x80
    // 'ulw' is an unaligned word load, 'raddu.w.qb' adds the four unsigned bytes
    // of a word and 'shra_r.w' is a right shift with rounding.
    // NOTE(review): FILL_8_OR_16 is defined outside this section; presumably it
    // fills the block with the DC byte -- confirm.
    552 static WEBP_INLINE void DCMode8(uint8_t* WEBP_RESTRICT dst,
    553                                const uint8_t* WEBP_RESTRICT left,
    554                                const uint8_t* WEBP_RESTRICT top) {
    555  int DC, DC1;
    556  int temp0, temp1, temp2, temp3;
    557 
    558  __asm__ volatile(
           // No top samples: continue at label 2.
    559    "beqz        %[top],   2f                  \n\t"
    560    "ulw         %[temp0], 0(%[top])           \n\t"
    561    "ulw         %[temp1], 4(%[top])           \n\t"
    562    "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    563    "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    564    "addu        %[DC],    %[temp0], %[temp1]  \n\t"
           // DC1 defaults to the top sum, so a missing 'left' doubles it.
    565    "move        %[DC1],   %[DC]               \n\t"
    566    "beqz        %[left],  1f                  \n\t"
    567    "ulw         %[temp2], 0(%[left])          \n\t"
    568    "ulw         %[temp3], 4(%[left])          \n\t"
    569    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    570    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    571    "addu        %[DC1],   %[temp2], %[temp3]  \n\t"
    572  "1:                                          \n\t"
    573    "addu        %[DC],    %[DC],    %[DC1]    \n\t"
    574    "j           3f                            \n\t"
           // Label 2: top is NULL. If left is NULL too, use the default (label 4).
    575  "2:                                          \n\t"
    576    "beqz        %[left],  4f                  \n\t"
    577    "ulw         %[temp2], 0(%[left])          \n\t"
    578    "ulw         %[temp3], 4(%[left])          \n\t"
    579    "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    580    "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    581    "addu        %[DC],    %[temp2], %[temp3]  \n\t"
           // Left only: count the left sum twice.
    582    "addu        %[DC],    %[DC],    %[DC]     \n\t"
           // Label 3: rounded division of the accumulated sum by 16.
    583  "3:                                          \n\t"
    584    "shra_r.w    %[DC], %[DC], 4               \n\t"
    585    "j           5f                            \n\t"
           // Label 4: neither top nor left is available.
    586  "4:                                          \n\t"
    587    "li          %[DC], 0x80                   \n\t"
    588  "5:                                          \n\t"
    589    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
    590      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    591    : [left]"r"(left), [top]"r"(top)
    592    : "memory"
    593  );
    594 
    595  FILL_8_OR_16(dst, DC, 8);
    596 }
    597 
    // 4x4 DC prediction. Sums the four bytes at top[0..3] and the four bytes at
    // top[-5..-2], computes (sum + 4) >> 3, and writes that byte to all 16
    // positions of the 4x4 block at 'dst' (rows are BPS bytes apart).
    // Both 'top' ranges are read unconditionally, so they must be readable.
    598 static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    599  int temp0, temp1;
    600  __asm__ volatile(
    601    "ulw          %[temp0],   0(%[top])               \n\t"
    602    "ulw          %[temp1],   -5(%[top])              \n\t"
           // raddu.w.qb: add up the four unsigned bytes of each word.
    603    "raddu.w.qb   %[temp0],   %[temp0]                \n\t"
    604    "raddu.w.qb   %[temp1],   %[temp1]                \n\t"
    605    "addu         %[temp0],   %[temp0],    %[temp1]   \n\t"
           // Rounded average of the eight samples.
    606    "addiu        %[temp0],   %[temp0],    4          \n\t"
    607    "srl          %[temp0],   %[temp0],    3          \n\t"
           // Replicate the low byte into all four byte lanes, then store 4 rows.
    608    "replv.qb     %[temp0],   %[temp0]                \n\t"
    609    "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
    610    "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    611    "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
    612    "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
    613    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    614    : [top]"r"(top), [dst]"r"(dst)
    615    : "memory"
    616  );
    617 }
    618 
    // 4x4 TrueMotion prediction. Reads the four bytes top[0..3], the byte
    // top[-1] and the word at top[-5..-2]; every output byte is
    //   top[x] + (one byte of top[-5..-2]) - top[-1]
    // clamped to an unsigned byte, with one byte of top[-5..-2] used per row.
    // Results are written as four words, BPS bytes apart, starting at 'dst'.
    // NOTE(review): which byte of top[-5..-2] feeds which row depends on the
    // byte order of the 'ulw' load; traced for little-endian, row 0 uses
    // top[-2] and row 3 uses top[-5] -- confirm.
    619 static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    620  int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
         // Mask keeping the low byte of each 16-bit lane.
    621  const int c35 = 0xff00ff;
    622  __asm__ volatile (
    623    "lbu              %[temp1],  0(%[top])                     \n\t"
    624    "lbu              %[a10],    1(%[top])                     \n\t"
    625    "lbu              %[temp2],  2(%[top])                     \n\t"
    626    "lbu              %[a32],    3(%[top])                     \n\t"
    627    "ulw              %[temp0],  -5(%[top])                    \n\t"
    628    "lbu              %[temp4],  -1(%[top])                    \n\t"
           // a10 = (top[1] << 16) | top[0], a32 = (top[3] << 16) | top[2].
    629    "append           %[a10],    %[temp1],   16                \n\t"
    630    "append           %[a32],    %[temp2],   16                \n\t"
           // Spread the four bytes of top[-5..-2] over 16-bit lanes of temp1/temp0
           // and subtract top[-1] from each lane.
    631    "replv.ph         %[temp4],  %[temp4]                      \n\t"
    632    "shrl.ph          %[temp1],  %[temp0],   8                 \n\t"
    633    "and              %[temp0],  %[temp0],   %[c35]            \n\t"
    634    "subu.ph          %[temp1],  %[temp1],   %[temp4]          \n\t"
    635    "subu.ph          %[temp0],  %[temp0],   %[temp4]          \n\t"
           // Broadcast each of the four differences into both lanes of a register.
    636    "srl              %[temp2],  %[temp1],   16                \n\t"
    637    "srl              %[temp3],  %[temp0],   16                \n\t"
    638    "replv.ph         %[temp2],  %[temp2]                      \n\t"
    639    "replv.ph         %[temp3],  %[temp3]                      \n\t"
    640    "replv.ph         %[temp4],  %[temp1]                      \n\t"
    641    "replv.ph         %[temp5],  %[temp0]                      \n\t"
           // Add the top pairs; the saturating shift by 7 followed by
           // precrqu_s.qb.ph reduces every 16-bit lane to a clamped unsigned byte.
    642    "addu.ph          %[temp0],  %[temp3],   %[a10]            \n\t"
    643    "addu.ph          %[temp1],  %[temp3],   %[a32]            \n\t"
    644    "addu.ph          %[temp3],  %[temp2],   %[a10]            \n\t"
    645    "addu.ph          %[temp2],  %[temp2],   %[a32]            \n\t"
    646    "shll_s.ph        %[temp0],  %[temp0],   7                 \n\t"
    647    "shll_s.ph        %[temp1],  %[temp1],   7                 \n\t"
    648    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    649    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    650    "precrqu_s.qb.ph  %[temp0],  %[temp1],   %[temp0]          \n\t"
    651    "precrqu_s.qb.ph  %[temp1],  %[temp2],   %[temp3]          \n\t"
           // Same add-and-clamp sequence for the remaining two rows.
    652    "addu.ph          %[temp2],  %[temp5],   %[a10]            \n\t"
    653    "addu.ph          %[temp3],  %[temp5],   %[a32]            \n\t"
    654    "addu.ph          %[temp5],  %[temp4],   %[a10]            \n\t"
    655    "addu.ph          %[temp4],  %[temp4],   %[a32]            \n\t"
    656    "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    657    "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    658    "shll_s.ph        %[temp4],  %[temp4],   7                 \n\t"
    659    "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
    660    "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
    661    "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
    662    "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
    663    "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
    664    "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
    665    "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
    666    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    667      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    668      [a10]"=&r"(a10), [a32]"=&r"(a32)
    669    : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
    670    : "memory"
    671  );
    672 }
    673 
    // 4x4 vertical prediction with smoothing. Reads the six bytes top[-1..4],
    // applies the 3-tap filter (a + 2*b + c + 2) >> 2 to produce four bytes, and
    // writes the same four bytes to each of the four rows (BPS bytes apart).
    // NOTE(review): traced for little-endian byte order, column x receives
    // (top[x-1] + 2*top[x] + top[x+1] + 2) >> 2 -- confirm.
    674 static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    675  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
    676  __asm__ volatile(
    677    "ulw             %[temp0],   -1(%[top])              \n\t"
    678    "ulh             %[temp1],   3(%[top])               \n\t"
           // Zero-extend the bytes into 16-bit lanes (two samples per register).
    679    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    680    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    681    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
           // temp5/temp6 hold the centre taps; they are doubled below.
    682    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    683    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
    684    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    685    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
           // Add both outer taps, then divide by 4 with rounding.
    686    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
    687    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
    688    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
    689    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
    690    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    691    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
           // Pack the four 16-bit results into one word and store it on every row.
    692    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
    693    "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
    694    "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
    695    "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
    696    "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    697    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    698      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    699      [temp6]"=&r"(temp6)
    700    : [top]"r"(top), [dst]"r"(dst)
    701    : "memory"
    702  );
    703 }
    704 
    // 4x4 horizontal prediction with smoothing. Reads the five bytes top[-5..-1],
    // applies the 3-tap filter (a + 2*b + c + 2) >> 2 along them, and fills each
    // of the four rows (BPS bytes apart) with one replicated result byte.
    // top[-5] is duplicated (replv.ph) to stand in for the sample that would
    // follow it.
    // NOTE(review): traced for little-endian byte order, row r is filled with
    // (top[-1-r] + 2*top[-2-r] + top[-3-r] + 2) >> 2, where row 3 uses top[-5]
    // for the last tap -- confirm.
    705 static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    706  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
    707  __asm__ volatile(
    708    "ulw             %[temp0],   -4(%[top])              \n\t"
    709    "lbu             %[temp1],   -5(%[top])              \n\t"
           // Zero-extend the bytes into 16-bit lanes; temp4 = top[-5] in both lanes.
    710    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    711    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    712    "replv.ph        %[temp4],   %[temp1]                \n\t"
           // temp5/temp6 hold the centre taps; they are doubled below.
    713    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    714    "packrl.ph       %[temp6],   %[temp2],    %[temp4]   \n\t"
    715    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    716    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    717    "addq.ph         %[temp3],   %[temp3],    %[temp5]   \n\t"
    718    "addq.ph         %[temp3],   %[temp3],    %[temp2]   \n\t"
    719    "addq.ph         %[temp2],   %[temp2],    %[temp6]   \n\t"
    720    "addq.ph         %[temp2],   %[temp2],    %[temp4]   \n\t"
    721    "shra_r.ph       %[temp3],   %[temp3],    2          \n\t"
    722    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
           // Each lane of temp3/temp2 is one row value: replicate it to four bytes.
    723    "replv.qb        %[temp0],   %[temp3]                \n\t"
    724    "replv.qb        %[temp1],   %[temp2]                \n\t"
    725    "srl             %[temp3],   %[temp3],    16         \n\t"
    726    "srl             %[temp2],   %[temp2],    16         \n\t"
    727    "replv.qb        %[temp3],   %[temp3]                \n\t"
    728    "replv.qb        %[temp2],   %[temp2]                \n\t"
    729    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
    730    "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    731    "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
    732    "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
    733    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    734      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    735      [temp6]"=&r"(temp6)
    736    : [top]"r"(top), [dst]"r"(dst)
    737    : "memory"
    738  );
    739 }
    740 
    // 4x4 diagonal prediction built from the nine bytes top[-5..3].
    // NOTE(review): the name suggests the VP8 "down-right" 4x4 intra mode --
    // confirm against the reference C implementation.
    // All values are 3-tap filtered: (a + 2*b + c + 2) >> 2. Rows 3 and 1 are
    // packed first; rows 2 and 0 are then derived from them by shifting out one
    // byte and inserting one new filtered value ('prepend ..., 8'), which
    // produces the diagonal pattern. Rows are BPS bytes apart.
    741 static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    742  int temp0, temp1, temp2, temp3, temp4, temp5;
    743  int temp6, temp7, temp8, temp9, temp10, temp11;
    744  __asm__ volatile(
    745    "ulw             %[temp0],    -5(%[top])               \n\t"
    746    "ulw             %[temp1],    -1(%[top])               \n\t"
           // Zero-extend the eight loaded bytes into 16-bit lanes.
    747    "preceu.ph.qbl   %[temp2],    %[temp0]                 \n\t"
    748    "preceu.ph.qbr   %[temp3],    %[temp0]                 \n\t"
    749    "preceu.ph.qbr   %[temp4],    %[temp1]                 \n\t"
    750    "preceu.ph.qbl   %[temp5],    %[temp1]                 \n\t"
           // temp6..temp8: centre taps (doubled below); temp9..temp11 accumulate
           // the three filtered lane pairs.
    751    "packrl.ph       %[temp6],    %[temp2],    %[temp3]    \n\t"
    752    "packrl.ph       %[temp7],    %[temp4],    %[temp2]    \n\t"
    753    "packrl.ph       %[temp8],    %[temp5],    %[temp4]    \n\t"
    754    "shll.ph         %[temp6],    %[temp6],    1           \n\t"
    755    "addq.ph         %[temp9],    %[temp2],    %[temp6]    \n\t"
    756    "shll.ph         %[temp7],    %[temp7],    1           \n\t"
    757    "addq.ph         %[temp9],    %[temp9],    %[temp3]    \n\t"
    758    "shll.ph         %[temp8],    %[temp8],    1           \n\t"
    759    "shra_r.ph       %[temp9],    %[temp9],    2           \n\t"
    760    "addq.ph         %[temp10],   %[temp4],    %[temp7]    \n\t"
    761    "addq.ph         %[temp11],   %[temp5],    %[temp8]    \n\t"
    762    "addq.ph         %[temp10],   %[temp10],   %[temp2]    \n\t"
    763    "addq.ph         %[temp11],   %[temp11],   %[temp4]    \n\t"
    764    "shra_r.ph       %[temp10],   %[temp10],   2           \n\t"
    765    "shra_r.ph       %[temp11],   %[temp11],   2           \n\t"
           // Scalar tail: temp0 = (top[3] + 2*top[2] + top[1] + 2) >> 2.
    766    "lbu             %[temp0],    3(%[top])                \n\t"
    767    "lbu             %[temp1],    2(%[top])                \n\t"
    768    "lbu             %[temp2],    1(%[top])                \n\t"
    769    "sll             %[temp1],    %[temp1],    1           \n\t"
    770    "addu            %[temp0],    %[temp0],    %[temp1]    \n\t"
    771    "addu            %[temp0],    %[temp0],    %[temp2]    \n\t"
    772    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
    773    "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
    774    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
    775    "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
    776    "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
           // Slide each packed row by one byte to obtain the neighbouring row.
    777    "prepend         %[temp9],    %[temp11],   8           \n\t"
    778    "prepend         %[temp10],   %[temp0],    8           \n\t"
    779    "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
    780    "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
    781    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    782      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    783      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    784      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    785    : [top]"r"(top), [dst]"r"(dst)
    786    : "memory"
    787  );
    788 }
    789 
    // 4x4 directional prediction built from the eight bytes top[-4..3].
    // NOTE(review): the name suggests the VP8 "vertical-right" 4x4 intra mode --
    // confirm against the reference C implementation.
    // Two filters are combined: a rounded 2-tap average (addqh_r.ph) and the
    // 3-tap (a + 2*b + c + 2) >> 2 filter (shll.ph/addu.ph/shra_r.ph). Rows 0
    // and 1 are packed directly; rows 3 and 2 are derived from rows 1 and 0 by
    // shifting in one extra filtered byte ('append ..., 8'). Rows are BPS bytes
    // apart.
    790 static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    791  int temp0, temp1, temp2, temp3, temp4;
    792  int temp5, temp6, temp7, temp8, temp9;
    793  __asm__ volatile (
    794    "ulw              %[temp0],   -4(%[top])              \n\t"
    795    "ulw              %[temp1],   0(%[top])               \n\t"
           // Zero-extend the loaded bytes into 16-bit lanes (the 'a' variants pick
           // alternating bytes).
    796    "preceu.ph.qbl    %[temp2],   %[temp0]                \n\t"
    797    "preceu.ph.qbr    %[temp0],   %[temp0]                \n\t"
    798    "preceu.ph.qbla   %[temp3],   %[temp1]                \n\t"
    799    "preceu.ph.qbra   %[temp1],   %[temp1]                \n\t"
    800    "packrl.ph        %[temp7],   %[temp3],    %[temp2]   \n\t"
           // temp4/temp5: 2-tap averages.
    801    "addqh_r.ph       %[temp4],   %[temp1],    %[temp3]   \n\t"
    802    "move             %[temp6],   %[temp1]                \n\t"
    803    "append           %[temp1],   %[temp2],    16         \n\t"
    804    "shll.ph          %[temp9],   %[temp6],    1          \n\t"
    805    "addqh_r.ph       %[temp5],   %[temp7],    %[temp6]   \n\t"
    806    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
           // temp3/temp1/temp6: 3-tap sums, rounded and divided by 4 below.
    807    "addu.ph          %[temp3],   %[temp7],    %[temp3]   \n\t"
    808    "addu.ph          %[temp1],   %[temp1],    %[temp6]   \n\t"
    809    "packrl.ph        %[temp7],   %[temp2],    %[temp0]   \n\t"
    810    "addu.ph          %[temp6],   %[temp0],    %[temp2]   \n\t"
    811    "addu.ph          %[temp3],   %[temp3],    %[temp9]   \n\t"
    812    "addu.ph          %[temp1],   %[temp1],    %[temp8]   \n\t"
    813    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    814    "shra_r.ph        %[temp3],   %[temp3],    2          \n\t"
    815    "shra_r.ph        %[temp1],   %[temp1],    2          \n\t"
    816    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    817    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
           // Interleave the 16-bit results and pack them to bytes: rows 0 and 1.
    818    "precrq.ph.w      %[temp8],   %[temp4],    %[temp5]   \n\t"
    819    "append           %[temp4],   %[temp5],    16         \n\t"
    820    "precrq.ph.w      %[temp2],   %[temp3],    %[temp1]   \n\t"
    821    "append           %[temp3],   %[temp1],    16         \n\t"
    822    "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
    823    "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
    824    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    825    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
           // Rows 3 and 2: rows 1 and 0 with one byte of temp6 shifted in.
    826    "append           %[temp3],   %[temp6],    8          \n\t"
    827    "srl              %[temp6],   %[temp6],    16         \n\t"
    828    "append           %[temp8],   %[temp6],    8          \n\t"
    829    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    830    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    831    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    832      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    833      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    834      [temp9]"=&r"(temp9)
    835    : [top]"r"(top), [dst]"r"(dst)
    836    : "memory"
    837  );
    838 }
    839 
    // 4x4 diagonal prediction built from the eight bytes top[0..7].
    // NOTE(review): the name suggests the VP8 "down-left" 4x4 intra mode --
    // confirm against the reference C implementation.
    // All values are 3-tap filtered: (a + 2*b + c + 2) >> 2. Rows 0 and 2 are
    // packed first; rows 1 and 3 are then derived from them by shifting out one
    // byte and inserting one new filtered value ('prepend ..., 8'). Rows are BPS
    // bytes apart.
    840 static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    841  int temp0, temp1, temp2, temp3, temp4, temp5;
    842  int temp6, temp7, temp8, temp9, temp10, temp11;
    843  __asm__ volatile(
    844    "ulw             %[temp0],    0(%[top])               \n\t"
    845    "ulw             %[temp1],    4(%[top])               \n\t"
           // Zero-extend the eight loaded bytes into 16-bit lanes.
    846    "preceu.ph.qbl   %[temp2],    %[temp0]                \n\t"
    847    "preceu.ph.qbr   %[temp3],    %[temp0]                \n\t"
    848    "preceu.ph.qbr   %[temp4],    %[temp1]                \n\t"
    849    "preceu.ph.qbl   %[temp5],    %[temp1]                \n\t"
           // temp6..temp8: centre taps (doubled below); temp9..temp11 accumulate
           // the three filtered lane pairs.
    850    "packrl.ph       %[temp6],    %[temp2],    %[temp3]   \n\t"
    851    "packrl.ph       %[temp7],    %[temp4],    %[temp2]   \n\t"
    852    "packrl.ph       %[temp8],    %[temp5],    %[temp4]   \n\t"
    853    "shll.ph         %[temp6],    %[temp6],    1          \n\t"
    854    "addq.ph         %[temp9],    %[temp2],    %[temp6]   \n\t"
    855    "shll.ph         %[temp7],    %[temp7],    1          \n\t"
    856    "addq.ph         %[temp9],    %[temp9],    %[temp3]   \n\t"
    857    "shll.ph         %[temp8],    %[temp8],    1          \n\t"
    858    "shra_r.ph       %[temp9],    %[temp9],    2          \n\t"
    859    "addq.ph         %[temp10],   %[temp4],    %[temp7]   \n\t"
    860    "addq.ph         %[temp11],   %[temp5],    %[temp8]   \n\t"
    861    "addq.ph         %[temp10],   %[temp10],   %[temp2]   \n\t"
    862    "addq.ph         %[temp11],   %[temp11],   %[temp4]   \n\t"
    863    "shra_r.ph       %[temp10],   %[temp10],   2          \n\t"
    864    "shra_r.ph       %[temp11],   %[temp11],   2          \n\t"
           // Scalar tail: twice the highest byte of the second word plus the sum
           // of its two upper bytes (i.e. the highest byte weighted 3x), rounded
           // and divided by 4 further down.
    865    "srl             %[temp1],    %[temp1],    24         \n\t"
    866    "sll             %[temp1],    %[temp1],    1          \n\t"
    867    "raddu.w.qb      %[temp5],    %[temp5]                \n\t"
    868    "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]   \n\t"
    869    "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
    870    "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
    871    "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
    872    "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
    873    "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
           // Slide each packed row by one byte to obtain the following row.
    874    "prepend         %[temp9],    %[temp11],   8          \n\t"
    875    "prepend         %[temp10],   %[temp1],    8          \n\t"
    876    "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
    877    "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
    878    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    879      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    880      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    881      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    882    : [top]"r"(top), [dst]"r"(dst)
    883    : "memory"
    884  );
    885 }
    886 
    // 4x4 directional prediction built from the eight bytes top[0..7].
    // NOTE(review): the name suggests the VP8 "vertical-left" 4x4 intra mode --
    // confirm against the reference C implementation.
    // Two filters are combined: a rounded 2-tap average (addqh_r.ph) and the
    // 3-tap (a + 2*b + c + 2) >> 2 filter (shll.ph/addu.ph/shra_r.ph). Rows 0
    // and 1 are packed directly; rows 2 and 3 are derived from them by shifting
    // out one byte and inserting one byte of temp6 ('prepend ..., 8'). Rows are
    // BPS bytes apart.
    887 static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    888  int temp0, temp1, temp2, temp3, temp4;
    889  int temp5, temp6, temp7, temp8, temp9;
    890  __asm__ volatile (
    891    "ulw              %[temp0],   0(%[top])               \n\t"
    892    "ulw              %[temp1],   4(%[top])               \n\t"
           // Zero-extend the loaded bytes into 16-bit lanes (the 'a' variants pick
           // alternating bytes).
    893    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    894    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    895    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    896    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
           // temp4/temp5: 2-tap averages.
    897    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    898    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    899    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    900    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    901    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    902    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
           // temp2/temp0/temp6: 3-tap sums, rounded and divided by 4 below.
    903    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    904    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    905    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    906    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    907    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    908    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    909    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    910    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    911    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    912    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    913    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
           // Interleave the 16-bit results and pack them to bytes: rows 0 and 1.
    914    "precrq.ph.w      %[temp8],   %[temp5],    %[temp4]   \n\t"
    915    "append           %[temp5],   %[temp4],    16         \n\t"
    916    "precrq.ph.w      %[temp3],   %[temp2],    %[temp0]   \n\t"
    917    "append           %[temp2],   %[temp0],    16         \n\t"
    918    "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
    919    "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
    920    "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
           // Rows 2 and 3: rows 0 and 1 with one byte of temp6 shifted in.
    921    "prepend          %[temp8],   %[temp6],    8          \n\t"
    922    "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    923    "srl              %[temp6],   %[temp6],    16         \n\t"
    924    "prepend          %[temp3],   %[temp6],    8          \n\t"
    925    "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    926    "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    927    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    928      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    929      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    930      [temp9]"=&r"(temp9)
    931    : [top]"r"(top), [dst]"r"(dst)
    932    : "memory"
    933  );
    934 }
    935 
    // 4x4 directional prediction built from the eight bytes top[-5..2].
    // NOTE(review): the name suggests the VP8 "horizontal-down" 4x4 intra mode --
    // confirm against the reference C implementation.
    // The arithmetic is the same instruction sequence as in VL4 (2-tap rounded
    // averages via addqh_r.ph plus 3-tap (a + 2*b + c + 2) >> 2 sums), applied to
    // a different input window; only the final packing differs: all four rows
    // are assembled individually with precrq.ph.w / append / precr.qb.ph. Rows
    // are BPS bytes apart.
    936 static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    937  int temp0, temp1, temp2, temp3, temp4;
    938  int temp5, temp6, temp7, temp8, temp9;
    939  __asm__ volatile (
    940    "ulw              %[temp0],   -5(%[top])              \n\t"
    941    "ulw              %[temp1],   -1(%[top])              \n\t"
           // Zero-extend the loaded bytes into 16-bit lanes (the 'a' variants pick
           // alternating bytes).
    942    "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    943    "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    944    "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    945    "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
           // temp4/temp5: 2-tap averages.
    946    "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    947    "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    948    "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    949    "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    950    "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    951    "shll.ph          %[temp8],   %[temp7],    1          \n\t"
           // temp2/temp0/temp6: 3-tap sums, rounded and divided by 4 below.
    952    "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    953    "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    954    "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    955    "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    956    "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    957    "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    958    "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    959    "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    960    "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    961    "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    962    "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
           // Rows 0 and 1 are packed from the upper 16-bit lanes.
    963    "precrq.ph.w      %[temp1],   %[temp2],    %[temp5]   \n\t"
    964    "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
    965    "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
    966    "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
    967    "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
    968    "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
           // Rows 2 and 3 additionally use the lower 16-bit lanes.
    969    "append           %[temp2],   %[temp5],    16         \n\t"
    970    "append           %[temp0],   %[temp4],    16         \n\t"
    971    "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    972    "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
    973    "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
    974    "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    975    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    976      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    977      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    978      [temp9]"=&r"(temp9)
    979    : [top]"r"(top), [dst]"r"(dst)
    980    : "memory"
    981  );
    982 }
    983 
    // 4x4 directional prediction built from only the four bytes top[-5..-2].
    // NOTE(review): the name suggests the VP8 "horizontal-up" 4x4 intra mode --
    // confirm against the reference C implementation.
    // Combines rounded 2-tap averages (addqh_r.ph) with 3-tap
    // (a + 2*b + c + 2) >> 2 values. Row 3 is a single input byte replicated
    // across the row (replv.qb); row 2 packs that byte together with filtered
    // values; row 1 is stitched from one half of row 2 and one half of row 0
    // (packrl.ph). Rows are BPS bytes apart.
    984 static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
    985  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    986  __asm__ volatile (
    987    "ulw             %[temp0],   -5(%[top])              \n\t"
           // Zero-extend the four bytes into 16-bit lanes.
    988    "preceu.ph.qbl   %[temp1],   %[temp0]                \n\t"
    989    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    990    "packrl.ph       %[temp3],   %[temp1],    %[temp2]   \n\t"
           // temp7 = low byte of temp2 replicated to four bytes (stored as row 3).
    991    "replv.qb        %[temp7],   %[temp2]                \n\t"
           // temp4/temp5: 2-tap averages; temp6/temp0: 3-tap filtered values.
    992    "addqh_r.ph      %[temp4],   %[temp1],    %[temp3]   \n\t"
    993    "addqh_r.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    994    "shll.ph         %[temp6],   %[temp3],    1          \n\t"
    995    "addu.ph         %[temp3],   %[temp2],    %[temp3]   \n\t"
    996    "addu.ph         %[temp6],   %[temp1],    %[temp6]   \n\t"
    997    "shll.ph         %[temp0],   %[temp2],    1          \n\t"
    998    "addu.ph         %[temp6],   %[temp6],    %[temp2]   \n\t"
    999    "addu.ph         %[temp0],   %[temp3],    %[temp0]   \n\t"
   1000    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
   1001    "shra_r.ph       %[temp0],   %[temp0],    2          \n\t"
           // Pack row 0.
   1002    "packrl.ph       %[temp3],   %[temp6],    %[temp5]   \n\t"
   1003    "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
   1004    "append          %[temp0],   %[temp5],    16         \n\t"
   1005    "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
   1006    "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
           // Row 2 (temp1), row 3 (temp7), then row 1 from halves of rows 2 and 0.
   1007    "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
   1008    "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
   1009    "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
   1010    "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
   1011    "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
   1012    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1013      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
   1014      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
   1015    : [top]"r"(top), [dst]"r"(dst)
   1016    : "memory"
   1017  );
   1018 }
   1019 
   1020 //------------------------------------------------------------------------------
   1021 // Chroma 8x8 prediction (paragraph 12.2)
   1022 
   1023 static void IntraChromaPreds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
   1024                                       const uint8_t* WEBP_RESTRICT left,
   1025                                       const uint8_t* WEBP_RESTRICT top) {
   1026  // U block
   1027  DCMode8(C8DC8 + dst, left, top);
   1028  VerticalPred8(C8VE8 + dst, top);
   1029  HorizontalPred8(C8HE8 + dst, left);
   1030  TrueMotion8(C8TM8 + dst, left, top);
   1031  // V block
   1032  dst += 8;
   1033  if (top) top += 8;
   1034  if (left) left += 16;
   1035  DCMode8(C8DC8 + dst, left, top);
   1036  VerticalPred8(C8VE8 + dst, top);
   1037  HorizontalPred8(C8HE8 + dst, left);
   1038  TrueMotion8(C8TM8 + dst, left, top);
   1039 }
   1040 
   1041 //------------------------------------------------------------------------------
   1042 // luma 16x16 prediction (paragraph 12.3)
   1043 
   1044 static void Intra16Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
   1045                                   const uint8_t* WEBP_RESTRICT left,
   1046                                   const uint8_t* WEBP_RESTRICT top) {
   1047  DCMode16(I16DC16 + dst, left, top);
   1048  VerticalPred16(I16VE16 + dst, top);
   1049  HorizontalPred16(I16HE16 + dst, left);
   1050  TrueMotion16(I16TM16 + dst, left, top);
   1051 }
   1052 
   1053 // Left samples are top[-5 .. -2], top_left is top[-1], top are
   1054 // located at top[0..3], and top right is top[4..7]
   1055 static void Intra4Preds_MIPSdspR2(uint8_t* WEBP_RESTRICT dst,
   1056                                  const uint8_t* WEBP_RESTRICT top) {
   1057  DC4(I4DC4 + dst, top);
   1058  TM4(I4TM4 + dst, top);
   1059  VE4(I4VE4 + dst, top);
   1060  HE4(I4HE4 + dst, top);
   1061  RD4(I4RD4 + dst, top);
   1062  VR4(I4VR4 + dst, top);
   1063  LD4(I4LD4 + dst, top);
   1064  VL4(I4VL4 + dst, top);
   1065  HD4(I4HD4 + dst, top);
   1066  HU4(I4HU4 + dst, top);
   1067 }
   1068 
   1069 //------------------------------------------------------------------------------
   1070 // Metric
   1071 
   1072 #if !defined(WORK_AROUND_GCC)
    1073 
         // GET_SSE_INNER(A): squared-error step for four bytes.
         // Loads one 32-bit word at byte offset A from both a and b, zero-extends
         // the four bytes of each word to halfwords (preceu.ph.qbr / .qbl),
         // subtracts them pairwise and adds the four squared differences to the
         // $ac0 accumulator (dpa.w.ph of a register with itself).
         // Clobbers temp0..temp3. The caller must clear $ac0 first and read the
         // result back afterwards.
         // NOTE(review): lw is an aligned load, so a + A and b + A are presumably
         // 4-byte aligned -- confirm with the callers.
    1074 #define GET_SSE_INNER(A)                                                  \
    1075  "lw               %[temp0],    " #A "(%[a])                  \n\t"      \
    1076  "lw               %[temp1],    " #A "(%[b])                  \n\t"      \
    1077  "preceu.ph.qbr    %[temp2],    %[temp0]                      \n\t"      \
    1078  "preceu.ph.qbl    %[temp0],    %[temp0]                      \n\t"      \
    1079  "preceu.ph.qbr    %[temp3],    %[temp1]                      \n\t"      \
    1080  "preceu.ph.qbl    %[temp1],    %[temp1]                      \n\t"      \
    1081  "subq.ph          %[temp2],    %[temp2],    %[temp3]         \n\t"      \
    1082  "subq.ph          %[temp0],    %[temp0],    %[temp1]         \n\t"      \
    1083  "dpa.w.ph         $ac0,        %[temp2],    %[temp2]         \n\t"      \
    1084  "dpa.w.ph         $ac0,        %[temp0],    %[temp0]         \n\t"
    1085 
         // GET_SSE(A, B, C, D): GET_SSE_INNER at four byte offsets (16 bytes in all).
         // Going through this wrapper also lets arguments such as BPS be
         // macro-expanded before GET_SSE_INNER stringifies them with #A.
    1086 #define GET_SSE(A, B, C, D)               \
    1087  GET_SSE_INNER(A)                        \
    1088  GET_SSE_INNER(B)                        \
    1089  GET_SSE_INNER(C)                        \
    1090  GET_SSE_INNER(D)
    1091 
         // Sum of squared byte differences between two 16x16 blocks.
         // a, b: top-left bytes of the two blocks; consecutive rows are BPS bytes
         // apart in both. Each GET_SSE line below covers one 16-byte row.
         // Returns the low 32 bits of the accumulated sum.
    1092 static int SSE16x16_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
    1093                              const uint8_t* WEBP_RESTRICT b) {
    1094  int count;
    1095  int temp0, temp1, temp2, temp3;
    1096  __asm__ volatile (
    1097    "mult   $zero,    $zero                            \n\t"  // 0 * 0: clears hi/lo
    1098    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    1099    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    1100    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    1101    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    1102    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    1103    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    1104    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    1105    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    1106    GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
    1107    GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
    1108    GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
    1109    GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
    1110    GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
    1111    GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
    1112    GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
    1113    GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
    1114    "mflo   %[count]                                   \n\t"  // fetch the accumulated sum
    1115    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    1116      [temp3]"=&r"(temp3), [count]"=&r"(count)
    1117    : [a]"r"(a), [b]"r"(b)
    1118    : "memory", "hi", "lo"
    1119  );
    1120  return count;
    1121 }
    1122 
         // Sum of squared byte differences between two blocks that are 16 bytes
         // wide and 8 rows high; rows are BPS bytes apart in both a and b.
         // Returns the low 32 bits of the accumulated sum.
    1123 static int SSE16x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
    1124                             const uint8_t* WEBP_RESTRICT b) {
    1125  int count;
    1126  int temp0, temp1, temp2, temp3;
    1127  __asm__ volatile (
    1128    "mult   $zero,    $zero                            \n\t"  // 0 * 0: clears hi/lo
    1129    GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
    1130    GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
    1131    GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
    1132    GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
    1133    GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
    1134    GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
    1135    GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
    1136    GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
    1137    "mflo   %[count]                                   \n\t"  // fetch the accumulated sum
    1138    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    1139      [temp3]"=&r"(temp3), [count]"=&r"(count)
    1140    : [a]"r"(a), [b]"r"(b)
    1141    : "memory", "hi", "lo"
    1142  );
    1143  return count;
    1144 }
    1145 
         // Sum of squared byte differences between two 8x8 blocks; rows are BPS
         // bytes apart in both a and b. Each GET_SSE line covers two 8-byte rows.
         // Returns the low 32 bits of the accumulated sum.
    1146 static int SSE8x8_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
    1147                            const uint8_t* WEBP_RESTRICT b) {
    1148  int count;
    1149  int temp0, temp1, temp2, temp3;
    1150  __asm__ volatile (
    1151    "mult   $zero,    $zero                            \n\t"  // 0 * 0: clears hi/lo
    1152    GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
    1153    GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
    1154    GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
    1155    GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
    1156    "mflo   %[count]                                   \n\t"  // fetch the accumulated sum
    1157    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    1158      [temp3]"=&r"(temp3), [count]"=&r"(count)
    1159    : [a]"r"(a), [b]"r"(b)
    1160    : "memory", "hi", "lo"
    1161  );
    1162  return count;
    1163 }
    1164 
         // Sum of squared byte differences between two 4x4 blocks; rows are BPS
         // bytes apart in both a and b. The single GET_SSE covers all four rows,
         // one 32-bit word per row.
         // Returns the low 32 bits of the accumulated sum.
    1165 static int SSE4x4_MIPSdspR2(const uint8_t* WEBP_RESTRICT a,
    1166                            const uint8_t* WEBP_RESTRICT b) {
    1167  int count;
    1168  int temp0, temp1, temp2, temp3;
    1169  __asm__ volatile (
    1170    "mult   $zero,    $zero                            \n\t"  // 0 * 0: clears hi/lo
    1171    GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
    1172    "mflo   %[count]                                   \n\t"  // fetch the accumulated sum
    1173    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    1174      [temp3]"=&r"(temp3), [count]"=&r"(count)
    1175    : [a]"r"(a), [b]"r"(b)
    1176    : "memory", "hi", "lo"
    1177  );
    1178  return count;
    1179 }
   1180 
   1181 #undef GET_SSE
   1182 #undef GET_SSE_INNER
   1183 
   1184 #endif  // !WORK_AROUND_GCC
   1185 
   1186 #undef FILL_8_OR_16
   1187 #undef FILL_PART
   1188 #undef OUTPUT_EARLY_CLOBBER_REGS_17
   1189 #undef MUL_HALF
   1190 #undef ABS_X8
   1191 #undef ADD_SUB_HALVES_X4
   1192 
   1193 //------------------------------------------------------------------------------
   1194 // Quantization
   1195 //
   1196 
   1197 // macro for one pass through for loop in QuantizeBlock reading 2 values at time
   1198 // QUANTDIV macro inlined
   1199 // J - offset in bytes (kZigzag[n] * 2)
   1200 // K - offset in bytes (kZigzag[n] * 4)
   1201 // N - offset in bytes (n * 2)
   1202 // N1 - offset in bytes ((n + 1) * 2)
   1203 #define QUANTIZE_ONE(J, K, N, N1)                                         \
   1204  "ulw         %[temp1],     " #J "(%[ppin])                 \n\t"        \
   1205  "ulw         %[temp2],     " #J "(%[ppsharpen])            \n\t"        \
   1206  "lhu         %[temp3],     " #K "(%[ppzthresh])            \n\t"        \
   1207  "lhu         %[temp6],     " #K "+4(%[ppzthresh])          \n\t"        \
   1208  "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
   1209  "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
   1210  "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
   1211  "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
   1212  "li          %[level],     0x10001                         \n\t"        \
   1213  "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
   1214  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
   1215  "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
   1216  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
   1217  "beqz        %[temp5],     0f                              \n\t"        \
   1218  "lhu         %[temp3],     " #J "(%[ppq])                  \n\t"        \
   1219  "beq         %[temp5],     %[level],         1f            \n\t"        \
   1220  "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
   1221  "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
   1222  "beqz        %[temp5],     2f                              \n\t"        \
   1223  "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
   1224  "sh          $0,           " #J "+2(%[ppin])               \n\t"        \
   1225  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
   1226  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
   1227  "sra         %[level],     %[level],         17            \n\t"        \
   1228  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
   1229  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
   1230  "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
   1231  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
   1232  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
   1233  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
   1234  "or          %[ret],       %[ret],           %[level]      \n\t"        \
   1235  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
   1236  "sh          %[temp5],     " #J "(%[ppin])                 \n\t"        \
   1237  "j           3f                                            \n\t"        \
   1238 "2:                                                          \n\t"        \
   1239  "lhu         %[temp1],     " #J "+2(%[ppiq])               \n\t"        \
   1240  "srl         %[temp5],     %[coeff],         16            \n\t"        \
   1241  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
   1242  "lw          %[temp2],     " #K "+4(%[ppbias])             \n\t"        \
   1243  "lhu         %[temp3],     " #J "+2(%[ppq])                \n\t"        \
   1244  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
   1245  "sra         %[level],     %[level],         17            \n\t"        \
   1246  "srl         %[temp6],     %[sign],          16            \n\t"        \
   1247  "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
   1248  "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
   1249  "xor         %[level],     %[level],         %[temp6]      \n\t"        \
   1250  "subu        %[level],     %[level],         %[temp6]      \n\t"        \
   1251  "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
   1252  "sh          $0,           " #J "(%[ppin])                 \n\t"        \
   1253  "sh          $0,           " #N "(%[pout])                 \n\t"        \
   1254  "or          %[ret],       %[ret],           %[level]      \n\t"        \
   1255  "sh          %[temp5],     " #J "+2(%[ppin])               \n\t"        \
   1256  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
   1257  "j           3f                                            \n\t"        \
   1258 "1:                                                          \n\t"        \
   1259  "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
   1260  "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
   1261  "ulw         %[temp3],     " #J "(%[ppq])                  \n\t"        \
   1262  "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
   1263  "srl         %[temp0],     %[coeff],         16            \n\t"        \
   1264  "lhu         %[temp6],     " #J "+2(%[ppiq])               \n\t"        \
   1265  "lw          %[coeff],     " #K "+4(%[ppbias])             \n\t"        \
   1266  "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
   1267  "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
   1268  "addu        %[level],     %[level],         %[temp2]      \n\t"        \
   1269  "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
   1270  "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
   1271  "shra.ph     %[level],     %[level],         1             \n\t"        \
   1272  "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
   1273  "pick.ph     %[level],     %[max_level],     %[level]      \n\t"        \
   1274  "xor         %[level],     %[level],         %[sign]       \n\t"        \
   1275  "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
   1276  "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
   1277  "or          %[ret],       %[ret],           %[level]      \n\t"        \
   1278  "sh          %[level],     " #N "(%[pout])                 \n\t"        \
   1279  "srl         %[level],     %[level],         16            \n\t"        \
   1280  "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
   1281  "usw         %[temp3],     " #J "(%[ppin])                 \n\t"        \
   1282  "j           3f                                            \n\t"        \
   1283 "0:                                                          \n\t"        \
   1284  "sh          $0,           " #N "(%[pout])                 \n\t"        \
   1285  "sh          $0,           " #N1 "(%[pout])                \n\t"        \
   1286  "usw         $0,           " #J "(%[ppin])                 \n\t"        \
   1287 "3:                                                          \n\t"
    1288 
         // Quantizes one block of 16 coefficients.
         // in:  16 input coefficients; every entry is overwritten in place with
         //      either 0 or level * q (see the stores to ppin in QUANTIZE_ONE).
         // out: receives the 16 signed levels at the offsets given by the N / N1
         //      arguments of each QUANTIZE_ONE call.
         // mtx: supplies the sharpen, zthresh, q, iq and bias tables.
         // Returns 1 if at least one level is non-zero, 0 otherwise.
    1289 static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
    1290                                   const VP8Matrix* WEBP_RESTRICT const mtx) {
    1291  int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
    1292  int sign, coeff, level;
    1293  int max_level = MAX_LEVEL;
    1294  int max_level1 = max_level << 16 | max_level;  // MAX_LEVEL in both halfwords
    1295  int ret = 0;  // OR of every level written, tested at the end
    1296 
    1297  int16_t* ppin             = &in[0];
    1298  int16_t* pout             = &out[0];
    1299  const uint16_t* ppsharpen = &mtx->sharpen[0];
    1300  const uint32_t* ppzthresh = &mtx->zthresh[0];
    1301  const uint16_t* ppq       = &mtx->q[0];
    1302  const uint16_t* ppiq      = &mtx->iq[0];
    1303  const uint32_t* ppbias    = &mtx->bias[0];
    1304 
    1305  __asm__ volatile (
         // Each call handles in[j], in[j + 1]; arguments are
         // (2 * j, 4 * j, out offset for in[j], out offset for in[j + 1]).
    1306    QUANTIZE_ONE( 0,  0,  0,  2)
    1307    QUANTIZE_ONE( 4,  8, 10, 12)
    1308    QUANTIZE_ONE( 8, 16,  4,  8)
    1309    QUANTIZE_ONE(12, 24, 14, 24)
    1310    QUANTIZE_ONE(16, 32,  6, 16)
    1311    QUANTIZE_ONE(20, 40, 22, 26)
    1312    QUANTIZE_ONE(24, 48, 18, 20)
    1313    QUANTIZE_ONE(28, 56, 28, 30)
    1314 
    1315    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
    1316      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
    1317      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    1318      [sign]"=&r"(sign), [coeff]"=&r"(coeff),
    1319      [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
    1320    : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
    1321      [ppiq]"r"(ppiq), [max_level]"r"(max_level),
    1322      [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
    1323      [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
    1324    : "memory", "hi", "lo"
    1325  );
    1326 
    1327  return (ret != 0);
    1328 }
   1329 
   1330 static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
   1331                                     const VP8Matrix* WEBP_RESTRICT const mtx) {
   1332  int nz;
   1333  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
   1334  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
   1335  return nz;
   1336 }
   1337 
   1338 #undef QUANTIZE_ONE
   1339 
    1340 // macro for one horizontal pass in FTransformWHT
    1341 // temp0..temp7 holds tmp[0]..tmp[15]
    1342 // A, B, C, D - offset in bytes to load from in buffer
    1343 // TEMP0, TEMP1 - registers for corresponding tmp elements
         // With a, b, c, d the int16 values loaded from offsets A, B, C, D, the macro
         // leaves two 16-bit results in each output register:
         //   TEMP0 = { high: (a - c) + (b - d), low: (a + c) + (b + d) }
         //   TEMP1 = { high: (a + c) - (b + d), low: (a - c) - (b - d) }
         // temp8 and temp9 are scratch registers.
    1344 #define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1)                          \
    1345  "lh              %[" #TEMP0 "],  " #A "(%[in])            \n\t"              \
    1346  "lh              %[" #TEMP1 "],  " #B "(%[in])            \n\t"              \
    1347  "lh              %[temp8],     " #C "(%[in])              \n\t"              \
    1348  "lh              %[temp9],     " #D "(%[in])              \n\t"              \
    1349  "ins             %[" #TEMP1 "],  %[" #TEMP0 "],  16,  16  \n\t"              \
    1350  "ins             %[temp9],     %[temp8],     16,  16      \n\t"              \
    1351  "subq.ph         %[temp8],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
    1352  "addq.ph         %[temp9],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
    1353  "precrq.ph.w     %[" #TEMP0 "],  %[temp8],     %[temp9]   \n\t"              \
    1354  "append          %[temp8],     %[temp9],     16           \n\t"              \
    1355  "subq.ph         %[" #TEMP1 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
    1356  "addq.ph         %[" #TEMP0 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
    1357  "rotr            %[" #TEMP1 "],  %[" #TEMP1 "],  16       \n\t"
   1358 
    1359 // macro for one vertical pass in FTransformWHT
    1360 // temp0..temp7 holds tmp[0]..tmp[15]
    1361 // A, B, C, D - offsets in bytes to store to out buffer
    1362 // TEMP0, TEMP2, TEMP4 and TEMP6 - registers for corresponding tmp elements
         // Works on both halfwords of each register at once (two columns per call).
         // With a0 = TEMP0 + TEMP4, a1 = TEMP2 + TEMP6, a2 = TEMP2 - TEMP6 and
         // a3 = TEMP0 - TEMP4 it stores, using the halving add/sub instructions:
         //   A: (a0 + a1) >> 1    B: (a3 + a2) >> 1
         //   C: (a3 - a2) >> 1    D: (a0 - a1) >> 1
         // The four TEMP registers are overwritten; temp8/temp9 are scratch.
    1363 #define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6)              \
    1364  "addq.ph         %[temp8],     %[" #TEMP0 "],  %[" #TEMP4 "]    \n\t"        \
    1365  "addq.ph         %[temp9],     %[" #TEMP2 "],  %[" #TEMP6 "]    \n\t"        \
    1366  "subq.ph         %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
    1367  "subq.ph         %[" #TEMP6 "],  %[" #TEMP0 "],  %[" #TEMP4 "]  \n\t"        \
    1368  "addqh.ph        %[" #TEMP0 "],  %[temp8],     %[temp9]         \n\t"        \
    1369  "subqh.ph        %[" #TEMP4 "],  %[" #TEMP6 "],  %[" #TEMP2 "]  \n\t"        \
    1370  "addqh.ph        %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
    1371  "subqh.ph        %[" #TEMP6 "],  %[temp8],     %[temp9]         \n\t"        \
    1372  "usw             %[" #TEMP0 "],  " #A "(%[out])                 \n\t"        \
    1373  "usw             %[" #TEMP2 "],  " #B "(%[out])                 \n\t"        \
    1374  "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
    1375  "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
    1376 
         // Forward 4x4 Walsh-Hadamard style transform built from the two pass
         // macros above.
         // in:  16 values are read at byte offsets 0, 32, ..., 480, i.e. every
         //      16th int16_t starting at in[0].
         // out: 16 int16_t results (32 bytes) are written with unaligned stores.
    1377 static void FTransformWHT_MIPSdspR2(const int16_t* WEBP_RESTRICT in,
    1378                                    int16_t* WEBP_RESTRICT out) {
    1379  int temp0, temp1, temp2, temp3, temp4;
    1380  int temp5, temp6, temp7, temp8, temp9;
    1381 
    1382  __asm__ volatile (
         // Horizontal passes: each fills one register pair with four tmp values.
    1383    HORIZONTAL_PASS_WHT(  0,  32,  64,  96, temp0, temp1)
    1384    HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
    1385    HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
    1386    HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
         // Vertical passes: even registers give the first two outputs of every
         // 8-byte output row, odd registers the last two.
    1387    VERTICAL_PASS_WHT(0,  8, 16, 24, temp0, temp2, temp4, temp6)
    1388    VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
    1389    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    1390      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    1391      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    1392      [temp9]"=&r"(temp9)
    1393    : [in]"r"(in), [out]"r"(out)
    1394    : "memory"
    1395  );
    1396 }
   1397 
   1398 #undef VERTICAL_PASS_WHT
   1399 #undef HORIZONTAL_PASS_WHT
   1400 
    1401 // macro for converting coefficients to bin
    1402 // convert 8 coeffs at time
    1403 // A, B, C, D - offsets in bytes to load from out buffer
         // Per 16-bit coefficient v the asm computes, two at a time in .ph form:
         //   bin = |v| >> 3
         //   bin = min(bin, 31)   (shll_s.ph by 10 saturates to 0x7fff for any
         //                         bin >= 32, and the shrl.ph by 10 turns that
         //                         back into 31 while leaving 0..31 unchanged)
         //   off = bin << 2       (byte offset of a 32-bit counter)
         // It then increments the 32-bit counter at dist + off for each of the
         // eight coefficients with a lw / addiu / sw sequence.
         // temp0..temp7 hold the eight addresses, temp8 the counter being bumped.
    1404 #define CONVERT_COEFFS_TO_BIN(A, B, C, D)                                      \
    1405  "ulw        %[temp0],  " #A "(%[out])                \n\t"                   \
    1406  "ulw        %[temp1],  " #B "(%[out])                \n\t"                   \
    1407  "ulw        %[temp2],  " #C "(%[out])                \n\t"                   \
    1408  "ulw        %[temp3],  " #D "(%[out])                \n\t"                   \
    1409  "absq_s.ph  %[temp0],  %[temp0]                      \n\t"                   \
    1410  "absq_s.ph  %[temp1],  %[temp1]                      \n\t"                   \
    1411  "absq_s.ph  %[temp2],  %[temp2]                      \n\t"                   \
    1412  "absq_s.ph  %[temp3],  %[temp3]                      \n\t"                   \
    1413  "shra.ph    %[temp0],  %[temp0],    3                \n\t"                   \
    1414  "shra.ph    %[temp1],  %[temp1],    3                \n\t"                   \
    1415  "shra.ph    %[temp2],  %[temp2],    3                \n\t"                   \
    1416  "shra.ph    %[temp3],  %[temp3],    3                \n\t"                   \
    1417  "shll_s.ph  %[temp0],  %[temp0],    10               \n\t"                   \
    1418  "shll_s.ph  %[temp1],  %[temp1],    10               \n\t"                   \
    1419  "shll_s.ph  %[temp2],  %[temp2],    10               \n\t"                   \
    1420  "shll_s.ph  %[temp3],  %[temp3],    10               \n\t"                   \
    1421  "shrl.ph    %[temp0],  %[temp0],    10               \n\t"                   \
    1422  "shrl.ph    %[temp1],  %[temp1],    10               \n\t"                   \
    1423  "shrl.ph    %[temp2],  %[temp2],    10               \n\t"                   \
    1424  "shrl.ph    %[temp3],  %[temp3],    10               \n\t"                   \
    1425  "shll.ph    %[temp0],  %[temp0],    2                \n\t"                   \
    1426  "shll.ph    %[temp1],  %[temp1],    2                \n\t"                   \
    1427  "shll.ph    %[temp2],  %[temp2],    2                \n\t"                   \
    1428  "shll.ph    %[temp3],  %[temp3],    2                \n\t"                   \
    1429  "ext        %[temp4],  %[temp0],    0,       16      \n\t"                   \
    1430  "ext        %[temp0],  %[temp0],    16,      16      \n\t"                   \
    1431  "addu       %[temp4],  %[temp4],    %[dist]          \n\t"                   \
    1432  "addu       %[temp0],  %[temp0],    %[dist]          \n\t"                   \
    1433  "ext        %[temp5],  %[temp1],    0,       16      \n\t"                   \
    1434  "lw         %[temp8],  0(%[temp4])                   \n\t"                   \
    1435  "ext        %[temp1],  %[temp1],    16,      16      \n\t"                   \
    1436  "addu       %[temp5],  %[temp5],    %[dist]          \n\t"                   \
    1437  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1438  "sw         %[temp8],  0(%[temp4])                   \n\t"                   \
    1439  "lw         %[temp8],  0(%[temp0])                   \n\t"                   \
    1440  "addu       %[temp1],  %[temp1],    %[dist]          \n\t"                   \
    1441  "ext        %[temp6],  %[temp2],    0,       16      \n\t"                   \
    1442  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1443  "sw         %[temp8],  0(%[temp0])                   \n\t"                   \
    1444  "lw         %[temp8],  0(%[temp5])                   \n\t"                   \
    1445  "ext        %[temp2],  %[temp2],    16,      16      \n\t"                   \
    1446  "addu       %[temp6],  %[temp6],    %[dist]          \n\t"                   \
    1447  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1448  "sw         %[temp8],  0(%[temp5])                   \n\t"                   \
    1449  "lw         %[temp8],  0(%[temp1])                   \n\t"                   \
    1450  "addu       %[temp2],  %[temp2],    %[dist]          \n\t"                   \
    1451  "ext        %[temp7],  %[temp3],    0,       16      \n\t"                   \
    1452  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1453  "sw         %[temp8],  0(%[temp1])                   \n\t"                   \
    1454  "lw         %[temp8],  0(%[temp6])                   \n\t"                   \
    1455  "ext        %[temp3],  %[temp3],    16,      16      \n\t"                   \
    1456  "addu       %[temp7],  %[temp7],    %[dist]          \n\t"                   \
    1457  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1458  "sw         %[temp8],  0(%[temp6])                   \n\t"                   \
    1459  "lw         %[temp8],  0(%[temp2])                   \n\t"                   \
    1460  "addu       %[temp3],  %[temp3],    %[dist]          \n\t"                   \
    1461  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1462  "sw         %[temp8],  0(%[temp2])                   \n\t"                   \
    1463  "lw         %[temp8],  0(%[temp7])                   \n\t"                   \
    1464  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1465  "sw         %[temp8],  0(%[temp7])                   \n\t"                   \
    1466  "lw         %[temp8],  0(%[temp3])                   \n\t"                   \
    1467  "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
    1468  "sw         %[temp8],  0(%[temp3])                   \n\t"
   1469 
   1470 static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
   1471                                       int start_block, int end_block,
   1472                                       VP8Histogram* const histo) {
   1473  int j;
   1474  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   1475  const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
   1476  for (j = start_block; j < end_block; ++j) {
   1477    int16_t out[16];
   1478    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   1479 
   1480    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
   1481 
   1482    // Convert coefficients to bin.
   1483    __asm__ volatile (
   1484      CONVERT_COEFFS_TO_BIN( 0,  4,  8, 12)
   1485      CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
   1486      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1487        [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
   1488        [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
   1489      : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
   1490      : "memory"
   1491    );
   1492  }
   1493  VP8SetHistogramData(distribution, histo);
   1494 }
   1495 
   1496 #undef CONVERT_COEFFS_TO_BIN
   1497 
   1498 //------------------------------------------------------------------------------
   1499 // Entry point
   1500 
   1501 extern void VP8EncDspInitMIPSdspR2(void);
   1502 
   1503 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
   1504  VP8FTransform = FTransform_MIPSdspR2;
   1505  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
   1506  VP8ITransform = ITransform_MIPSdspR2;
   1507 
   1508  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
   1509  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
   1510 
   1511  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
   1512  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
   1513  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
   1514 
   1515 #if !defined(WORK_AROUND_GCC)
   1516  VP8SSE16x16 = SSE16x16_MIPSdspR2;
   1517  VP8SSE8x8 = SSE8x8_MIPSdspR2;
   1518  VP8SSE16x8 = SSE16x8_MIPSdspR2;
   1519  VP8SSE4x4 = SSE4x4_MIPSdspR2;
   1520 #endif
   1521 
   1522  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
   1523  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
   1524 
   1525  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
   1526 }
   1527 
    1528 #else  // !WEBP_USE_MIPS_DSP_R2
    1529 
         // NOTE(review): WEBP_DSP_INIT_STUB is not defined in this file; presumably
         // it emits an empty VP8EncDspInitMIPSdspR2() so the symbol still exists
         // when the DSP R2 code above is compiled out -- confirm in src/dsp/dsp.h.
    1530 WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
    1531 
    1532 #endif  // WEBP_USE_MIPS_DSP_R2