tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dec_mips_dsp_r2.c (51089B)


      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MIPS version of dsp functions
     11 //
     12 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
     13 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
     14 
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_MIPS_DSP_R2)
     18 
     19 #include "src/dsp/mips_macro.h"
     20 
     21 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
     22 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
        // kC1/kC2 are handed to TransformOne's inline asm as the [kC1]/[kC2] register
        // inputs. None of the instructions spelled out in TransformOne names them, so
        // they are presumably consumed inside the mips_macro.h helper macros
        // (TODO confirm against that header).
     23 
     24 static void TransformDC(const int16_t* WEBP_RESTRICT in,
     25                        uint8_t* WEBP_RESTRICT dst) {
         // DC-only path: in[0] is the only coefficient read by the instructions
         // below. dst is both read and written through the helper macros, whose
         // bodies live in mips_macro.h and are not visible here.
         //   in  - coefficient buffer; only in[0] is loaded.
         //   dst - destination pixels, accessed in rows 0..3 (offsets use BPS).
     26  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
     27 
     28  __asm__ volatile (
           // Presumably loads one word from each of the four dst rows (TODO confirm
           // the macro's argument layout in mips_macro.h).
     29    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
     30                        0, 0, 0, 0,
     31                        0, 1, 2, 3,
     32                        BPS)
           // temp5 = in[0] + 4; 'ins' copies its low halfword into the high halfword,
           // then 'shra.ph' arithmetically shifts each halfword right by 3.
     33    "lh               %[temp5],  0(%[in])               \n\t"
     34    "addiu            %[temp5],  %[temp5],  4           \n\t"
     35    "ins              %[temp5],  %[temp5],  16, 16      \n\t"
     36    "shra.ph          %[temp5],  %[temp5],  3           \n\t"
     37    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
     38                            temp3, temp1, temp2, temp3, temp4)
           // The same replicated DC value (temp5) is passed in all eight addend slots.
     39    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
     40                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
     41                     dst, 0, 1, 2, 3, BPS)
     42 
     43    OUTPUT_EARLY_CLOBBER_REGS_10()
     44    : [in]"r"(in), [dst]"r"(dst)
           // "memory": the asm touches memory through the in/dst pointer operands.
     45    : "memory"
     46  );
     47 }
     48 
     49 static void TransformAC3(const int16_t* WEBP_RESTRICT in,
     50                         uint8_t* WEBP_RESTRICT dst) {
         // Variant that reads only in[0], in[1] and in[4]. The scalar products are
         // formed in C with WEBP_TRANSFORM_AC3_MUL1/MUL2 (defined outside this
         // file) and then combined with the dst pixels inside the asm block.
         //   in  - coefficient buffer; only indices 0, 1 and 4 are read.
         //   dst - destination pixels, read and rewritten via the helper macros.
         // 'a' carries the +4 bias added to in[0].
     51  const int a = in[0] + 4;
         // c4 is deliberately non-const: it is a read-write ("+&r") asm operand.
     52  int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
     53  const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
     54  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
     55  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
     56  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
     57  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
     58 
     59  __asm__ volatile (
           // Pack d4's low halfword into the high halfword of c4; replv.ph
           // replicates the low halfword of a / d1 / c1 into both halfwords.
     60    "ins              %[c4],      %[d4],     16,       16    \n\t"
     61    "replv.ph         %[temp1],   %[a]                       \n\t"
     62    "replv.ph         %[temp4],   %[d1]                      \n\t"
     63    ADD_SUB_HALVES(temp2, temp3, temp1, c4)
     64    "replv.ph         %[temp5],   %[c1]                      \n\t"
     65    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
     66                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
     67    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
     68                        0, 0, 0, 0,
     69                        0, 1, 2, 3,
     70                        BPS)
     71    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
     72                            temp11, temp17, temp3, temp5, temp11, temp12)
     73    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
     74                          temp4, temp7, temp6, temp10, temp9)
     75    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
     76                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
     77                     temp7, temp6, dst, 0, 1, 2, 3, BPS)
     78 
     79    OUTPUT_EARLY_CLOBBER_REGS_18(),
     80      [c4]"+&r"(c4)
     81    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
     82    : "memory"
     83  );
     84 }
     85 
     86 static void TransformOne(const int16_t* WEBP_RESTRICT in,
     87                         uint8_t* WEBP_RESTRICT dst) {
         // General transform path. Almost all of the arithmetic is hidden in
         // mips_macro.h macros (ADD_SUB_HALVES, MUL_SHIFT_SUM, ...), so only the
         // instructions spelled out here are described.
         //   in  - coefficient buffer; words are loaded with 'ulw' (unaligned
         //         load) at byte offsets 0, 16, 4 and 20; the LOAD_IN_X2 arguments
         //         (8..30) are presumably further byte offsets into in (TODO confirm).
         //   dst - destination pixels, read and rewritten via
         //         LOAD_WITH_OFFSET_X4 / STORE_SAT_SUM_X2.
         // kC1/kC2 are supplied as register inputs and "hi"/"lo" are declared
         // clobbered, presumably for multiplies inside MUL_SHIFT_SUM (TODO confirm).
     88  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
     89  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
     90 
     91  __asm__ volatile (
     92    "ulw              %[temp1],   0(%[in])                 \n\t"
     93    "ulw              %[temp2],   16(%[in])                \n\t"
     94    LOAD_IN_X2(temp5, temp6, 24, 26)
     95    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
     96    LOAD_IN_X2(temp1, temp2, 8, 10)
     97    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
     98                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
     99                  temp13, temp11, temp14, temp12)
    100    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    101    "ulw              %[temp17],  4(%[in])                 \n\t"
    102    "ulw              %[temp18],  20(%[in])                \n\t"
    103    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    104    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    105    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    106    LOAD_IN_X2(temp17, temp18, 12, 14)
    107    LOAD_IN_X2(temp9, temp10, 28, 30)
    108    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
    109                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
    110                  temp15, temp4, temp16, temp17)
    111    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    112    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    113    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
    114 
    115    // horizontal
    116    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    117    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    118    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
           // temp2 = {4, 4}: the constant 4 in both halfwords, added to temp1 and
           // temp6 by the two addq.ph instructions below.
    119    "repl.ph          %[temp2],   0x4                      \n\t"
    120    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    121    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    122    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    123    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    124    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    125    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
    126                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
    127                  temp6, temp17, temp8, temp18)
    128    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
    129                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
    130                  temp18, temp12, temp17, temp16)
    131    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    132    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    133    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
    134                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
    135                   temp6)
    136    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
    137                          temp16, temp11, temp10, temp15, temp14)
    138    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
    139                        0, 0, 0, 0,
    140                        0, 1, 2, 3,
    141                        BPS)
    142    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
    143                            temp11, temp10, temp11, temp14, temp15)
    144    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
    145                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
    146                     dst, 0, 1, 2, 3, BPS)
    147 
    148    OUTPUT_EARLY_CLOBBER_REGS_18()
    149    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
    150    : "memory", "hi", "lo"
    151  );
    152 }
    153 
    154 static void TransformTwo(const int16_t* WEBP_RESTRICT in,
    155                         uint8_t* WEBP_RESTRICT dst, int do_two) {
         // Always transforms the block at (in, dst). When do_two is non-zero it
         // also transforms a second block whose coefficients start 16 int16_t
         // further into 'in' and whose pixels start 4 bytes further into 'dst'.
    156  TransformOne(in, dst);
    157  if (do_two) {
    158    TransformOne(in + 16, dst + 4);
    159  }
    160 }
    161 
    162 static WEBP_INLINE void FilterLoop26(uint8_t* p,
    163                                     int hstride, int vstride, int size,
    164                                     int thresh, int ithresh, int hev_thresh) {
         // Edge filter that examines eight taps p[-4h] .. p[3h] (h = hstride) at
         // each position and rewrites either two or six of them.
         //   p          - address of the first tap at offset 0; advanced by
         //                vstride after each position.
         //   hstride    - byte distance between neighbouring taps.
         //   vstride    - byte distance between successive positions.
         //   size       - number of positions. The loop body runs before size is
         //                tested, so one position is processed even if size <= 0.
         //   thresh     - edge limit, used as thresh2 = 2 * thresh + 1.
         //   ithresh    - limit on each neighbouring-tap difference.
         //   hev_thresh - limit deciding between the 2-tap and 6-tap updates.
         // Final pixel values are fetched from VP8kclip1 (declared outside this
         // chunk), indexed by the unclipped sum; presumably a clipping table.
         // Instructions written with a leading space sit in branch delay slots
         // (.set noreorder) and execute whether or not the branch is taken.
    165  const int thresh2 = 2 * thresh + 1;
    166  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
    167  int temp10, temp11, temp12, temp13, temp14, temp15;
    168 
    169  __asm__ volatile (
    170    ".set      push                                      \n\t"
    171    ".set      noreorder                                 \n\t"
    172  "1:                                                    \n\t"
           // Build the offsets -h, 2h, -2h, 3h, -3h, -4h and load the eight taps:
           // temp11=p[-4h] temp8=p[-3h] temp9=p[-2h] temp10=p[-h]
           // temp7=p[0]    temp12=p[h]  temp13=p[2h] temp14=p[3h]
    173    "negu      %[temp1],  %[hstride]                     \n\t"
    174    "addiu     %[size],   %[size],        -1             \n\t"
    175    "sll       %[temp2],  %[hstride],     1              \n\t"
    176    "sll       %[temp3],  %[temp1],       1              \n\t"
    177    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
    178    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
    179    "lbu       %[temp7],  0(%[p])                        \n\t"
    180    "sll       %[temp6],  %[temp3],       1              \n\t"
    181    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"
    182    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"
    183    "lbux      %[temp10], %[temp1](%[p])                 \n\t"
    184    "lbux      %[temp11], %[temp6](%[p])                 \n\t"
    185    "lbux      %[temp12], %[hstride](%[p])               \n\t"
    186    "lbux      %[temp13], %[temp2](%[p])                 \n\t"
    187    "lbux      %[temp14], %[temp4](%[p])                 \n\t"
           // Skip this position (label 3) when
           // 4 * |p[-h] - p[0]| + |p[-2h] - p[h]| > thresh2.
    188    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"
    189    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"
    190    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
    191    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
    192    "negu      %[temp1],  %[temp1]                       \n\t"
    193    "sll       %[temp3],  %[temp3],       2              \n\t"
    194    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"
    195    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
    196    "sll       %[temp6],  %[temp1],       1              \n\t"
    197    "bgtz      %[temp3],  3f                             \n\t"
    198    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
           // Skip as well when any of the six neighbouring-tap differences
           // exceeds ithresh. Interleaved with these tests: temp2 becomes the
           // saturated (p[-2h] - p[h]) << 24, and temp5 / temp15 become the flags
           // hev_thresh < |p[-2h] - p[-h]| and hev_thresh < |p[h] - p[0]|.
    199    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    200    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
    201    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
    202    "bgtz      %[temp4],  3f                             \n\t"
    203    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
    204    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    205    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    206    "bgtz      %[temp3],  3f                             \n\t"
    207    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
    208    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
    209    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
    210    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    211    "bgtz      %[temp3],  3f                             \n\t"
    212    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
    213    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    214    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"
    215    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    216    "bgtz      %[temp3],  3f                             \n\t"
    217    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
    218    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    219    "sra       %[temp4],  %[temp2],       24             \n\t"
    220    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    221    "bgtz      %[temp3],  3f                             \n\t"
    222    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
    223    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
    224    "absq_s.w  %[temp15], %[temp15]                      \n\t"
    225    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    226    "bgtz      %[temp3],  3f                             \n\t"
    227    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"
           // temp5 = 3 * (p[0] - p[-h]) + (p[-2h] - p[h] clamped to [-128, 127]);
           // temp2 = OR of the two flags. With no flag set, go to label 4.
    228    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"
    229    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"
    230    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
    231    "beqz      %[temp2],  4f                             \n\t"
    232    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
           // Flag set: 2-tap update. temp1 = (temp5 + 4) >> 3 (shra_r.w rounds),
           // temp2 = (temp5 + 3) >> 3, each clamped to [-16, 15] by the
           // shll_s.w / sra pair. p[-h] += temp2 and p[0] -= temp1, both through
           // the VP8kclip1 lookup.
    233    "addiu     %[temp2],  %[temp5],       3              \n\t"
    234    "sra       %[temp2],  %[temp2],       3              \n\t"
    235    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
    236    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
    237    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
    238    "sra       %[temp1],  %[temp1],       27             \n\t"
    239    "sra       %[temp2],  %[temp2],       27             \n\t"
    240    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"
    241    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"
    242    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
    243    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
    244    "sb        %[temp2],  0(%[temp3])                    \n\t"
    245    "j         3f                                        \n\t"
    246    " sb       %[temp1],  0(%[p])                        \n\t"
    247  "4:                                                    \n\t"
           // No flag set: 6-tap update. a = temp5 clamped to [-128, 127]; the
           // offsets (27a + 63) >> 7, (18a + 63) >> 7 and (9a + 63) >> 7 are added
           // to p[-h], p[-2h], p[-3h] and subtracted from p[0], p[h], p[2h]
           // respectively, each result going through the VP8kclip1 lookup.
    248    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
    249    "subu      %[temp14], %[p],           %[hstride]     \n\t"
    250    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
    251    "sra       %[temp6],  %[temp5],       24             \n\t"
    252    "sll       %[temp1],  %[temp6],       3              \n\t"
    253    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
    254    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"
    255    "sll       %[temp3],  %[temp2],       1              \n\t"
    256    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"
    257    "addiu     %[temp2],  %[temp2],       63             \n\t"
    258    "addiu     %[temp3],  %[temp3],       63             \n\t"
    259    "addiu     %[temp4],  %[temp4],       63             \n\t"
    260    "sra       %[temp2],  %[temp2],       7              \n\t"
    261    "sra       %[temp3],  %[temp3],       7              \n\t"
    262    "sra       %[temp4],  %[temp4],       7              \n\t"
    263    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
    264    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
    265    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
    266    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
    267    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
    268    "addu      %[temp10], %[p],           %[hstride]     \n\t"
    269    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
    270    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
    271    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
    272    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
    273    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
    274    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
    275    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
    276    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
    277    "sb        %[temp2],  0(%[temp15])                   \n\t"
    278    "sb        %[temp3],  0(%[temp11])                   \n\t"
    279    "sb        %[temp4],  0(%[temp14])                   \n\t"
    280    "sb        %[temp5],  0(%[p])                        \n\t"
    281    "sb        %[temp6],  0(%[temp10])                   \n\t"
    282    "sb        %[temp8],  0(%[temp12])                   \n\t"
    283  "3:                                                    \n\t"
           // size was already decremented at the top of the iteration; p moves to
           // the next position in the delay slot.
    284    "bgtz      %[size],   1b                             \n\t"
    285    " addu     %[p],      %[p],           %[vstride]     \n\t"
    286    ".set      pop                                       \n\t"
    287    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
    288      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
    289      [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
    290      [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
    291      [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
    292      [size]"+&r"(size), [p]"+&r"(p)
    293    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
    294      [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
    295      [VP8kclip1]"r"(VP8kclip1)
    296    : "memory"
    297  );
    298 }
    299 
    300 static WEBP_INLINE void FilterLoop24(uint8_t* p,
    301                                     int hstride, int vstride, int size,
    302                                     int thresh, int ithresh, int hev_thresh) {
         // Edge filter that examines eight taps p[-4h] .. p[3h] (h = hstride) at
         // each position and rewrites either four or two of them.
         //   p          - address of the tap at offset 0; advanced by vstride
         //                after each position.
         //   hstride    - byte distance between neighbouring taps.
         //   vstride    - byte distance between successive positions.
         //   size       - number of positions. A negative size skips the loop
         //                entirely; size == 0 still processes one position
         //                because the entry test is 'bltz'.
         //   thresh     - edge limit, used as thresh2 = 2 * thresh + 1.
         //   ithresh    - limit on each neighbouring-tap difference.
         //   hev_thresh - limit deciding between the 4-tap and 2-tap updates.
         // The operand names pN/qN hold p[-(N+1)h] and p[N*h] respectively (p0 =
         // p[-h], q0 = p[0], ...); p3 and q3 are later reused as scratch/flags.
         // Final pixel values are fetched from VP8kclip1 (declared outside this
         // chunk), indexed by the unclipped sum; presumably a clipping table.
         // Instructions written with a leading space sit in branch delay slots
         // (.set noreorder) and execute whether or not the branch is taken.
    303  int p0, q0, p1, q1, p2, q2, p3, q3;
    304  int step1, step2, temp1, temp2, temp3, temp4;
    305  uint8_t* pTemp0;
    306  uint8_t* pTemp1;
    307  const int thresh2 = 2 * thresh + 1;
    308 
    309  __asm__ volatile (
    310    ".set      push                                   \n\t"
    311    ".set      noreorder                              \n\t"
    312    "bltz      %[size],    3f                         \n\t"
    313    " nop                                             \n\t"
    314  "2:                                                 \n\t"
           // Load q0, p0, q1, p1 and skip this position (label 0) when
           // 4 * |p0 - q0| + |p1 - q1| > thresh2. temp3 keeps p1 - q1 for label 1.
    315    "negu      %[step1],   %[hstride]                 \n\t"
    316    "lbu       %[q0],      0(%[p])                    \n\t"
    317    "lbux      %[p0],      %[step1](%[p])             \n\t"
    318    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
    319    "lbux      %[q1],      %[hstride](%[p])           \n\t"
    320    "subu      %[temp1],   %[p0],         %[q0]       \n\t"
    321    "lbux      %[p1],      %[step1](%[p])             \n\t"
    322    "addu      %[step2],   %[hstride],    %[hstride]  \n\t"
    323    "absq_s.w  %[temp2],   %[temp1]                   \n\t"
    324    "subu      %[temp3],   %[p1],         %[q1]       \n\t"
    325    "absq_s.w  %[temp4],   %[temp3]                   \n\t"
    326    "sll       %[temp2],   %[temp2],      2           \n\t"
    327    "addu      %[temp2],   %[temp2],      %[temp4]    \n\t"
    328    "subu      %[temp4],   %[temp2],      %[thresh2]  \n\t"
    329    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
    330    "bgtz      %[temp4],   0f                         \n\t"
    331    " lbux     %[p2],      %[step1](%[p])             \n\t"
           // Load the remaining taps and skip when any of the six
           // neighbouring-tap differences exceeds ithresh. Along the way temp1
           // becomes 3 * (q0 - p0) and pTemp0 becomes p - hstride.
    332    "subu      %[step1],   %[step1],      %[hstride]  \n\t"
    333    "lbux      %[q2],      %[step2](%[p])             \n\t"
    334    "lbux      %[p3],      %[step1](%[p])             \n\t"
    335    "subu      %[temp4],   %[p2],         %[p1]       \n\t"
    336    "addu      %[step2],   %[step2],      %[hstride]  \n\t"
    337    "subu      %[temp2],   %[p3],         %[p2]       \n\t"
    338    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
    339    "absq_s.w  %[temp2],   %[temp2]                   \n\t"
    340    "lbux      %[q3],      %[step2](%[p])             \n\t"
    341    "subu      %[temp4],   %[temp4],      %[ithresh]  \n\t"
    342    "negu      %[temp1],   %[temp1]                   \n\t"
    343    "bgtz      %[temp4],   0f                         \n\t"
    344    " subu     %[temp2],   %[temp2],      %[ithresh]  \n\t"
    345    "subu      %[p3],      %[p1],         %[p0]       \n\t"
    346    "bgtz      %[temp2],   0f                         \n\t"
    347    " absq_s.w %[p3],      %[p3]                      \n\t"
    348    "subu      %[temp4],   %[q3],         %[q2]       \n\t"
    349    "subu      %[pTemp0],  %[p],          %[hstride]  \n\t"
    350    "absq_s.w  %[temp4],   %[temp4]                   \n\t"
    351    "subu      %[temp2],   %[p3],         %[ithresh]  \n\t"
    352    "sll       %[step1],   %[temp1],      1           \n\t"
    353    "bgtz      %[temp2],   0f                         \n\t"
    354    " subu     %[temp4],   %[temp4],      %[ithresh]  \n\t"
    355    "subu      %[temp2],   %[q2],         %[q1]       \n\t"
    356    "bgtz      %[temp4],   0f                         \n\t"
    357    " absq_s.w %[temp2],   %[temp2]                   \n\t"
    358    "subu      %[q3],      %[q1],         %[q0]       \n\t"
    359    "absq_s.w  %[q3],      %[q3]                      \n\t"
    360    "subu      %[temp2],   %[temp2],      %[ithresh]  \n\t"
    361    "addu      %[temp1],   %[temp1],      %[step1]    \n\t"
    362    "bgtz      %[temp2],   0f                         \n\t"
    363    " subu     %[temp4],   %[q3],         %[ithresh]  \n\t"
           // p3 / q3 now hold |p1 - p0| / |q1 - q0| and are turned into the flags
           // hev_thresh < |p1 - p0| and hev_thresh < |q1 - q0|; if either is set,
           // go to label 1.
    364    "slt       %[p3],      %[hev_thresh], %[p3]       \n\t"
    365    "bgtz      %[temp4],   0f                         \n\t"
    366    " slt      %[q3],      %[hev_thresh], %[q3]       \n\t"
    367    "or        %[q3],      %[q3],         %[p3]       \n\t"
    368    "bgtz      %[q3],      1f                         \n\t"
    369    " shra_r.w %[temp2],   %[temp1],      3           \n\t"
           // No flag set: 4-tap update with a = 3 * (q0 - p0).
           // temp2 = (a + 4) >> 3 (shra_r.w rounds) and temp1 = (a + 3) >> 3, each
           // clamped to [-16, 15] by the shll_s.w / sra pair;
           // step1 = (temp2 + 1) >> 1.
           // p0 += temp1, p1 += step1, q0 -= temp2, q1 -= step1, each stored
           // through the VP8kclip1 lookup.
    370    "addiu     %[temp1],   %[temp1],      3           \n\t"
    371    "sra       %[temp1],   %[temp1],      3           \n\t"
    372    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
    373    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
    374    "addu      %[pTemp1],  %[p],          %[hstride]  \n\t"
    375    "sra       %[temp2],   %[temp2],      27          \n\t"
    376    "sra       %[temp1],   %[temp1],      27          \n\t"
    377    "addiu     %[step1],   %[temp2],      1           \n\t"
    378    "sra       %[step1],   %[step1],      1           \n\t"
    379    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
    380    "addu      %[p1],      %[p1],         %[step1]    \n\t"
    381    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
    382    "subu      %[q1],      %[q1],         %[step1]    \n\t"
    383    "lbux      %[temp2],   %[p0](%[VP8kclip1])        \n\t"
    384    "lbux      %[temp3],   %[q0](%[VP8kclip1])        \n\t"
    385    "lbux      %[temp4],   %[q1](%[VP8kclip1])        \n\t"
    386    "sb        %[temp2],   0(%[pTemp0])               \n\t"
    387    "lbux      %[temp1],   %[p1](%[VP8kclip1])        \n\t"
    388    "subu      %[pTemp0],  %[pTemp0],    %[hstride]   \n\t"
    389    "sb        %[temp3],   0(%[p])                    \n\t"
    390    "sb        %[temp4],   0(%[pTemp1])               \n\t"
    391    "j         0f                                     \n\t"
    392    " sb       %[temp1],   0(%[pTemp0])               \n\t"
    393  "1:                                                 \n\t"
           // Flag set: 2-tap update with
           // a = 3 * (q0 - p0) + (p1 - q1 clamped to [-128, 127]).
           // Only p0 (+= clamped (a + 3) >> 3) and q0 (-= clamped (a + 4) >> 3)
           // are rewritten.
    394    "shll_s.w  %[temp3],   %[temp3],      24          \n\t"
    395    "sra       %[temp3],   %[temp3],      24          \n\t"
    396    "addu      %[temp1],   %[temp1],      %[temp3]    \n\t"
    397    "shra_r.w  %[temp2],   %[temp1],      3           \n\t"
    398    "addiu     %[temp1],   %[temp1],      3           \n\t"
    399    "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
    400    "sra       %[temp1],   %[temp1],      3           \n\t"
    401    "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
    402    "sra       %[temp2],   %[temp2],      27          \n\t"
    403    "sra       %[temp1],   %[temp1],      27          \n\t"
    404    "addu      %[p0],      %[p0],         %[temp1]    \n\t"
    405    "subu      %[q0],      %[q0],         %[temp2]    \n\t"
    406    "lbux      %[temp1],   %[p0](%[VP8kclip1])        \n\t"
    407    "lbux      %[temp2],   %[q0](%[VP8kclip1])        \n\t"
    408    "sb        %[temp2],   0(%[p])                    \n\t"
    409    "sb        %[temp1],   0(%[pTemp0])               \n\t"
    410  "0:                                                 \n\t"
           // Count down and move p to the next position (delay slot).
    411    "subu      %[size],    %[size],       1           \n\t"
    412    "bgtz      %[size],    2b                         \n\t"
    413    " addu     %[p],       %[p],          %[vstride]  \n\t"
    414  "3:                                                 \n\t"
    415    ".set      pop                                    \n\t"
    416    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
    417      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
    418      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
    419      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
    420      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
    421      [size]"+&r"(size)
    422    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
    423      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
    424      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    425    : "memory"
    426  );
    427 }
    428 
    429 // on macroblock edges
        // VFilter16: runs FilterLoop26 over 16 positions with taps 'stride' bytes
        // apart (hstride = stride) and positions 1 byte apart (vstride = 1).
    430 static void VFilter16(uint8_t* p, int stride,
    431                      int thresh, int ithresh, int hev_thresh) {
    432  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
    433 }
    434 
    435 static void HFilter16(uint8_t* p, int stride,
    436                      int thresh, int ithresh, int hev_thresh) {
         // Runs FilterLoop26 over 16 positions with taps 1 byte apart
         // (hstride = 1) and positions 'stride' bytes apart (vstride = stride).
    437  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
    438 }
    439 
    440 // 8-pixels wide variant, for chroma filtering
        // VFilter8: applies FilterLoop26 to the u plane and then to the v plane,
        // 8 positions each, with taps 'stride' bytes apart and positions 1 byte
        // apart.
    441 static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    442                     int stride, int thresh, int ithresh, int hev_thresh) {
    443  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
    444  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
    445 }
    446 
    447 static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    448                     int stride, int thresh, int ithresh, int hev_thresh) {
         // Applies FilterLoop26 to the u plane and then to the v plane, 8
         // positions each, with taps 1 byte apart and positions 'stride' bytes
         // apart.
    449  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
    450  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
    451 }
    452 
    453 // on three inner edges
        // VFilter16i: calls FilterLoop24 three times, at p + 4 * stride,
        // p + 8 * stride and p + 12 * stride (p is advanced before each call),
        // 16 positions per call with taps 'stride' bytes apart.
    454 static void VFilter16i(uint8_t* p, int stride,
    455                       int thresh, int ithresh, int hev_thresh) {
    456  int k;
    457  for (k = 3; k > 0; --k) {
    458    p += 4 * stride;
    459    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
    460  }
    461 }
    462 
    463 static void HFilter16i(uint8_t* p, int stride,
    464                       int thresh, int ithresh, int hev_thresh) {
         // Calls FilterLoop24 three times, at p + 4, p + 8 and p + 12 (p is
         // advanced before each call), 16 positions per call with taps 1 byte
         // apart and positions 'stride' bytes apart.
    465  int k;
    466  for (k = 3; k > 0; --k) {
    467    p += 4;
    468    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
    469  }
    470 }
    471 
    472 static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    473                      int stride, int thresh, int ithresh, int hev_thresh) {
         // Applies FilterLoop24 once to each plane, starting 4 * stride bytes
         // into u and v, 8 positions each with taps 'stride' bytes apart.
    474  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
    475  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
    476 }
    477 
    478 static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
    479                      int stride, int thresh, int ithresh, int hev_thresh) {
         // Applies FilterLoop24 once to each plane, starting 4 bytes into u and
         // v, 8 positions each with taps 1 byte apart and positions 'stride'
         // bytes apart.
    480  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
    481  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
    482 }
    483 
    484 //------------------------------------------------------------------------------
    485 // Simple In-loop filtering (Paragraph 15.2)
    486 
    487 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
         // Simple filter across 16 consecutive bytes starting at p. For each
         // byte it reads p[-2*stride], p[-stride], p[0] and p[stride] and may
         // rewrite p[-stride] and p[0].
         //   p      - address of the first byte on the row at offset 0.
         //   stride - byte distance between rows.
         //   thresh - edge limit, used as thresh2 = 2 * thresh + 1.
         // Final pixel values are fetched from VP8kclip1 (declared outside this
         // chunk), indexed by the unclipped sum; presumably a clipping table.
         // Instructions written with a leading space sit in branch delay slots
         // (.set noreorder) and execute whether or not the branch is taken.
    488  int i;
    489  const int thresh2 = 2 * thresh + 1;
    490  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
         // p1 tracks p - stride and is the store address for the row above.
    491  uint8_t* p1 = p - stride;
    492  __asm__ volatile (
    493    ".set      push                                      \n\t"
    494    ".set      noreorder                                 \n\t"
    495    "li        %[i],        16                           \n\t"
    496  "0:                                                    \n\t"
           // temp0 = p[-2*stride], temp1 = p[-stride], temp2 = p[0],
           // temp3 = p[stride]. Skip the update (label 1) when
           // 4 * |temp1 - temp2| + |temp0 - temp3| > thresh2. The counter i is
           // decremented in the delay slot, i.e. on both paths.
    497    "negu      %[temp4],    %[stride]                    \n\t"
    498    "sll       %[temp5],    %[temp4],       1            \n\t"
    499    "lbu       %[temp2],    0(%[p])                      \n\t"
    500    "lbux      %[temp3],    %[stride](%[p])              \n\t"
    501    "lbux      %[temp1],    %[temp4](%[p])               \n\t"
    502    "lbux      %[temp0],    %[temp5](%[p])               \n\t"
    503    "subu      %[temp7],    %[temp1],       %[temp2]     \n\t"
    504    "subu      %[temp6],    %[temp0],       %[temp3]     \n\t"
    505    "absq_s.w  %[temp4],    %[temp7]                     \n\t"
    506    "absq_s.w  %[temp5],    %[temp6]                     \n\t"
    507    "sll       %[temp4],    %[temp4],       2            \n\t"
    508    "subu      %[temp5],    %[temp5],       %[thresh2]   \n\t"
    509    "addu      %[temp5],    %[temp4],       %[temp5]     \n\t"
    510    "negu      %[temp8],    %[temp7]                     \n\t"
    511    "bgtz      %[temp5],    1f                           \n\t"
    512    " addiu    %[i],        %[i],           -1           \n\t"
           // a = 3 * (p[0] - p[-stride])
           //     + (p[-2*stride] - p[stride] clamped to [-128, 127]).
           // p[-stride] += (a + 3) >> 3 and p[0] -= (a + 4) >> 3 (shra_r.w
           // rounds), each shift result clamped to [-16, 15] by the
           // shll_s.w / sra pair and the sums stored through the VP8kclip1 lookup.
    513    "sll       %[temp4],    %[temp8],       1            \n\t"
    514    "shll_s.w  %[temp5],    %[temp6],       24           \n\t"
    515    "addu      %[temp3],    %[temp4],       %[temp8]     \n\t"
    516    "sra       %[temp5],    %[temp5],       24           \n\t"
    517    "addu      %[temp3],    %[temp3],       %[temp5]     \n\t"
    518    "addiu     %[temp7],    %[temp3],       3            \n\t"
    519    "sra       %[temp7],    %[temp7],       3            \n\t"
    520    "shra_r.w  %[temp8],    %[temp3],       3            \n\t"
    521    "shll_s.w  %[temp0],    %[temp7],       27           \n\t"
    522    "shll_s.w  %[temp4],    %[temp8],       27           \n\t"
    523    "sra       %[temp0],    %[temp0],       27           \n\t"
    524    "sra       %[temp4],    %[temp4],       27           \n\t"
    525    "addu      %[temp7],    %[temp1],       %[temp0]     \n\t"
    526    "subu      %[temp2],    %[temp2],       %[temp4]     \n\t"
    527    "lbux      %[temp3],    %[temp7](%[VP8kclip1])       \n\t"
    528    "lbux      %[temp4],    %[temp2](%[VP8kclip1])       \n\t"
    529    "sb        %[temp3],    0(%[p1])                     \n\t"
    530    "sb        %[temp4],    0(%[p])                      \n\t"
    531  "1:                                                    \n\t"
           // Advance both row pointers by one byte; loop while i > 0.
    532    "addiu     %[p1],       %[p1],          1            \n\t"
    533    "bgtz      %[i],        0b                           \n\t"
    534    " addiu    %[p],        %[p],           1            \n\t"
    535    " .set     pop                                       \n\t"
    536    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    537      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    538      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    539      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
    540    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    541    : "memory"
    542  );
    543 }
    544 
    // Emits four lbu (load unsigned byte) instructions into an asm template:
    //   TEMP0 = SRC[A + A1 * BPS]
    //   TEMP1 = SRC[B + B1 * BPS]
    //   TEMP2 = SRC[C + C1 * BPS]
    //   TEMP3 = SRC[D + D1 * BPS]
    // A..D are column offsets and A1..D1 are row indices. Each displacement is
    // emitted as the text A+A1*BPS and is folded by the assembler, not by the
    // C compiler. XSTR and BPS are defined outside this block and must be
    // visible wherever the macro is expanded.
    // The last line carries no line continuation, so the macro body ends there
    // no matter what text follows it in the file.
    #define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                         A, A1, B, B1, C, C1, D, D1, SRC)                          \
      "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
      "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
      "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
      "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
    555 
    // Simple loop filter across a vertical edge located between p[-1] and p[0].
    // Processes 16 rows; p advances by 'stride' after each row. On every row the
    // four pixels p[-2], p[-1], p[0], p[1] are read, and p[-1] / p[0] are rewritten
    // only when 4 * |p[-1] - p[0]| + |p[-2] - p[1]| <= 2 * thresh + 1.
    // New pixel values are fetched from the VP8kclip1 table (declared outside this
    // block) with lbux, using the adjusted pixel value as the byte index.
    // The asm runs under .set noreorder, so the instruction written after each
    // branch is its delay slot and executes whether or not the branch is taken.
    556 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
    557  int i;
    558  const int thresh2 = 2 * thresh + 1;
    559  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    560  __asm__ volatile (
    561    ".set      push                                     \n\t"
    562    ".set      noreorder                                \n\t"
    563    "li        %[i],       16                           \n\t"
    564  "0:                                                   \n\t"
           // temp0..temp3 = p[-2], p[-1], p[0], p[1]
    565    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
           // temp7 = p[-1] - p[0], temp6 = p[-2] - p[1]
    566    "subu      %[temp7],    %[temp1],       %[temp2]    \n\t"
    567    "subu      %[temp6],    %[temp0],       %[temp3]    \n\t"
    568    "absq_s.w  %[temp4],    %[temp7]                    \n\t"
    569    "absq_s.w  %[temp5],    %[temp6]                    \n\t"
           // temp5 = 4 * |temp7| + |temp6| - thresh2; temp8 = p[0] - p[-1]
    570    "sll       %[temp4],    %[temp4],       2           \n\t"
    571    "addu      %[temp5],    %[temp4],       %[temp5]    \n\t"
    572    "subu      %[temp5],    %[temp5],       %[thresh2]  \n\t"
    573    "negu      %[temp8],    %[temp7]                    \n\t"
           // Above the threshold: leave this row untouched. The row counter is
           // decremented in the delay slot on both paths.
    574    "bgtz      %[temp5],    1f                          \n\t"
    575    " addiu    %[i],        %[i],           -1          \n\t"
           // temp3 = 3 * temp8 + temp6, where temp6 is first saturated to
           // [-128, 127] by the shll_s.w / sra pair with shift 24.
    576    "sll       %[temp4],    %[temp8],       1           \n\t"
    577    "shll_s.w  %[temp5],    %[temp6],       24          \n\t"
    578    "addu      %[temp3],    %[temp4],       %[temp8]    \n\t"
    579    "sra       %[temp5],    %[temp5],       24          \n\t"
    580    "addu      %[temp3],    %[temp3],       %[temp5]    \n\t"
           // temp7 = (temp3 + 3) >> 3 and temp8 = (temp3 + 4) >> 3 (shra_r.w rounds);
           // both are then saturated to [-16, 15] by shll_s.w / sra with shift 27.
    581    "addiu     %[temp7],    %[temp3],       3           \n\t"
    582    "sra       %[temp7],    %[temp7],       3           \n\t"
    583    "shra_r.w  %[temp8],    %[temp3],       3           \n\t"
    584    "shll_s.w  %[temp0],    %[temp7],       27          \n\t"
    585    "shll_s.w  %[temp4],    %[temp8],       27          \n\t"
    586    "sra       %[temp0],    %[temp0],       27          \n\t"
    587    "sra       %[temp4],    %[temp4],       27          \n\t"
           // p[-1] = VP8kclip1[p[-1] + temp0], p[0] = VP8kclip1[p[0] - temp4]
    588    "addu      %[temp7],    %[temp1],       %[temp0]    \n\t"
    589    "subu      %[temp2],    %[temp2],       %[temp4]    \n\t"
    590    "lbux      %[temp3],    %[temp7](%[VP8kclip1])      \n\t"
    591    "lbux      %[temp4],    %[temp2](%[VP8kclip1])      \n\t"
    592    "sb        %[temp3],    -1(%[p])                    \n\t"
    593    "sb        %[temp4],    0(%[p])                     \n\t"
    594  "1:                                                   \n\t"
           // Loop while rows remain; p += stride executes in the delay slot.
    595    "bgtz      %[i],        0b                          \n\t"
    596    " addu     %[p],        %[p],           %[stride]   \n\t"
    597    ".set      pop                                      \n\t"
    598    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    599      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    600      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    601      [p]"+&r"(p), [i]"=&r"(i)
    602    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    603    : "memory"
    604  );
    605 }
    606 
    // Runs SimpleVFilter16 three times, on the rows located 4, 8 and 12 strides
    // below p, forwarding stride and thresh unchanged.
    static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
      const int edge_pitch = 4 * stride;
      uint8_t* edge = p;
      int n;
      for (n = 0; n < 3; ++n) {
        edge += edge_pitch;
        SimpleVFilter16(edge, stride, thresh);
      }
    }
    614 
    // Runs SimpleHFilter16 three times, on the columns located 4, 8 and 12 bytes
    // to the right of p, forwarding stride and thresh unchanged.
    static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
      uint8_t* edge = p;
      int n;
      for (n = 0; n < 3; ++n) {
        edge += 4;
        SimpleHFilter16(edge, stride, thresh);
      }
    }
    622 
    623 // Emits two usw (unaligned 32-bit store) instructions, 8 bytes in total:
    624 //   DST[A * BPS] = TEMP0  and  DST[B + C * BPS] = TEMP1
    // A and C are row indices and B is a byte offset within row C. The
    // displacement text is evaluated by the assembler. XSTR and BPS are defined
    // outside this block.
    625 #define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
    626  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
    627  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
    628 
    // 4x4 vertical predictor. Reads six bytes of the row above dst (top[-1] to
    // top[4], where top = dst - BPS), smooths them with a three-tap sum of the
    // form a + 2 * b + c, rounds with (x + 2) >> 2 and writes the resulting four
    // bytes to each of the four rows of dst.
    629 static void VE4(uint8_t* dst) {    // vertical
    630  const uint8_t* top = dst - BPS;
    631  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
    632  __asm__ volatile (
           // temp0 = four bytes starting at top[-1], temp1 = two bytes at top[3]
    633    "ulw             %[temp0],   -1(%[top])              \n\t"
    634    "ulh             %[temp1],   3(%[top])               \n\t"
           // Widen the bytes to 16-bit lanes and build the shifted neighbour pairs.
    635    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    636    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    637    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
    638    "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    639    "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
           // Double the middle tap, add the two outer taps, then round-shift by 2.
    640    "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    641    "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    642    "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
    643    "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
    644    "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
    645    "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
    646    "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    647    "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
           // Pack the four 16-bit results back into one word of four bytes.
    648    "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
           // The same word is stored to rows 0, 1, 2 and 3.
    649    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
    650    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
    651    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    652      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    653      [temp6]"=&r"(temp6)
    654    : [top]"r"(top), [dst]"r"(dst)
    655    : "memory"
    656  );
    657 }
    658 
    // 4x4 DC predictor. Sums the four bytes of the row above (starting at
    // dst - BPS) and the four bytes of the column to the left (dst[-1 + k * BPS],
    // k = 0..3), computes (sum + 4) >> 3 and fills all 16 pixels of dst with it.
    659 static void DC4(uint8_t* dst) {   // DC
    660  int temp0, temp1, temp2, temp3, temp4;
    661  __asm__ volatile (
           // temp0 = the four top bytes; temp1..temp4 = the four left bytes
    662    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
    663    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
           // Pack the left bytes into temp1 so that raddu.w.qb can sum them at once.
    664    "ins          %[temp1],   %[temp2],    8,     8    \n\t"
    665    "ins          %[temp1],   %[temp3],    16,    8    \n\t"
    666    "ins          %[temp1],   %[temp4],    24,    8    \n\t"
    667    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"
    668    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"
    669    "addu         %[temp0],   %[temp0],    %[temp1]    \n\t"
           // Rounded shift by 3, then replicate the low byte into all four bytes.
    670    "shra_r.w     %[temp0],   %[temp0],    3           \n\t"
    671    "replv.qb     %[temp0],   %[temp0]                 \n\t"
    672    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
    673    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
    674    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    675      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
    676    : [dst]"r"(dst)
    677    : "memory"
    678  );
    679 }
    680 
    // 4x4 down-right predictor.
    // Inputs read: the left column dst[-1 + k * BPS] for k = 0..3, four bytes
    // starting at dst[-1 - BPS] (the top-left corner and the next three bytes of
    // the row above), and the single byte dst[3 - BPS].
    // Outputs: rows 3 and 1 are stored first; each of those words is then shifted
    // by one byte with prepend, taking in one new byte, and stored to rows 2 and 0.
    // NOTE(review): the shll.ph by 1 / addq.ph / shra_r.ph by 2 pattern looks like
    // the usual (a + 2 * b + c + 2) >> 2 smoothing along the diagonal - confirm
    // against the C reference implementation.
    681 static void RD4(uint8_t* dst) {   // Down-right
    682  int temp0, temp1, temp2, temp3, temp4;
    683  int temp5, temp6, temp7, temp8;
    684  __asm__ volatile (
    685    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    686    "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
    687    "ins            %[temp1],   %[temp0], 16, 16               \n\t"
    688    "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
    689    "ins            %[temp2],   %[temp1], 16, 16               \n\t"
    690    "preceu.ph.qbl  %[temp4],   %[temp7]                       \n\t"
    691    "ins            %[temp3],   %[temp2], 16, 16               \n\t"
    692    "shll.ph        %[temp2],   %[temp2], 1                    \n\t"
    693    "addq.ph        %[temp3],   %[temp3], %[temp1]             \n\t"
    694    "packrl.ph      %[temp6],   %[temp5], %[temp1]             \n\t"
    695    "addq.ph        %[temp3],   %[temp3], %[temp2]             \n\t"
    696    "addq.ph        %[temp1],   %[temp1], %[temp5]             \n\t"
    697    "shll.ph        %[temp6],   %[temp6], 1                    \n\t"
    698    "addq.ph        %[temp1],   %[temp1], %[temp6]             \n\t"
    699    "packrl.ph      %[temp0],   %[temp4], %[temp5]             \n\t"
    700    "addq.ph        %[temp8],   %[temp5], %[temp4]             \n\t"
    701    "shra_r.ph      %[temp3],   %[temp3], 2                    \n\t"
    702    "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
    703    "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
    704    "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
    705    "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
    706    "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
    707    "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
    708    "ins            %[temp7],   %[temp5], 0,  8                \n\t"
    709    "precr.qb.ph    %[temp2],   %[temp1], %[temp3]             \n\t"
           // temp4 = (sum of the four bytes now in temp7 + 2) >> 2
    710    "raddu.w.qb     %[temp4],   %[temp7]                       \n\t"
    711    "precr.qb.ph    %[temp6],   %[temp8], %[temp1]             \n\t"
    712    "shra_r.w       %[temp4],   %[temp4], 2                    \n\t"
           // temp2 goes to row 3, temp6 goes to row 1
    713    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
           // Shift each row word by one byte, then store to rows 2 and 0.
    714    "prepend        %[temp2],   %[temp8], 8                    \n\t"
    715    "prepend        %[temp6],   %[temp4], 8                    \n\t"
    716    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
    717    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    718      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    719      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    720    : [dst]"r"(dst)
    721    : "memory"
    722  );
    723 }
    724 
    725 // Emits two ulw (unaligned 32-bit load) instructions, 8 bytes in total:
    726 //   TEMP0 = SRC[A * BPS]  and  TEMP1 = SRC[B + C * BPS]
    // A and C are row indices and B is a byte offset within row C. The
    // displacement text is evaluated by the assembler. XSTR and BPS are defined
    // outside this block.
    727 #define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
    728  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
    729  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
    730 
    // 4x4 down-left predictor.
    // Inputs read: eight bytes of the row above, as two words at dst - BPS and
    // dst - BPS + 4.
    // Outputs: rows 0 and 2 are stored first; each of those words is then shifted
    // by one byte with prepend, taking in one new byte, and stored to rows 1 and 3.
    // NOTE(review): the shll.ph by 1 / addq.ph / shra_r.ph by 2 pattern looks like
    // the usual (a + 2 * b + c + 2) >> 2 smoothing - confirm against the C
    // reference implementation.
    731 static void LD4(uint8_t* dst) {   // Down-Left
    732  int temp0, temp1, temp2, temp3, temp4;
    733  int temp5, temp6, temp7, temp8, temp9;
    734  __asm__ volatile (
    735    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    736    "preceu.ph.qbl   %[temp2],    %[temp0]                     \n\t"
    737    "preceu.ph.qbr   %[temp3],    %[temp0]                     \n\t"
    738    "preceu.ph.qbr   %[temp4],    %[temp1]                     \n\t"
    739    "preceu.ph.qbl   %[temp5],    %[temp1]                     \n\t"
    740    "packrl.ph       %[temp6],    %[temp2],    %[temp3]        \n\t"
    741    "packrl.ph       %[temp7],    %[temp4],    %[temp2]        \n\t"
    742    "packrl.ph       %[temp8],    %[temp5],    %[temp4]        \n\t"
    743    "shll.ph         %[temp6],    %[temp6],    1               \n\t"
    744    "addq.ph         %[temp9],    %[temp2],    %[temp6]        \n\t"
    745    "shll.ph         %[temp7],    %[temp7],    1               \n\t"
    746    "addq.ph         %[temp9],    %[temp9],    %[temp3]        \n\t"
    747    "shll.ph         %[temp8],    %[temp8],    1               \n\t"
    748    "shra_r.ph       %[temp9],    %[temp9],    2               \n\t"
    749    "addq.ph         %[temp3],    %[temp4],    %[temp7]        \n\t"
    750    "addq.ph         %[temp0],    %[temp5],    %[temp8]        \n\t"
    751    "addq.ph         %[temp3],    %[temp3],    %[temp2]        \n\t"
    752    "addq.ph         %[temp0],    %[temp0],    %[temp4]        \n\t"
    753    "shra_r.ph       %[temp3],    %[temp3],    2               \n\t"
    754    "shra_r.ph       %[temp0],    %[temp0],    2               \n\t"
           // temp1 = (2 * top byte of the second word + byte sum of temp5 + 2) >> 2
    755    "srl             %[temp1],    %[temp1],    24              \n\t"
    756    "sll             %[temp1],    %[temp1],    1               \n\t"
    757    "raddu.w.qb      %[temp5],    %[temp5]                     \n\t"
    758    "precr.qb.ph     %[temp9],    %[temp3],    %[temp9]        \n\t"
    759    "precr.qb.ph     %[temp3],    %[temp0],    %[temp3]        \n\t"
    760    "addu            %[temp1],    %[temp1],    %[temp5]        \n\t"
    761    "shra_r.w        %[temp1],    %[temp1],    2               \n\t"
           // temp9 goes to row 0, temp3 goes to row 2
    762    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
           // Shift each row word by one byte, then store to rows 1 and 3.
    763    "prepend         %[temp9],    %[temp0],    8               \n\t"
    764    "prepend         %[temp3],    %[temp1],    8               \n\t"
    765    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
    766    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    767      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    768      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    769      [temp9]"=&r"(temp9)
    770    : [dst]"r"(dst)
    771    : "memory"
    772  );
    773 }
    774 
    775 //------------------------------------------------------------------------------
    776 // Chroma
    777 
    // 8x8 DC predictor. Sums the eight bytes of the row above (two words at
    // dst - BPS and dst - BPS + 4) and the eight bytes of the left column
    // (dst[-1 + k * BPS], k = 0..7), computes (sum + 8) >> 4 and fills all eight
    // rows of eight pixels with that value (two 32-bit stores per row).
    778 static void DC8uv(uint8_t* dst) {     // DC
    779  int temp0, temp1, temp2, temp3, temp4;
    780  int temp5, temp6, temp7, temp8, temp9;
    781  __asm__ volatile (
           // temp0/temp1 = top row words; temp2..temp9 = left column bytes
    782    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    783    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    784    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
           // raddu.w.qb sums the four bytes of a word; the rest is an adder tree.
    785    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
    786    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
    787    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
    788    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
    789    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
    790    "addu         %[temp8],   %[temp8],    %[temp9]      \n\t"
    791    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
    792    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
    793    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
    794    "addu         %[temp0],   %[temp0],    %[temp2]      \n\t"
    795    "addu         %[temp0],   %[temp0],    %[temp6]      \n\t"
           // Rounded shift by 4, then replicate the low byte into all four bytes.
    796    "shra_r.w     %[temp0],   %[temp0],    4             \n\t"
    797    "replv.qb     %[temp0],   %[temp0]                   \n\t"
    798    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    799    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    800    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    801    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    802    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    803    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    804    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    805    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    806    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    807      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    808      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    809      [temp9]"=&r"(temp9)
    810    : [dst]"r"(dst)
    811    : "memory"
    812  );
    813 }
    814 
    // 8x8 DC predictor that uses only the row above. Sums the eight bytes at
    // dst - BPS .. dst - BPS + 7, computes (sum + 4) >> 3 and fills all eight rows
    // of eight pixels with that value. The left column is never read.
    815 static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
    816  int temp0, temp1;
    817  __asm__ volatile (
    818    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    819    "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
    820    "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
    821    "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
           // Rounded shift by 3, then replicate the low byte into all four bytes.
    822    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
    823    "replv.qb     %[temp0],   %[temp0]                   \n\t"
    824    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    825    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    826    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    827    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    828    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    829    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    830    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    831    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    832    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    833    : [dst]"r"(dst)
    834    : "memory"
    835  );
    836 }
    837 
    // 8x8 DC predictor that uses only the left column. Sums the eight bytes
    // dst[-1 + k * BPS], k = 0..7, computes (sum + 4) >> 3 and fills all eight
    // rows of eight pixels with that value. The row above is never read.
    838 static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
    839  int temp0, temp1, temp2, temp3, temp4;
    840  int temp5, temp6, temp7, temp8;
    841  __asm__ volatile (
           // temp1 serves as the eighth left-column byte (row 7).
    842    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    843    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    844    "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
    845    "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
    846    "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
    847    "addu         %[temp8],   %[temp8],    %[temp1]      \n\t"
    848    "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
    849    "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
    850    "addu         %[temp0],   %[temp6],    %[temp2]      \n\t"
           // Rounded shift by 3, then replicate the low byte into all four bytes.
    851    "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
    852    "replv.qb     %[temp0],   %[temp0]                   \n\t"
    853    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    854    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    855    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    856    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    857    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    858    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    859    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    860    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    861    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    862      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    863      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    864    : [dst]"r"(dst)
    865    : "memory"
    866  );
    867 }
    868 
    869 #undef LOAD_8_BYTES
    870 #undef STORE_8_BYTES
    871 #undef LOAD_4_BYTES
    872 
    // Asm fragment used by CLIP_8B_TO_DST. On entry temp0 holds four packed
    // bytes (temp1 holds four more when SIZE == 8) and dst_1 holds a 16-bit
    // delta in each halfword. The fragment:
    //   1. widens every byte to a 16-bit lane (preceu.ph.qbl / preceu.ph.qbr),
    //   2. adds dst_1 to each lane (addu.ph),
    //   3. packs the lanes back to bytes via a saturating left shift by 7
    //      (shll_s.ph) followed by precrqu_s.qb.ph, which saturates to unsigned
    //      bytes.
    // Results are left in temp0 (and temp1); temp2 and temp3 are scratch.
    // The .if / .endif lines are assembler directives, so the SIZE == 8 half is
    // dropped at assembly time when SIZE is not 8.
    873 #define CLIPPING(SIZE)                                                         \
    874  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
    875  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
    876 ".if " #SIZE " == 8                                      \n\t"                 \
    877  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
    878  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
    879 ".endif                                                  \n\t"                 \
    880  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
    881  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
    882 ".if " #SIZE " == 8                                      \n\t"                 \
    883  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
    884  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
    885 ".endif                                                  \n\t"                 \
    886  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
    887  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
    888 ".if " #SIZE " == 8                                      \n\t"                 \
    889  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
    890  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
    891 ".endif                                                  \n\t"                 \
    892  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
    893 ".if " #SIZE " == 8                                      \n\t"                 \
    894  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
    895 ".endif                                                  \n\t"
    896 
    897 
    // Writes one row of SIZE predicted pixels (SIZE is 4, 8 or 16) to DST.
    // dst_1 starts as DST[-1] replicated in both 16-bit halves; subu.ph then
    // subtracts top_1 from it. top_1 is not a macro parameter: a variable of that
    // name must be in scope at the expansion site (CLIP_TO_DST provides it).
    // Each group of bytes loaded from TOP is run through CLIPPING, which adds the
    // resulting delta and saturates, and is then stored at the same offset in DST.
    // SIZE is pasted into assembler .if directives: below 8 a single 4-byte group
    // is handled, otherwise 8 bytes, plus 8 more when SIZE == 16.
    898 #define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
    899  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
    900  int temp0, temp1, temp2, temp3;                                              \
    901  __asm__ volatile (                                                           \
    902  ".if " #SIZE " < 8                                     \n\t"                 \
    903    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    904    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
    905    CLIPPING(4)                                                                \
    906    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    907  ".else                                                 \n\t"                 \
    908    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    909    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    910    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
    911    CLIPPING(8)                                                                \
    912    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    913    "usw             %[temp1],   4(%[dst])               \n\t"                 \
    914  ".if " #SIZE " == 16                                   \n\t"                 \
    915    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    916    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    917    CLIPPING(8)                                                                \
    918    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    919    "usw             %[temp1],   12(%[dst])              \n\t"                 \
    920  ".endif                                                \n\t"                 \
    921  ".endif                                                \n\t"                 \
    922    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
    923      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    924    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
    925    : "memory"                                                                 \
    926  );                                                                           \
    927 } while (0)
    928 
    // Fills SIZE rows of a block starting at DST. Computes top = DST - BPS and
    // top_1 (top[-1] replicated in both 16-bit halves) once, then expands
    // CLIP_8B_TO_DST for each row. Every row reads the same top row, while DST is
    // advanced by BPS after each row, so the DST argument must be a modifiable
    // pointer variable and is left pointing SIZE rows further down.
    929 #define CLIP_TO_DST(DST, SIZE) do {                                            \
    930  int y;                                                                       \
    931  const uint8_t* top = (DST) - BPS;                                            \
    932  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
    933  for (y = 0; y < (SIZE); ++y) {                                               \
    934    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    935    (DST) += BPS;                                                              \
    936  }                                                                            \
    937 } while (0)
    938 
    // Defines a function static void TrueMotion<SIZE>(uint8_t* DST) whose whole
    // body is CLIP_TO_DST(DST, SIZE). The three expansions that follow create
    // TrueMotion4, TrueMotion8 and TrueMotion16.
    939 #define TRUE_MOTION(DST, SIZE)                                                 \
    940 static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
    941  CLIP_TO_DST((DST), (SIZE));                                                  \
    942 }
    943 
    944 TRUE_MOTION(dst, 4)
    945 TRUE_MOTION(dst, 8)
    946 TRUE_MOTION(dst, 16)
    947 
    948 #undef TRUE_MOTION
    949 #undef CLIP_TO_DST
    950 #undef CLIP_8B_TO_DST
    951 #undef CLIPPING
    952 
    953 //------------------------------------------------------------------------------
    954 // Entry point
    955 
    956 extern void VP8DspInitMIPSdspR2(void);
    957 
    958 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
    959  VP8TransformDC = TransformDC;
    960  VP8TransformAC3 = TransformAC3;
    961  VP8Transform = TransformTwo;
    962 
    963  VP8VFilter16 = VFilter16;
    964  VP8HFilter16 = HFilter16;
    965  VP8VFilter8 = VFilter8;
    966  VP8HFilter8 = HFilter8;
    967  VP8VFilter16i = VFilter16i;
    968  VP8HFilter16i = HFilter16i;
    969  VP8VFilter8i = VFilter8i;
    970  VP8HFilter8i = HFilter8i;
    971  VP8SimpleVFilter16 = SimpleVFilter16;
    972  VP8SimpleHFilter16 = SimpleHFilter16;
    973  VP8SimpleVFilter16i = SimpleVFilter16i;
    974  VP8SimpleHFilter16i = SimpleHFilter16i;
    975 
    976  VP8PredLuma4[0] = DC4;
    977  VP8PredLuma4[1] = TrueMotion4;
    978  VP8PredLuma4[2] = VE4;
    979  VP8PredLuma4[4] = RD4;
    980  VP8PredLuma4[6] = LD4;
    981 
    982  VP8PredChroma8[0] = DC8uv;
    983  VP8PredChroma8[1] = TrueMotion8;
    984  VP8PredChroma8[4] = DC8uvNoTop;
    985  VP8PredChroma8[5] = DC8uvNoLeft;
    986 
    987  VP8PredLuma16[1] = TrueMotion16;
    988 }
    989 
    990 #else  // !WEBP_USE_MIPS_DSP_R2
    991 
    // Fallback for builds where WEBP_USE_MIPS_DSP_R2 is not defined.
    // NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; it presumably
    // supplies an empty VP8DspInitMIPSdspR2 so that callers still link - confirm.
    992 WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
    993 
    994 #endif  // WEBP_USE_MIPS_DSP_R2