dec_mips_dsp_r2.c (51089B)
1 // Copyright 2014 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MIPS version of dsp functions 11 // 12 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) 13 // Jovan Zelincevic (jovan.zelincevic@imgtec.com) 14 15 #include "src/dsp/dsp.h" 16 17 #if defined(WEBP_USE_MIPS_DSP_R2) 18 19 #include "src/dsp/mips_macro.h" 20 21 static const int kC1 = WEBP_TRANSFORM_AC3_C1; 22 static const int kC2 = WEBP_TRANSFORM_AC3_C2; 23 24 static void TransformDC(const int16_t* WEBP_RESTRICT in, 25 uint8_t* WEBP_RESTRICT dst) { 26 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10; 27 28 __asm__ volatile ( 29 LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst, 30 0, 0, 0, 0, 31 0, 1, 2, 3, 32 BPS) 33 "lh %[temp5], 0(%[in]) \n\t" 34 "addiu %[temp5], %[temp5], 4 \n\t" 35 "ins %[temp5], %[temp5], 16, 16 \n\t" 36 "shra.ph %[temp5], %[temp5], 3 \n\t" 37 CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2, 38 temp3, temp1, temp2, temp3, temp4) 39 STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3, 40 temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5, 41 dst, 0, 1, 2, 3, BPS) 42 43 OUTPUT_EARLY_CLOBBER_REGS_10() 44 : [in]"r"(in), [dst]"r"(dst) 45 : "memory" 46 ); 47 } 48 49 static void TransformAC3(const int16_t* WEBP_RESTRICT in, 50 uint8_t* WEBP_RESTRICT dst) { 51 const int a = in[0] + 4; 52 int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]); 53 const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]); 54 const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]); 55 const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]); 56 int temp1, temp2, 
temp3, temp4, temp5, temp6, temp7, temp8, temp9; 57 int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; 58 59 __asm__ volatile ( 60 "ins %[c4], %[d4], 16, 16 \n\t" 61 "replv.ph %[temp1], %[a] \n\t" 62 "replv.ph %[temp4], %[d1] \n\t" 63 ADD_SUB_HALVES(temp2, temp3, temp1, c4) 64 "replv.ph %[temp5], %[c1] \n\t" 65 SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4, 66 temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5) 67 LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst, 68 0, 0, 0, 0, 69 0, 1, 2, 3, 70 BPS) 71 CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16, 72 temp11, temp17, temp3, temp5, temp11, temp12) 73 PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2, 74 temp4, temp7, temp6, temp10, temp9) 75 STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11, 76 temp17, temp12, temp18, temp1, temp8, temp2, temp4, 77 temp7, temp6, dst, 0, 1, 2, 3, BPS) 78 79 OUTPUT_EARLY_CLOBBER_REGS_18(), 80 [c4]"+&r"(c4) 81 : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1) 82 : "memory" 83 ); 84 } 85 86 static void TransformOne(const int16_t* WEBP_RESTRICT in, 87 uint8_t* WEBP_RESTRICT dst) { 88 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; 89 int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18; 90 91 __asm__ volatile ( 92 "ulw %[temp1], 0(%[in]) \n\t" 93 "ulw %[temp2], 16(%[in]) \n\t" 94 LOAD_IN_X2(temp5, temp6, 24, 26) 95 ADD_SUB_HALVES(temp3, temp4, temp1, temp2) 96 LOAD_IN_X2(temp1, temp2, 8, 10) 97 MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, 98 temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6, 99 temp13, temp11, temp14, temp12) 100 INSERT_HALF_X2(temp8, temp7, temp10, temp9) 101 "ulw %[temp17], 4(%[in]) \n\t" 102 "ulw %[temp18], 20(%[in]) \n\t" 103 ADD_SUB_HALVES(temp1, temp2, temp3, temp8) 104 ADD_SUB_HALVES(temp5, temp6, temp4, temp7) 105 ADD_SUB_HALVES(temp7, temp8, 
temp17, temp18) 106 LOAD_IN_X2(temp17, temp18, 12, 14) 107 LOAD_IN_X2(temp9, temp10, 28, 30) 108 MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17, 109 temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10, 110 temp15, temp4, temp16, temp17) 111 INSERT_HALF_X2(temp11, temp12, temp13, temp14) 112 ADD_SUB_HALVES(temp17, temp8, temp8, temp11) 113 ADD_SUB_HALVES(temp3, temp4, temp7, temp12) 114 115 // horizontal 116 SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6) 117 INSERT_HALF_X2(temp1, temp6, temp5, temp2) 118 SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8) 119 "repl.ph %[temp2], 0x4 \n\t" 120 INSERT_HALF_X2(temp3, temp8, temp17, temp4) 121 "addq.ph %[temp1], %[temp1], %[temp2] \n\t" 122 "addq.ph %[temp6], %[temp6], %[temp2] \n\t" 123 ADD_SUB_HALVES(temp2, temp4, temp1, temp3) 124 ADD_SUB_HALVES(temp5, temp7, temp6, temp8) 125 MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18, 126 temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15, 127 temp6, temp17, temp8, temp18) 128 MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16, 129 temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14, 130 temp18, temp12, temp17, temp16) 131 INSERT_HALF_X2(temp1, temp3, temp9, temp13) 132 INSERT_HALF_X2(temp6, temp8, temp11, temp15) 133 SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15, 134 temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8, 135 temp6) 136 PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13, 137 temp16, temp11, temp10, temp15, temp14) 138 LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst, 139 0, 0, 0, 0, 140 0, 1, 2, 3, 141 BPS) 142 CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10, 143 temp11, temp10, temp11, temp14, temp15) 144 STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11, 145 temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4, 
146 dst, 0, 1, 2, 3, BPS) 147 148 OUTPUT_EARLY_CLOBBER_REGS_18() 149 : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2) 150 : "memory", "hi", "lo" 151 ); 152 } 153 154 static void TransformTwo(const int16_t* WEBP_RESTRICT in, 155 uint8_t* WEBP_RESTRICT dst, int do_two) { 156 TransformOne(in, dst); 157 if (do_two) { 158 TransformOne(in + 16, dst + 4); 159 } 160 } 161 162 static WEBP_INLINE void FilterLoop26(uint8_t* p, 163 int hstride, int vstride, int size, 164 int thresh, int ithresh, int hev_thresh) { 165 const int thresh2 = 2 * thresh + 1; 166 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9; 167 int temp10, temp11, temp12, temp13, temp14, temp15; 168 169 __asm__ volatile ( 170 ".set push \n\t" 171 ".set noreorder \n\t" 172 "1: \n\t" 173 "negu %[temp1], %[hstride] \n\t" 174 "addiu %[size], %[size], -1 \n\t" 175 "sll %[temp2], %[hstride], 1 \n\t" 176 "sll %[temp3], %[temp1], 1 \n\t" 177 "addu %[temp4], %[temp2], %[hstride] \n\t" 178 "addu %[temp5], %[temp3], %[temp1] \n\t" 179 "lbu %[temp7], 0(%[p]) \n\t" 180 "sll %[temp6], %[temp3], 1 \n\t" 181 "lbux %[temp8], %[temp5](%[p]) \n\t" 182 "lbux %[temp9], %[temp3](%[p]) \n\t" 183 "lbux %[temp10], %[temp1](%[p]) \n\t" 184 "lbux %[temp11], %[temp6](%[p]) \n\t" 185 "lbux %[temp12], %[hstride](%[p]) \n\t" 186 "lbux %[temp13], %[temp2](%[p]) \n\t" 187 "lbux %[temp14], %[temp4](%[p]) \n\t" 188 "subu %[temp1], %[temp10], %[temp7] \n\t" 189 "subu %[temp2], %[temp9], %[temp12] \n\t" 190 "absq_s.w %[temp3], %[temp1] \n\t" 191 "absq_s.w %[temp4], %[temp2] \n\t" 192 "negu %[temp1], %[temp1] \n\t" 193 "sll %[temp3], %[temp3], 2 \n\t" 194 "addu %[temp15], %[temp3], %[temp4] \n\t" 195 "subu %[temp3], %[temp15], %[thresh2] \n\t" 196 "sll %[temp6], %[temp1], 1 \n\t" 197 "bgtz %[temp3], 3f \n\t" 198 " subu %[temp4], %[temp11], %[temp8] \n\t" 199 "absq_s.w %[temp4], %[temp4] \n\t" 200 "shll_s.w %[temp2], %[temp2], 24 \n\t" 201 "subu %[temp4], %[temp4], %[ithresh] \n\t" 202 "bgtz %[temp4], 3f \n\t" 203 " subu 
%[temp3], %[temp8], %[temp9] \n\t" 204 "absq_s.w %[temp3], %[temp3] \n\t" 205 "subu %[temp3], %[temp3], %[ithresh] \n\t" 206 "bgtz %[temp3], 3f \n\t" 207 " subu %[temp5], %[temp9], %[temp10] \n\t" 208 "absq_s.w %[temp3], %[temp5] \n\t" 209 "absq_s.w %[temp5], %[temp5] \n\t" 210 "subu %[temp3], %[temp3], %[ithresh] \n\t" 211 "bgtz %[temp3], 3f \n\t" 212 " subu %[temp3], %[temp14], %[temp13] \n\t" 213 "absq_s.w %[temp3], %[temp3] \n\t" 214 "slt %[temp5], %[hev_thresh], %[temp5] \n\t" 215 "subu %[temp3], %[temp3], %[ithresh] \n\t" 216 "bgtz %[temp3], 3f \n\t" 217 " subu %[temp3], %[temp13], %[temp12] \n\t" 218 "absq_s.w %[temp3], %[temp3] \n\t" 219 "sra %[temp4], %[temp2], 24 \n\t" 220 "subu %[temp3], %[temp3], %[ithresh] \n\t" 221 "bgtz %[temp3], 3f \n\t" 222 " subu %[temp15], %[temp12], %[temp7] \n\t" 223 "absq_s.w %[temp3], %[temp15] \n\t" 224 "absq_s.w %[temp15], %[temp15] \n\t" 225 "subu %[temp3], %[temp3], %[ithresh] \n\t" 226 "bgtz %[temp3], 3f \n\t" 227 " slt %[temp15], %[hev_thresh], %[temp15] \n\t" 228 "addu %[temp3], %[temp6], %[temp1] \n\t" 229 "or %[temp2], %[temp5], %[temp15] \n\t" 230 "addu %[temp5], %[temp4], %[temp3] \n\t" 231 "beqz %[temp2], 4f \n\t" 232 " shra_r.w %[temp1], %[temp5], 3 \n\t" 233 "addiu %[temp2], %[temp5], 3 \n\t" 234 "sra %[temp2], %[temp2], 3 \n\t" 235 "shll_s.w %[temp1], %[temp1], 27 \n\t" 236 "shll_s.w %[temp2], %[temp2], 27 \n\t" 237 "subu %[temp3], %[p], %[hstride] \n\t" 238 "sra %[temp1], %[temp1], 27 \n\t" 239 "sra %[temp2], %[temp2], 27 \n\t" 240 "subu %[temp1], %[temp7], %[temp1] \n\t" 241 "addu %[temp2], %[temp10], %[temp2] \n\t" 242 "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t" 243 "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t" 244 "sb %[temp2], 0(%[temp3]) \n\t" 245 "j 3f \n\t" 246 " sb %[temp1], 0(%[p]) \n\t" 247 "4: \n\t" 248 "shll_s.w %[temp5], %[temp5], 24 \n\t" 249 "subu %[temp14], %[p], %[hstride] \n\t" 250 "subu %[temp11], %[temp14], %[hstride] \n\t" 251 "sra %[temp6], %[temp5], 24 \n\t" 252 "sll %[temp1], %[temp6], 
3 \n\t" 253 "subu %[temp15], %[temp11], %[hstride] \n\t" 254 "addu %[temp2], %[temp6], %[temp1] \n\t" 255 "sll %[temp3], %[temp2], 1 \n\t" 256 "addu %[temp4], %[temp3], %[temp2] \n\t" 257 "addiu %[temp2], %[temp2], 63 \n\t" 258 "addiu %[temp3], %[temp3], 63 \n\t" 259 "addiu %[temp4], %[temp4], 63 \n\t" 260 "sra %[temp2], %[temp2], 7 \n\t" 261 "sra %[temp3], %[temp3], 7 \n\t" 262 "sra %[temp4], %[temp4], 7 \n\t" 263 "addu %[temp1], %[temp8], %[temp2] \n\t" 264 "addu %[temp5], %[temp9], %[temp3] \n\t" 265 "addu %[temp6], %[temp10], %[temp4] \n\t" 266 "subu %[temp8], %[temp7], %[temp4] \n\t" 267 "subu %[temp7], %[temp12], %[temp3] \n\t" 268 "addu %[temp10], %[p], %[hstride] \n\t" 269 "subu %[temp9], %[temp13], %[temp2] \n\t" 270 "addu %[temp12], %[temp10], %[hstride] \n\t" 271 "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t" 272 "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t" 273 "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t" 274 "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t" 275 "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t" 276 "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t" 277 "sb %[temp2], 0(%[temp15]) \n\t" 278 "sb %[temp3], 0(%[temp11]) \n\t" 279 "sb %[temp4], 0(%[temp14]) \n\t" 280 "sb %[temp5], 0(%[p]) \n\t" 281 "sb %[temp6], 0(%[temp10]) \n\t" 282 "sb %[temp8], 0(%[temp12]) \n\t" 283 "3: \n\t" 284 "bgtz %[size], 1b \n\t" 285 " addu %[p], %[p], %[vstride] \n\t" 286 ".set pop \n\t" 287 : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3), 288 [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), 289 [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9), 290 [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12), 291 [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15), 292 [size]"+&r"(size), [p]"+&r"(p) 293 : [hstride]"r"(hstride), [thresh2]"r"(thresh2), 294 [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh), 295 [VP8kclip1]"r"(VP8kclip1) 296 : "memory" 297 ); 298 } 299 300 static WEBP_INLINE void 
FilterLoop24(uint8_t* p, 301 int hstride, int vstride, int size, 302 int thresh, int ithresh, int hev_thresh) { 303 int p0, q0, p1, q1, p2, q2, p3, q3; 304 int step1, step2, temp1, temp2, temp3, temp4; 305 uint8_t* pTemp0; 306 uint8_t* pTemp1; 307 const int thresh2 = 2 * thresh + 1; 308 309 __asm__ volatile ( 310 ".set push \n\t" 311 ".set noreorder \n\t" 312 "bltz %[size], 3f \n\t" 313 " nop \n\t" 314 "2: \n\t" 315 "negu %[step1], %[hstride] \n\t" 316 "lbu %[q0], 0(%[p]) \n\t" 317 "lbux %[p0], %[step1](%[p]) \n\t" 318 "subu %[step1], %[step1], %[hstride] \n\t" 319 "lbux %[q1], %[hstride](%[p]) \n\t" 320 "subu %[temp1], %[p0], %[q0] \n\t" 321 "lbux %[p1], %[step1](%[p]) \n\t" 322 "addu %[step2], %[hstride], %[hstride] \n\t" 323 "absq_s.w %[temp2], %[temp1] \n\t" 324 "subu %[temp3], %[p1], %[q1] \n\t" 325 "absq_s.w %[temp4], %[temp3] \n\t" 326 "sll %[temp2], %[temp2], 2 \n\t" 327 "addu %[temp2], %[temp2], %[temp4] \n\t" 328 "subu %[temp4], %[temp2], %[thresh2] \n\t" 329 "subu %[step1], %[step1], %[hstride] \n\t" 330 "bgtz %[temp4], 0f \n\t" 331 " lbux %[p2], %[step1](%[p]) \n\t" 332 "subu %[step1], %[step1], %[hstride] \n\t" 333 "lbux %[q2], %[step2](%[p]) \n\t" 334 "lbux %[p3], %[step1](%[p]) \n\t" 335 "subu %[temp4], %[p2], %[p1] \n\t" 336 "addu %[step2], %[step2], %[hstride] \n\t" 337 "subu %[temp2], %[p3], %[p2] \n\t" 338 "absq_s.w %[temp4], %[temp4] \n\t" 339 "absq_s.w %[temp2], %[temp2] \n\t" 340 "lbux %[q3], %[step2](%[p]) \n\t" 341 "subu %[temp4], %[temp4], %[ithresh] \n\t" 342 "negu %[temp1], %[temp1] \n\t" 343 "bgtz %[temp4], 0f \n\t" 344 " subu %[temp2], %[temp2], %[ithresh] \n\t" 345 "subu %[p3], %[p1], %[p0] \n\t" 346 "bgtz %[temp2], 0f \n\t" 347 " absq_s.w %[p3], %[p3] \n\t" 348 "subu %[temp4], %[q3], %[q2] \n\t" 349 "subu %[pTemp0], %[p], %[hstride] \n\t" 350 "absq_s.w %[temp4], %[temp4] \n\t" 351 "subu %[temp2], %[p3], %[ithresh] \n\t" 352 "sll %[step1], %[temp1], 1 \n\t" 353 "bgtz %[temp2], 0f \n\t" 354 " subu %[temp4], %[temp4], %[ithresh] \n\t" 
355 "subu %[temp2], %[q2], %[q1] \n\t" 356 "bgtz %[temp4], 0f \n\t" 357 " absq_s.w %[temp2], %[temp2] \n\t" 358 "subu %[q3], %[q1], %[q0] \n\t" 359 "absq_s.w %[q3], %[q3] \n\t" 360 "subu %[temp2], %[temp2], %[ithresh] \n\t" 361 "addu %[temp1], %[temp1], %[step1] \n\t" 362 "bgtz %[temp2], 0f \n\t" 363 " subu %[temp4], %[q3], %[ithresh] \n\t" 364 "slt %[p3], %[hev_thresh], %[p3] \n\t" 365 "bgtz %[temp4], 0f \n\t" 366 " slt %[q3], %[hev_thresh], %[q3] \n\t" 367 "or %[q3], %[q3], %[p3] \n\t" 368 "bgtz %[q3], 1f \n\t" 369 " shra_r.w %[temp2], %[temp1], 3 \n\t" 370 "addiu %[temp1], %[temp1], 3 \n\t" 371 "sra %[temp1], %[temp1], 3 \n\t" 372 "shll_s.w %[temp2], %[temp2], 27 \n\t" 373 "shll_s.w %[temp1], %[temp1], 27 \n\t" 374 "addu %[pTemp1], %[p], %[hstride] \n\t" 375 "sra %[temp2], %[temp2], 27 \n\t" 376 "sra %[temp1], %[temp1], 27 \n\t" 377 "addiu %[step1], %[temp2], 1 \n\t" 378 "sra %[step1], %[step1], 1 \n\t" 379 "addu %[p0], %[p0], %[temp1] \n\t" 380 "addu %[p1], %[p1], %[step1] \n\t" 381 "subu %[q0], %[q0], %[temp2] \n\t" 382 "subu %[q1], %[q1], %[step1] \n\t" 383 "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t" 384 "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t" 385 "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t" 386 "sb %[temp2], 0(%[pTemp0]) \n\t" 387 "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t" 388 "subu %[pTemp0], %[pTemp0], %[hstride] \n\t" 389 "sb %[temp3], 0(%[p]) \n\t" 390 "sb %[temp4], 0(%[pTemp1]) \n\t" 391 "j 0f \n\t" 392 " sb %[temp1], 0(%[pTemp0]) \n\t" 393 "1: \n\t" 394 "shll_s.w %[temp3], %[temp3], 24 \n\t" 395 "sra %[temp3], %[temp3], 24 \n\t" 396 "addu %[temp1], %[temp1], %[temp3] \n\t" 397 "shra_r.w %[temp2], %[temp1], 3 \n\t" 398 "addiu %[temp1], %[temp1], 3 \n\t" 399 "shll_s.w %[temp2], %[temp2], 27 \n\t" 400 "sra %[temp1], %[temp1], 3 \n\t" 401 "shll_s.w %[temp1], %[temp1], 27 \n\t" 402 "sra %[temp2], %[temp2], 27 \n\t" 403 "sra %[temp1], %[temp1], 27 \n\t" 404 "addu %[p0], %[p0], %[temp1] \n\t" 405 "subu %[q0], %[q0], %[temp2] \n\t" 406 "lbux %[temp1], 
%[p0](%[VP8kclip1]) \n\t" 407 "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t" 408 "sb %[temp2], 0(%[p]) \n\t" 409 "sb %[temp1], 0(%[pTemp0]) \n\t" 410 "0: \n\t" 411 "subu %[size], %[size], 1 \n\t" 412 "bgtz %[size], 2b \n\t" 413 " addu %[p], %[p], %[vstride] \n\t" 414 "3: \n\t" 415 ".set pop \n\t" 416 : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1), 417 [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3), 418 [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1), 419 [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), 420 [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p), 421 [size]"+&r"(size) 422 : [vstride]"r"(vstride), [ithresh]"r"(ithresh), 423 [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride), 424 [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2) 425 : "memory" 426 ); 427 } 428 429 // on macroblock edges 430 static void VFilter16(uint8_t* p, int stride, 431 int thresh, int ithresh, int hev_thresh) { 432 FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh); 433 } 434 435 static void HFilter16(uint8_t* p, int stride, 436 int thresh, int ithresh, int hev_thresh) { 437 FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh); 438 } 439 440 // 8-pixels wide variant, for chroma filtering 441 static void VFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, 442 int stride, int thresh, int ithresh, int hev_thresh) { 443 FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh); 444 FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh); 445 } 446 447 static void HFilter8(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, 448 int stride, int thresh, int ithresh, int hev_thresh) { 449 FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh); 450 FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh); 451 } 452 453 // on three inner edges 454 static void VFilter16i(uint8_t* p, int stride, 455 int thresh, int ithresh, int hev_thresh) { 456 int k; 457 for (k = 3; k > 0; --k) { 458 p += 4 * stride; 459 
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh); 460 } 461 } 462 463 static void HFilter16i(uint8_t* p, int stride, 464 int thresh, int ithresh, int hev_thresh) { 465 int k; 466 for (k = 3; k > 0; --k) { 467 p += 4; 468 FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh); 469 } 470 } 471 472 static void VFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, 473 int stride, int thresh, int ithresh, int hev_thresh) { 474 FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); 475 FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh); 476 } 477 478 static void HFilter8i(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v, 479 int stride, int thresh, int ithresh, int hev_thresh) { 480 FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh); 481 FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); 482 } 483 484 //------------------------------------------------------------------------------ 485 // Simple In-loop filtering (Paragraph 15.2) 486 487 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { 488 int i; 489 const int thresh2 = 2 * thresh + 1; 490 int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; 491 uint8_t* p1 = p - stride; 492 __asm__ volatile ( 493 ".set push \n\t" 494 ".set noreorder \n\t" 495 "li %[i], 16 \n\t" 496 "0: \n\t" 497 "negu %[temp4], %[stride] \n\t" 498 "sll %[temp5], %[temp4], 1 \n\t" 499 "lbu %[temp2], 0(%[p]) \n\t" 500 "lbux %[temp3], %[stride](%[p]) \n\t" 501 "lbux %[temp1], %[temp4](%[p]) \n\t" 502 "lbux %[temp0], %[temp5](%[p]) \n\t" 503 "subu %[temp7], %[temp1], %[temp2] \n\t" 504 "subu %[temp6], %[temp0], %[temp3] \n\t" 505 "absq_s.w %[temp4], %[temp7] \n\t" 506 "absq_s.w %[temp5], %[temp6] \n\t" 507 "sll %[temp4], %[temp4], 2 \n\t" 508 "subu %[temp5], %[temp5], %[thresh2] \n\t" 509 "addu %[temp5], %[temp4], %[temp5] \n\t" 510 "negu %[temp8], %[temp7] \n\t" 511 "bgtz %[temp5], 1f \n\t" 512 " addiu %[i], %[i], -1 \n\t" 513 "sll 
%[temp4], %[temp8], 1 \n\t" 514 "shll_s.w %[temp5], %[temp6], 24 \n\t" 515 "addu %[temp3], %[temp4], %[temp8] \n\t" 516 "sra %[temp5], %[temp5], 24 \n\t" 517 "addu %[temp3], %[temp3], %[temp5] \n\t" 518 "addiu %[temp7], %[temp3], 3 \n\t" 519 "sra %[temp7], %[temp7], 3 \n\t" 520 "shra_r.w %[temp8], %[temp3], 3 \n\t" 521 "shll_s.w %[temp0], %[temp7], 27 \n\t" 522 "shll_s.w %[temp4], %[temp8], 27 \n\t" 523 "sra %[temp0], %[temp0], 27 \n\t" 524 "sra %[temp4], %[temp4], 27 \n\t" 525 "addu %[temp7], %[temp1], %[temp0] \n\t" 526 "subu %[temp2], %[temp2], %[temp4] \n\t" 527 "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t" 528 "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t" 529 "sb %[temp3], 0(%[p1]) \n\t" 530 "sb %[temp4], 0(%[p]) \n\t" 531 "1: \n\t" 532 "addiu %[p1], %[p1], 1 \n\t" 533 "bgtz %[i], 0b \n\t" 534 " addiu %[p], %[p], 1 \n\t" 535 " .set pop \n\t" 536 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 537 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 538 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 539 [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1) 540 : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2) 541 : "memory" 542 ); 543 } 544 545 // TEMP0 = SRC[A + A1 * BPS] 546 // TEMP1 = SRC[B + B1 * BPS] 547 // TEMP2 = SRC[C + C1 * BPS] 548 // TEMP3 = SRC[D + D1 * BPS] 549 #define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \ 550 A, A1, B, B1, C, C1, D, D1, SRC) \ 551 "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 552 "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 553 "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 554 "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 555 556 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) { 557 int i; 558 const int thresh2 = 2 * thresh + 1; 559 int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; 560 __asm__ volatile ( 561 ".set push \n\t" 562 ".set noreorder \n\t" 
563 "li %[i], 16 \n\t" 564 "0: \n\t" 565 LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p) 566 "subu %[temp7], %[temp1], %[temp2] \n\t" 567 "subu %[temp6], %[temp0], %[temp3] \n\t" 568 "absq_s.w %[temp4], %[temp7] \n\t" 569 "absq_s.w %[temp5], %[temp6] \n\t" 570 "sll %[temp4], %[temp4], 2 \n\t" 571 "addu %[temp5], %[temp4], %[temp5] \n\t" 572 "subu %[temp5], %[temp5], %[thresh2] \n\t" 573 "negu %[temp8], %[temp7] \n\t" 574 "bgtz %[temp5], 1f \n\t" 575 " addiu %[i], %[i], -1 \n\t" 576 "sll %[temp4], %[temp8], 1 \n\t" 577 "shll_s.w %[temp5], %[temp6], 24 \n\t" 578 "addu %[temp3], %[temp4], %[temp8] \n\t" 579 "sra %[temp5], %[temp5], 24 \n\t" 580 "addu %[temp3], %[temp3], %[temp5] \n\t" 581 "addiu %[temp7], %[temp3], 3 \n\t" 582 "sra %[temp7], %[temp7], 3 \n\t" 583 "shra_r.w %[temp8], %[temp3], 3 \n\t" 584 "shll_s.w %[temp0], %[temp7], 27 \n\t" 585 "shll_s.w %[temp4], %[temp8], 27 \n\t" 586 "sra %[temp0], %[temp0], 27 \n\t" 587 "sra %[temp4], %[temp4], 27 \n\t" 588 "addu %[temp7], %[temp1], %[temp0] \n\t" 589 "subu %[temp2], %[temp2], %[temp4] \n\t" 590 "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t" 591 "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t" 592 "sb %[temp3], -1(%[p]) \n\t" 593 "sb %[temp4], 0(%[p]) \n\t" 594 "1: \n\t" 595 "bgtz %[i], 0b \n\t" 596 " addu %[p], %[p], %[stride] \n\t" 597 ".set pop \n\t" 598 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 599 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 600 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 601 [p]"+&r"(p), [i]"=&r"(i) 602 : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2) 603 : "memory" 604 ); 605 } 606 607 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { 608 int k; 609 for (k = 3; k > 0; --k) { 610 p += 4 * stride; 611 SimpleVFilter16(p, stride, thresh); 612 } 613 } 614 615 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { 616 int k; 617 for (k = 3; k > 0; --k) { 618 p += 4; 
619 SimpleHFilter16(p, stride, thresh); 620 } 621 } 622 623 // DST[A * BPS] = TEMP0 624 // DST[B + C * BPS] = TEMP1 625 #define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \ 626 "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \ 627 "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t" 628 629 static void VE4(uint8_t* dst) { // vertical 630 const uint8_t* top = dst - BPS; 631 int temp0, temp1, temp2, temp3, temp4, temp5, temp6; 632 __asm__ volatile ( 633 "ulw %[temp0], -1(%[top]) \n\t" 634 "ulh %[temp1], 3(%[top]) \n\t" 635 "preceu.ph.qbr %[temp2], %[temp0] \n\t" 636 "preceu.ph.qbl %[temp3], %[temp0] \n\t" 637 "preceu.ph.qbr %[temp4], %[temp1] \n\t" 638 "packrl.ph %[temp5], %[temp3], %[temp2] \n\t" 639 "packrl.ph %[temp6], %[temp4], %[temp3] \n\t" 640 "shll.ph %[temp5], %[temp5], 1 \n\t" 641 "shll.ph %[temp6], %[temp6], 1 \n\t" 642 "addq.ph %[temp2], %[temp5], %[temp2] \n\t" 643 "addq.ph %[temp6], %[temp6], %[temp4] \n\t" 644 "addq.ph %[temp2], %[temp2], %[temp3] \n\t" 645 "addq.ph %[temp6], %[temp6], %[temp3] \n\t" 646 "shra_r.ph %[temp2], %[temp2], 2 \n\t" 647 "shra_r.ph %[temp6], %[temp6], 2 \n\t" 648 "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t" 649 STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst) 650 STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst) 651 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 652 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 653 [temp6]"=&r"(temp6) 654 : [top]"r"(top), [dst]"r"(dst) 655 : "memory" 656 ); 657 } 658 659 static void DC4(uint8_t* dst) { // DC 660 int temp0, temp1, temp2, temp3, temp4; 661 __asm__ volatile ( 662 "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t" 663 LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst) 664 "ins %[temp1], %[temp2], 8, 8 \n\t" 665 "ins %[temp1], %[temp3], 16, 8 \n\t" 666 "ins %[temp1], %[temp4], 24, 8 \n\t" 667 "raddu.w.qb %[temp0], %[temp0] \n\t" 668 "raddu.w.qb %[temp1], %[temp1] \n\t" 669 "addu %[temp0], %[temp0], %[temp1] \n\t" 670 
"shra_r.w %[temp0], %[temp0], 3 \n\t" 671 "replv.qb %[temp0], %[temp0] \n\t" 672 STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst) 673 STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst) 674 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 675 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4) 676 : [dst]"r"(dst) 677 : "memory" 678 ); 679 } 680 681 static void RD4(uint8_t* dst) { // Down-right 682 int temp0, temp1, temp2, temp3, temp4; 683 int temp5, temp6, temp7, temp8; 684 __asm__ volatile ( 685 LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst) 686 "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t" 687 "ins %[temp1], %[temp0], 16, 16 \n\t" 688 "preceu.ph.qbr %[temp5], %[temp7] \n\t" 689 "ins %[temp2], %[temp1], 16, 16 \n\t" 690 "preceu.ph.qbl %[temp4], %[temp7] \n\t" 691 "ins %[temp3], %[temp2], 16, 16 \n\t" 692 "shll.ph %[temp2], %[temp2], 1 \n\t" 693 "addq.ph %[temp3], %[temp3], %[temp1] \n\t" 694 "packrl.ph %[temp6], %[temp5], %[temp1] \n\t" 695 "addq.ph %[temp3], %[temp3], %[temp2] \n\t" 696 "addq.ph %[temp1], %[temp1], %[temp5] \n\t" 697 "shll.ph %[temp6], %[temp6], 1 \n\t" 698 "addq.ph %[temp1], %[temp1], %[temp6] \n\t" 699 "packrl.ph %[temp0], %[temp4], %[temp5] \n\t" 700 "addq.ph %[temp8], %[temp5], %[temp4] \n\t" 701 "shra_r.ph %[temp3], %[temp3], 2 \n\t" 702 "shll.ph %[temp0], %[temp0], 1 \n\t" 703 "shra_r.ph %[temp1], %[temp1], 2 \n\t" 704 "addq.ph %[temp8], %[temp0], %[temp8] \n\t" 705 "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t" 706 "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t" 707 "shra_r.ph %[temp8], %[temp8], 2 \n\t" 708 "ins %[temp7], %[temp5], 0, 8 \n\t" 709 "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t" 710 "raddu.w.qb %[temp4], %[temp7] \n\t" 711 "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t" 712 "shra_r.w %[temp4], %[temp4], 2 \n\t" 713 STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst) 714 "prepend %[temp2], %[temp8], 8 \n\t" 715 "prepend %[temp6], %[temp4], 8 \n\t" 716 STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst) 717 : [temp0]"=&r"(temp0), 
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 718 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 719 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8) 720 : [dst]"r"(dst) 721 : "memory" 722 ); 723 } 724 725 // TEMP0 = SRC[A * BPS] 726 // TEMP1 = SRC[B + C * BPS] 727 #define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \ 728 "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \ 729 "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t" 730 731 static void LD4(uint8_t* dst) { // Down-Left 732 int temp0, temp1, temp2, temp3, temp4; 733 int temp5, temp6, temp7, temp8, temp9; 734 __asm__ volatile ( 735 LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) 736 "preceu.ph.qbl %[temp2], %[temp0] \n\t" 737 "preceu.ph.qbr %[temp3], %[temp0] \n\t" 738 "preceu.ph.qbr %[temp4], %[temp1] \n\t" 739 "preceu.ph.qbl %[temp5], %[temp1] \n\t" 740 "packrl.ph %[temp6], %[temp2], %[temp3] \n\t" 741 "packrl.ph %[temp7], %[temp4], %[temp2] \n\t" 742 "packrl.ph %[temp8], %[temp5], %[temp4] \n\t" 743 "shll.ph %[temp6], %[temp6], 1 \n\t" 744 "addq.ph %[temp9], %[temp2], %[temp6] \n\t" 745 "shll.ph %[temp7], %[temp7], 1 \n\t" 746 "addq.ph %[temp9], %[temp9], %[temp3] \n\t" 747 "shll.ph %[temp8], %[temp8], 1 \n\t" 748 "shra_r.ph %[temp9], %[temp9], 2 \n\t" 749 "addq.ph %[temp3], %[temp4], %[temp7] \n\t" 750 "addq.ph %[temp0], %[temp5], %[temp8] \n\t" 751 "addq.ph %[temp3], %[temp3], %[temp2] \n\t" 752 "addq.ph %[temp0], %[temp0], %[temp4] \n\t" 753 "shra_r.ph %[temp3], %[temp3], 2 \n\t" 754 "shra_r.ph %[temp0], %[temp0], 2 \n\t" 755 "srl %[temp1], %[temp1], 24 \n\t" 756 "sll %[temp1], %[temp1], 1 \n\t" 757 "raddu.w.qb %[temp5], %[temp5] \n\t" 758 "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t" 759 "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t" 760 "addu %[temp1], %[temp1], %[temp5] \n\t" 761 "shra_r.w %[temp1], %[temp1], 2 \n\t" 762 STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst) 763 "prepend %[temp9], %[temp0], 8 \n\t" 764 "prepend %[temp3], %[temp1], 8 \n\t" 765 
STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst) 766 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 767 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 768 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 769 [temp9]"=&r"(temp9) 770 : [dst]"r"(dst) 771 : "memory" 772 ); 773 } 774 775 //------------------------------------------------------------------------------ 776 // Chroma 777 778 static void DC8uv(uint8_t* dst) { // DC 779 int temp0, temp1, temp2, temp3, temp4; 780 int temp5, temp6, temp7, temp8, temp9; 781 __asm__ volatile ( 782 LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst) 783 LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst) 784 LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst) 785 "raddu.w.qb %[temp0], %[temp0] \n\t" 786 "raddu.w.qb %[temp1], %[temp1] \n\t" 787 "addu %[temp2], %[temp2], %[temp3] \n\t" 788 "addu %[temp4], %[temp4], %[temp5] \n\t" 789 "addu %[temp6], %[temp6], %[temp7] \n\t" 790 "addu %[temp8], %[temp8], %[temp9] \n\t" 791 "addu %[temp0], %[temp0], %[temp1] \n\t" 792 "addu %[temp2], %[temp2], %[temp4] \n\t" 793 "addu %[temp6], %[temp6], %[temp8] \n\t" 794 "addu %[temp0], %[temp0], %[temp2] \n\t" 795 "addu %[temp0], %[temp0], %[temp6] \n\t" 796 "shra_r.w %[temp0], %[temp0], 4 \n\t" 797 "replv.qb %[temp0], %[temp0] \n\t" 798 STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst) 799 STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst) 800 STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst) 801 STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst) 802 STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst) 803 STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst) 804 STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst) 805 STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst) 806 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), 807 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), 808 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), 809 [temp9]"=&r"(temp9) 810 : [dst]"r"(dst) 811 : "memory" 812 ); 813 } 814 815 
// 8x8 chroma DC prediction when only the top row is available: the block is
// filled with the rounded average of the 8 samples directly above 'dst'.
static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  int temp0, temp1;
  __asm__ volatile (
    // Load the 8 top neighbours (row -1) as two 4-byte words.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    // Sum the 8 bytes, divide by 8 with rounding, then replicate the DC
    // value into all four byte lanes of temp0.
    "raddu.w.qb      %[temp0],   %[temp0]                \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]                \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]    \n\t"
    "shra_r.w        %[temp0],   %[temp0],   3           \n\t"
    "replv.qb        %[temp0],   %[temp0]                \n\t"
    // Store the replicated DC value to all 8 rows of the block.
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    : [dst]"r"(dst)
    : "memory"
  );
}

// 8x8 chroma DC prediction when only the left column is available: the block
// is filled with the rounded average of the 8 samples at column -1.
static void DC8uvNoTop(uint8_t* dst) {   // DC with no top samples
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // Load the 8 left neighbours (column -1, rows 0..7), one byte per
    // register; temp1 doubles as the 8th sample to save a register.
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // Accumulate the 8 samples into temp0 (tree of adds).
    "addu            %[temp2],   %[temp2],   %[temp3]    \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]    \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]    \n\t"
    "addu            %[temp8],   %[temp8],   %[temp1]    \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]    \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]    \n\t"
    "addu            %[temp0],   %[temp6],   %[temp2]    \n\t"
    // Divide by 8 with rounding and replicate into all four byte lanes.
    "shra_r.w        %[temp0],   %[temp0],   3           \n\t"
    "replv.qb        %[temp0],   %[temp0]                \n\t"
    // Store the replicated DC value to all 8 rows of the block.
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [dst]"r"(dst)
    : "memory"
  );
}

#undef LOAD_8_BYTES
#undef STORE_8_BYTES
#undef LOAD_4_BYTES

// Clamp helper for the TrueMotion predictors below. Unpacks SIZE (4 or 8)
// prediction bytes from temp0 (and temp1 for the 8-wide path) to unsigned
// halfword pairs, adds the per-row offset held in dst_1 to each, then packs
// back to bytes with saturation: shll_s.ph by 7 saturates each halfword to
// the signed 16-bit range and precrqu_s.qb.ph takes the upper bytes with
// unsigned saturation, i.e. a clamp to [0, 255]. The assembler-time
// ".if " #SIZE " == 8" guards let one macro serve both widths.
#define CLIPPING(SIZE)                                              \
  "preceu.ph.qbl    %[temp2],   %[temp0]               \n\t"        \
  "preceu.ph.qbr    %[temp0],   %[temp0]               \n\t"        \
  ".if " #SIZE " == 8                                  \n\t"        \
  "preceu.ph.qbl    %[temp3],   %[temp1]               \n\t"        \
  "preceu.ph.qbr    %[temp1],   %[temp1]               \n\t"        \
  ".endif                                              \n\t"        \
  "addu.ph          %[temp2],   %[temp2],   %[dst_1]   \n\t"        \
  "addu.ph          %[temp0],   %[temp0],   %[dst_1]   \n\t"        \
  ".if " #SIZE " == 8                                  \n\t"        \
  "addu.ph          %[temp3],   %[temp3],   %[dst_1]   \n\t"        \
  "addu.ph          %[temp1],   %[temp1],   %[dst_1]   \n\t"        \
  ".endif                                              \n\t"        \
  "shll_s.ph        %[temp2],   %[temp2],   7          \n\t"        \
  "shll_s.ph        %[temp0],   %[temp0],   7          \n\t"        \
  ".if " #SIZE " == 8                                  \n\t"        \
  "shll_s.ph        %[temp3],   %[temp3],   7          \n\t"        \
  "shll_s.ph        %[temp1],   %[temp1],   7          \n\t"        \
  ".endif                                              \n\t"        \
  "precrqu_s.qb.ph  %[temp0],   %[temp2],   %[temp0]   \n\t"        \
  ".if " #SIZE " == 8                                  \n\t"        \
  "precrqu_s.qb.ph  %[temp1],   %[temp3],   %[temp1]   \n\t"        \
  ".endif                                              \n\t"


// Emits one TrueMotion row for a SIZE-wide block (SIZE is 4, 8 or 16):
// dst_1 holds the row's left sample replicated into both halfwords; after
// "subu.ph dst_1, dst_1, top_1" it carries (left - top[-1]), which CLIPPING
// adds to each loaded top byte and clamps — i.e. dst[x] =
// clip(top[x] + left - top[-1]). The 16-wide case runs the 8-byte path
// twice. Loads/stores use ulw/usw since dst/top need not be word-aligned.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                         \
  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                   \
  int temp0, temp1, temp2, temp3;                                   \
  __asm__ volatile (                                                \
    ".if " #SIZE " < 8                                 \n\t"        \
    "ulw              %[temp0],   0(%[top])            \n\t"        \
    "subu.ph          %[dst_1],   %[dst_1],  %[top_1]  \n\t"        \
    CLIPPING(4)                                                     \
    "usw              %[temp0],   0(%[dst])            \n\t"        \
    ".else                                             \n\t"        \
    "ulw              %[temp0],   0(%[top])            \n\t"        \
    "ulw              %[temp1],   4(%[top])            \n\t"        \
    "subu.ph          %[dst_1],   %[dst_1],  %[top_1]  \n\t"        \
    CLIPPING(8)                                                     \
    "usw              %[temp0],   0(%[dst])            \n\t"        \
    "usw              %[temp1],   4(%[dst])            \n\t"        \
    ".if " #SIZE " == 16                               \n\t"        \
    "ulw              %[temp0],   8(%[top])            \n\t"        \
    "ulw              %[temp1],   12(%[top])           \n\t"        \
    CLIPPING(8)                                                     \
    "usw              %[temp0],   8(%[dst])            \n\t"        \
    "usw              %[temp1],   12(%[dst])           \n\t"        \
    ".endif                                            \n\t"        \
    ".endif                                            \n\t"        \
    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),  \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                        \
    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))             \
    : "memory"                                                        \
  );                                                                  \
} while (0)

// Runs the single-row TrueMotion kernel over all SIZE rows of the block.
// top_1 is top[-1] replicated into both halfwords, computed once since the
// top row does not change as DST advances by BPS each iteration.
#define CLIP_TO_DST(DST, SIZE) do {                                   \
  int y;                                                              \
  const uint8_t* top = (DST) - BPS;                                   \
  const int top_1 = ((int)top[-1] << 16) + top[-1];                   \
  for (y = 0; y < (SIZE); ++y) {                                      \
    CLIP_8B_TO_DST((DST), top, (SIZE));                               \
    (DST) += BPS;                                                     \
  }                                                                   \
} while (0)

// Defines the SIZE x SIZE TrueMotion predictor TrueMotion<SIZE>.
#define TRUE_MOTION(DST, SIZE)                                        \
static void TrueMotion##SIZE(uint8_t* (DST)) {                        \
  CLIP_TO_DST((DST), (SIZE));                                         \
}

TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)

#undef TRUE_MOTION
#undef CLIP_TO_DST
#undef CLIP_8B_TO_DST
#undef CLIPPING

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitMIPSdspR2(void);

// Installs the MIPS DSP-R2 implementations into the decoder's global
// function pointers. Only a subset of the predictor slots is overridden;
// the remaining slots are left untouched here.
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
  // Inverse transforms.
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8Transform = TransformTwo;

  // Loop filters: normal (macroblock-edge and inner-edge "i" variants)
  // and simple, vertical/horizontal.
  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
  VP8HFilter8 = HFilter8;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8i = HFilter8i;
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  // 4x4 luma intra predictors (sparse: only these modes have dsp-r2
  // versions).
  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TrueMotion4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[6] = LD4;

  // 8x8 chroma intra predictors.
  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TrueMotion8;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;

  // 16x16 luma intra predictors.
  VP8PredLuma16[1] = TrueMotion16;
}

#else  // !WEBP_USE_MIPS_DSP_R2

WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)

#endif  // WEBP_USE_MIPS_DSP_R2