lossless_enc_mips32.c (12865B)
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

static uint64_t FastSLog2Slow_MIPS32(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    uint64_t correction;
    const int c24 = 24;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    __asm__ volatile(
      "clz   %[log_cnt], %[v]                     \n\t"
      "addiu %[y],       $zero,      1            \n\t"
      "subu  %[log_cnt], %[c24],     %[log_cnt]   \n\t"
      "sllv  %[y],       %[y],       %[log_cnt]   \n\t"
      "srlv  %[temp],    %[v],       %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
    return (uint64_t)v * (kLog2Table[temp] +
                          ((uint64_t)log_cnt << LOG_2_PRECISION_BITS)) +
           correction;
  } else {
    return (uint64_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * v * log((double)v) + .5);
  }
}

static uint32_t FastLog2Slow_MIPS32(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    uint32_t log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz   %[log_cnt], %[v]                     \n\t"
      "addiu %[y],       $zero,      1            \n\t"
      "subu  %[log_cnt], %[c24],     %[log_cnt]   \n\t"
      "sllv  %[y],       %[y],       %[log_cnt]   \n\t"
      "srlv  %[temp],    %[v],       %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + (log_cnt << LOG_2_PRECISION_BITS);
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.
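      // (Same derivation as in FastSLog2Slow_MIPS32 above: 'y' is a power of
      // two, so (v % y) == (v & (y - 1)); the correction approximates
      // LOG_2_RECIPROCAL * (v % y) / v in fixed point, with the division by
      // 'v' carried out explicitly via DivRound below.)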
      const uint64_t correction = LOG_2_RECIPROCAL_FIXED * (v & (y - 1));
      log_2 += (uint32_t)DivRound(correction, v);
    }
    return log_2;
  } else {
    return (uint32_t)(LOG_2_RECIPROCAL_FIXED_DOUBLE * log((double)v) + .5);
  }
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return cost;
static uint32_t ExtraCost_MIPS32(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult  $zero,    $zero                  \n\t"
    "xor   %[i],     %[i],       %[i]       \n\t"
    "beq   %[pop],   %[LoopEnd], 2f         \n\t"
  "1:                                       \n\t"
    "lw    %[temp0], 0(%[pop])              \n\t"
    "lw    %[temp1], 4(%[pop])              \n\t"
    "addiu %[i],     %[i],       1          \n\t"
    "addiu %[pop],   %[pop],     8          \n\t"
    "madd  %[i],     %[temp0]               \n\t"
    "madd  %[i],     %[temp1]               \n\t"
    "bne   %[pop],   %[LoopEnd], 1b         \n\t"
  "2:                                       \n\t"
    "mfhi  %[temp0]                         \n\t"
    "mflo  %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  return ((int64_t)temp0 << 32 | temp1);
}

#define HUFFMAN_COST_PASS                                 \
  __asm__ volatile(                                       \
    "sll   %[temp1],  %[temp0],    3          \n\t"       \
    "addiu %[temp3],  %[streak],   -3         \n\t"       \
    "addu  %[temp2],  %[pstreaks], %[temp1]   \n\t"       \
    "blez  %[temp3],  1f                      \n\t"       \
    "srl   %[temp1],  %[temp1],    1          \n\t"       \
    "addu  %[temp3],  %[pcnts],    %[temp1]   \n\t"       \
    "lw    %[temp0],  4(%[temp2])             \n\t"       \
    "lw    %[temp1],  0(%[temp3])             \n\t"       \
    "addu  %[temp0],  %[temp0],    %[streak]  \n\t"       \
    "addiu %[temp1],  %[temp1],    1          \n\t"       \
    "sw    %[temp0],  4(%[temp2])             \n\t"       \
    "sw    %[temp1],  0(%[temp3])             \n\t"       \
    "b     2f                                 \n\t"       \
  "1:                                         \n\t"       \
    "lw    %[temp0],  0(%[temp2])             \n\t"       \
    "addu  %[temp0],  %[temp0],    %[streak]  \n\t"       \
    "sw    %[temp0],  0(%[temp2])             \n\t"       \
  "2:                                         \n\t"       \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
      [streak]"r"(streak)                                 \
    : "memory"                                            \
  );

// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* WEBP_RESTRICT const val_prev,
    int* WEBP_RESTRICT const i_prev,
    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy += VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
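  // A plain-C sketch of what HUFFMAN_COST_PASS below computes (derived from
  // the assembly; it mirrors the generic C implementation):
  //   stats->counts[*val_prev != 0] += (streak > 3);
  //   stats->streaks[*val_prev != 0][(streak > 3)] += streak;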
  temp0 = (*val_prev != 0);
  HUFFMAN_COST_PASS

  *val_prev = val;
  *i_prev = i;
}

static void GetEntropyUnrefined_MIPS32(
    const uint32_t X[], int length,
    VP8LBitEntropy* WEBP_RESTRICT const bit_entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t x = X[i];
    if (x != x_prev) {
      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy = VP8LFastSLog2(bit_entropy->sum) - bit_entropy->entropy;
}

static void GetCombinedEntropyUnrefined_MIPS32(
    const uint32_t X[], const uint32_t Y[], int length,
    VP8LBitEntropy* WEBP_RESTRICT const entropy,
    VP8LStreaks* WEBP_RESTRICT const stats) {
  int i = 1;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t xy = X[i] + Y[i];
    if (xy != xy_prev) {
      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);

  entropy->entropy = VP8LFastSLog2(entropy->sum) - entropy->entropy;
}

#define ASM_START                                       \
  __asm__ volatile(                                     \
    ".set push                              \n\t"       \
    ".set at                                \n\t"       \
    ".set macro                             \n\t"       \
  "1:                                       \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - flag that tells the macro whether the 'P1' pointer
//     should be incremented
// 'literal' and successive histograms can be unaligned,
// so we must use ulw and usw (unaligned load/store)
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
    "addiu  %[" #P0 "], %[" #P0 "], 16      \n\t"       \
  ".if " #E " == 1                          \n\t"       \
    "addiu  %[" #P1 "], %[" #P1 "], 16      \n\t"       \
  ".endif                                   \n\t"       \
    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
    "addiu  %[" #P2 "], %[" #P2 "], 16      \n\t"       \
    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
    ".set pop                               \n\t"

#define ASM_END_COMMON_0                                \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),         \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),         \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),         \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),         \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                                \
    : [LoopEnd]"r"(LoopEnd)                             \
    : "memory", "at"                                    \
  );

#define ASM_END_0                                       \
    ASM_END_COMMON_0                                    \
    , [pb]"+r"(pb)                                      \
    ASM_END_COMMON_1

#define ASM_END_1                                       \
    ASM_END_COMMON_0                                    \
    ASM_END_COMMON_1

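// A plain-C sketch of the two functions below: AddVector computes
//   pout[i] = pa[i] + pb[i]
// and AddVectorEq computes
//   pout[i] += pa[i]
// for i in [0, size). The assembly handles four words per iteration; the
// (size % 4) remainder falls through to the scalar loops, which rely on the
// pointers having been advanced by the asm blocks.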
static void AddVector_MIPS32(const uint32_t* WEBP_RESTRICT pa,
                             const uint32_t* WEBP_RESTRICT pb,
                             uint32_t* WEBP_RESTRICT pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  ASM_START
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
  ASM_END_0
  for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i];
}

static void AddVectorEq_MIPS32(const uint32_t* WEBP_RESTRICT pa,
                               uint32_t* WEBP_RESTRICT pout, int size) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  ASM_START
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
  ASM_END_1
  for (i = 0; i < size - end; ++i) pout[i] += pa[i];
}

#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

//------------------------------------------------------------------------------
// Entry point

extern void VP8LEncDspInitMIPS32(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
  VP8LFastLog2Slow = FastLog2Slow_MIPS32;
  VP8LExtraCost = ExtraCost_MIPS32;
  VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
  VP8LAddVector = AddVector_MIPS32;
  VP8LAddVectorEq = AddVectorEq_MIPS32;
}

#else  // !WEBP_USE_MIPS32

WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)

#endif  // WEBP_USE_MIPS32