ImageScalingSSE2.cpp (12890B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "ImageScaling.h" 8 #include "mozilla/Attributes.h" 9 10 #include "SSEHelpers.h" 11 12 /* The functions below use the following system for averaging 4 pixels: 13 * 14 * The first observation is that a half-adder is implemented as follows: 15 * R = S + 2C or in the case of a and b (a ^ b) + ((a & b) << 1); 16 * 17 * This can be trivially extended to three pixels by observaring that when 18 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the 19 * carries of the individual numbers, since the sum of 3 bits can only ever 20 * have a carry of one. 21 * 22 * We then observe that the average is then ((carry << 1) + sum) >> 1, or, 23 * assuming eliminating overflows and underflows, carry + (sum >> 1). 24 * 25 * We now average our existing sum with the fourth number, so we get: 26 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1). 27 * 28 * We now observe that our sum has been moved into place relative to the 29 * carry, so we can now average with the carry to get the final 4 input 30 * average: avg = (sum2 + carry) >> 1; 31 * 32 * Or to reverse the proof: 33 * avg = ((sum >> 1) + carry + d >> 1) >> 1 34 * avg = ((a + b + c) >> 1 + d >> 1) >> 1 35 * avg = ((a + b + c + d) >> 2) 36 * 37 * An additional fact used in the SSE versions is the concept that we can 38 * trivially convert a rounded average to a truncated average: 39 * 40 * We have: 41 * f(a, b) = (a + b + 1) >> 1 42 * 43 * And want: 44 * g(a, b) = (a + b) >> 1 45 * 46 * Observe: 47 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1) 48 * == ~((-a - 1 + -b - 1 + 1) >> 1) 49 * == ~((-a - 1 + -b) >> 1) 50 * == ~((-(a + b) - 1) >> 1) 51 * == ~((~(a + b)) >> 1) 52 * == (a + b) >> 1 53 * == g(a, b) 54 */ 55 56 MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg) { 57 __m128i minusone = _mm_set1_epi32(0xffffffff); 58 return _mm_xor_si128(arg, minusone); 59 } 60 61 /* We have to pass pointers here, MSVC does not allow passing more than 3 62 * __m128i arguments on the stack. And it does not allow 16-byte aligned 63 * stack variables. This inlines properly on MSVC 2010. It does -not- inline 64 * with just the inline directive. 65 */ 66 MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i* a, __m128i* b, __m128i* c, 67 __m128i* d) { 68 #define shuf1 _MM_SHUFFLE(2, 0, 2, 0) 69 #define shuf2 _MM_SHUFFLE(3, 1, 3, 1) 70 71 // This cannot be an inline function as the __Imm argument to _mm_shuffle_ps 72 // needs to be a compile time constant. 73 #define shuffle_si128(arga, argb, imm) \ 74 _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), \ 75 _mm_castsi128_ps((argb)), (imm))); 76 77 __m128i t = shuffle_si128(*a, *b, shuf1); 78 *b = shuffle_si128(*a, *b, shuf2); 79 *a = t; 80 t = shuffle_si128(*c, *d, shuf1); 81 *d = shuffle_si128(*c, *d, shuf2); 82 *c = t; 83 84 #undef shuf1 85 #undef shuf2 86 #undef shuffle_si128 87 88 __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c)); 89 90 __m128i carry = 91 _mm_or_si128(_mm_and_si128(*a, *b), 92 _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c))); 93 94 sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d)); 95 96 return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry))); 97 } 98 99 MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b) { 100 return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b))); 101 } 102 103 MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b) { 104 __m128i t = _mm_castps_si128(_mm_shuffle_ps( 105 _mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1))); 106 b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), 107 _MM_SHUFFLE(2, 0, 2, 0))); 108 a = t; 109 110 return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b))); 111 } 112 113 MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, 114 uint32_t d) { 115 uint32_t sum = a ^ b ^ c; 116 uint32_t carry = (a & b) | (a & c) | (b & c); 117 118 uint32_t mask = 0xfefefefe; 119 120 // Not having a byte based average instruction means we should mask to avoid 121 // underflow. 122 sum = (((sum ^ d) & mask) >> 1) + (sum & d); 123 124 return (((sum ^ carry) & mask) >> 1) + (sum & carry); 125 } 126 127 // Simple 2 pixel average version of the function above. 128 MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b) { 129 uint32_t sum = a ^ b; 130 uint32_t carry = (a & b); 131 132 uint32_t mask = 0xfefefefe; 133 134 return ((sum & mask) >> 1) + carry; 135 } 136 137 namespace mozilla::gfx { 138 139 void ImageHalfScaler::HalfImage2D_SSE2(uint8_t* aSource, int32_t aSourceStride, 140 const IntSize& aSourceSize, 141 uint8_t* aDest, uint32_t aDestStride) { 142 const int Bpp = 4; 143 144 for (int y = 0; y < aSourceSize.height; y += 2) { 145 __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride); 146 int x = 0; 147 // Run a loop depending on alignment. 148 if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) && 149 !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) { 150 for (; x < (aSourceSize.width - 7); x += 8) { 151 __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp)); 152 __m128i* lowerRow = 153 (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp)); 154 155 __m128i a = _mm_load_si128(upperRow); 156 __m128i b = _mm_load_si128(upperRow + 1); 157 __m128i c = _mm_load_si128(lowerRow); 158 __m128i d = _mm_load_si128(lowerRow + 1); 159 160 *storage++ = avg_sse2_8x2(&a, &b, &c, &d); 161 } 162 } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) { 163 for (; x < (aSourceSize.width - 7); x += 8) { 164 __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp)); 165 __m128i* lowerRow = 166 (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp)); 167 168 __m128i a = _mm_load_si128(upperRow); 169 __m128i b = _mm_load_si128(upperRow + 1); 170 __m128i c = loadUnaligned128(lowerRow); 171 __m128i d = loadUnaligned128(lowerRow + 1); 172 173 *storage++ = avg_sse2_8x2(&a, &b, &c, &d); 174 } 175 } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) { 176 for (; x < (aSourceSize.width - 7); x += 8) { 177 __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp)); 178 __m128i* lowerRow = 179 (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp)); 180 181 __m128i a = loadUnaligned128((__m128i*)upperRow); 182 __m128i b = loadUnaligned128((__m128i*)upperRow + 1); 183 __m128i c = _mm_load_si128((__m128i*)lowerRow); 184 __m128i d = _mm_load_si128((__m128i*)lowerRow + 1); 185 186 *storage++ = avg_sse2_8x2(&a, &b, &c, &d); 187 } 188 } else { 189 for (; x < (aSourceSize.width - 7); x += 8) { 190 __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp)); 191 __m128i* lowerRow = 192 (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp)); 193 194 __m128i a = loadUnaligned128(upperRow); 195 __m128i b = loadUnaligned128(upperRow + 1); 196 __m128i c = loadUnaligned128(lowerRow); 197 __m128i d = loadUnaligned128(lowerRow + 1); 198 199 *storage++ = avg_sse2_8x2(&a, &b, &c, &d); 200 } 201 } 202 203 uint32_t* unalignedStorage = (uint32_t*)storage; 204 // Take care of the final pixels, we know there's an even number of pixels 205 // in the source rectangle. We use a 2x2 'simd' implementation for this. 206 // 207 // Potentially we only have to do this in the last row since overflowing 208 // 8 pixels in an earlier row would appear to be harmless as it doesn't 209 // touch invalid memory. Even when reading and writing to the same surface. 210 // in practice we only do this when doing an additional downscale pass, and 211 // in this situation we have unused stride to write into harmlessly. 212 // I do not believe the additional code complexity would be worth it though. 213 for (; x < aSourceSize.width; x += 2) { 214 uint8_t* upperRow = aSource + (y * aSourceStride + x * Bpp); 215 uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp); 216 217 *unalignedStorage++ = 218 Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1), 219 *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1)); 220 } 221 } 222 } 223 224 void ImageHalfScaler::HalfImageVertical_SSE2(uint8_t* aSource, 225 int32_t aSourceStride, 226 const IntSize& aSourceSize, 227 uint8_t* aDest, 228 uint32_t aDestStride) { 229 for (int y = 0; y < aSourceSize.height; y += 2) { 230 __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride); 231 int x = 0; 232 // Run a loop depending on alignment. 233 if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) && 234 !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) { 235 for (; x < (aSourceSize.width - 3); x += 4) { 236 uint8_t* upperRow = aSource + (y * aSourceStride + x * 4); 237 uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4); 238 239 __m128i a = _mm_load_si128((__m128i*)upperRow); 240 __m128i b = _mm_load_si128((__m128i*)lowerRow); 241 242 *storage++ = avg_sse2_4x2_4x1(a, b); 243 } 244 } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) { 245 // This line doesn't align well. 246 for (; x < (aSourceSize.width - 3); x += 4) { 247 uint8_t* upperRow = aSource + (y * aSourceStride + x * 4); 248 uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4); 249 250 __m128i a = _mm_load_si128((__m128i*)upperRow); 251 __m128i b = loadUnaligned128((__m128i*)lowerRow); 252 253 *storage++ = avg_sse2_4x2_4x1(a, b); 254 } 255 } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) { 256 for (; x < (aSourceSize.width - 3); x += 4) { 257 uint8_t* upperRow = aSource + (y * aSourceStride + x * 4); 258 uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4); 259 260 __m128i a = loadUnaligned128((__m128i*)upperRow); 261 __m128i b = _mm_load_si128((__m128i*)lowerRow); 262 263 *storage++ = avg_sse2_4x2_4x1(a, b); 264 } 265 } else { 266 for (; x < (aSourceSize.width - 3); x += 4) { 267 uint8_t* upperRow = aSource + (y * aSourceStride + x * 4); 268 uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4); 269 270 __m128i a = loadUnaligned128((__m128i*)upperRow); 271 __m128i b = loadUnaligned128((__m128i*)lowerRow); 272 273 *storage++ = avg_sse2_4x2_4x1(a, b); 274 } 275 } 276 277 uint32_t* unalignedStorage = (uint32_t*)storage; 278 // Take care of the final pixels, we know there's an even number of pixels 279 // in the source rectangle. 280 // 281 // Similar overflow considerations are valid as in the previous function. 282 for (; x < aSourceSize.width; x++) { 283 uint8_t* upperRow = aSource + (y * aSourceStride + x * 4); 284 uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4); 285 286 *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow); 287 } 288 } 289 } 290 291 void ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t* aSource, 292 int32_t aSourceStride, 293 const IntSize& aSourceSize, 294 uint8_t* aDest, 295 uint32_t aDestStride) { 296 for (int y = 0; y < aSourceSize.height; y++) { 297 __m128i* storage = (__m128i*)(aDest + (y * aDestStride)); 298 int x = 0; 299 // Run a loop depending on alignment. 300 if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) { 301 for (; x < (aSourceSize.width - 7); x += 8) { 302 __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4)); 303 304 __m128i a = _mm_load_si128(pixels); 305 __m128i b = _mm_load_si128(pixels + 1); 306 307 *storage++ = avg_sse2_8x1_4x1(a, b); 308 } 309 } else { 310 for (; x < (aSourceSize.width - 7); x += 8) { 311 __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4)); 312 313 __m128i a = loadUnaligned128(pixels); 314 __m128i b = loadUnaligned128(pixels + 1); 315 316 *storage++ = avg_sse2_8x1_4x1(a, b); 317 } 318 } 319 320 uint32_t* unalignedStorage = (uint32_t*)storage; 321 // Take care of the final pixels, we know there's an even number of pixels 322 // in the source rectangle. 323 // 324 // Similar overflow considerations are valid as in the previous function. 325 for (; x < aSourceSize.width; x += 2) { 326 uint32_t* pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4)); 327 328 *unalignedStorage++ = Avg2(*pixels, *(pixels + 1)); 329 } 330 } 331 } 332 333 } // namespace mozilla::gfx