FilterProcessingSIMD-inl.h (52215B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "FilterProcessing.h" 8 9 #include "SIMD.h" 10 #include "SVGTurbulenceRenderer-inl.h" 11 12 namespace mozilla { 13 namespace gfx { 14 15 template <typename u8x16_t> 16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD( 17 SourceSurface* aSurface) { 18 IntSize size = aSurface->GetSize(); 19 RefPtr<DataSourceSurface> output = 20 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 21 if (!output) { 22 return nullptr; 23 } 24 25 RefPtr<DataSourceSurface> input = aSurface->GetDataSurface(); 26 DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ); 27 DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE); 28 const uint8_t* inputData = inputMap.GetData(); 29 uint8_t* outputData = outputMap.GetData(); 30 int32_t inputStride = inputMap.GetStride(); 31 int32_t outputStride = outputMap.GetStride(); 32 switch (input->GetFormat()) { 33 case SurfaceFormat::B8G8R8A8: 34 output = input; 35 break; 36 case SurfaceFormat::B8G8R8X8: 37 for (int32_t y = 0; y < size.height; y++) { 38 for (int32_t x = 0; x < size.width; x++) { 39 int32_t inputIndex = y * inputStride + 4 * x; 40 int32_t outputIndex = y * outputStride + 4 * x; 41 outputData[outputIndex + 0] = inputData[inputIndex + 0]; 42 outputData[outputIndex + 1] = inputData[inputIndex + 1]; 43 outputData[outputIndex + 2] = inputData[inputIndex + 2]; 44 outputData[outputIndex + 3] = 255; 45 } 46 } 47 break; 48 case SurfaceFormat::R8G8B8A8: 49 for (int32_t y = 0; y < size.height; y++) { 50 for (int32_t x = 0; x < size.width; x++) { 51 int32_t inputIndex = y * inputStride + 4 * x; 52 int32_t outputIndex = y * outputStride + 4 * x; 
53 outputData[outputIndex + 2] = inputData[inputIndex + 0]; 54 outputData[outputIndex + 1] = inputData[inputIndex + 1]; 55 outputData[outputIndex + 0] = inputData[inputIndex + 2]; 56 outputData[outputIndex + 3] = inputData[inputIndex + 3]; 57 } 58 } 59 break; 60 case SurfaceFormat::R8G8B8X8: 61 for (int32_t y = 0; y < size.height; y++) { 62 for (int32_t x = 0; x < size.width; x++) { 63 int32_t inputIndex = y * inputStride + 4 * x; 64 int32_t outputIndex = y * outputStride + 4 * x; 65 outputData[outputIndex + 2] = inputData[inputIndex + 0]; 66 outputData[outputIndex + 1] = inputData[inputIndex + 1]; 67 outputData[outputIndex + 0] = inputData[inputIndex + 2]; 68 outputData[outputIndex + 3] = 255; 69 } 70 } 71 break; 72 case SurfaceFormat::A8: 73 for (int32_t y = 0; y < size.height; y++) { 74 for (int32_t x = 0; x < size.width; x += 16) { 75 int32_t inputIndex = y * inputStride + x; 76 int32_t outputIndex = y * outputStride + 4 * x; 77 u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]); 78 // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by 79 // interleaving with 0000000000000000 twice. 
80 u8x16_t zero = simd::FromZero8<u8x16_t>(); 81 u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16); 82 u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16); 83 u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8); 84 u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8); 85 u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16); 86 u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16); 87 simd::Store8(&outputData[outputIndex], p1To4); 88 if ((x + 4) * 4 < outputStride) { 89 simd::Store8(&outputData[outputIndex + 4 * 4], p5To8); 90 } 91 if ((x + 8) * 4 < outputStride) { 92 simd::Store8(&outputData[outputIndex + 4 * 8], p9To12); 93 } 94 if ((x + 12) * 4 < outputStride) { 95 simd::Store8(&outputData[outputIndex + 4 * 12], p13To16); 96 } 97 } 98 } 99 break; 100 default: 101 output = nullptr; 102 break; 103 } 104 return output.forget(); 105 } 106 107 template <typename u8x16_t> 108 inline void ExtractAlpha_SIMD(const IntSize& size, const uint8_t* sourceData, 109 int32_t sourceStride, uint8_t* alphaData, 110 int32_t alphaStride) { 111 for (int32_t y = 0; y < size.height; y++) { 112 for (int32_t x = 0; x < size.width; x += 16) { 113 // Process 16 pixels at a time. 114 // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of 115 // AAAAAAAAAAAAAAAA. 
116 int32_t sourceIndex = y * sourceStride + 4 * x; 117 int32_t targetIndex = y * alphaStride + x; 118 119 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>(); 120 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>(); 121 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>(); 122 123 u8x16_t bgrabgrabgrabgra1 = 124 simd::Load8<u8x16_t>(&sourceData[sourceIndex]); 125 if (4 * (x + 4) < sourceStride) { 126 bgrabgrabgrabgra2 = 127 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]); 128 } 129 if (4 * (x + 8) < sourceStride) { 130 bgrabgrabgrabgra3 = 131 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]); 132 } 133 if (4 * (x + 12) < sourceStride) { 134 bgrabgrabgrabgra4 = 135 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]); 136 } 137 138 u8x16_t bbggrraabbggrraa1 = 139 simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); 140 u8x16_t bbggrraabbggrraa2 = 141 simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); 142 u8x16_t bbggrraabbggrraa3 = 143 simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); 144 u8x16_t bbggrraabbggrraa4 = 145 simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); 146 u8x16_t bbbbggggrrrraaaa1 = 147 simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3); 148 u8x16_t bbbbggggrrrraaaa2 = 149 simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3); 150 u8x16_t bbbbggggrrrraaaa3 = 151 simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4); 152 u8x16_t bbbbggggrrrraaaa4 = 153 simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4); 154 u8x16_t rrrrrrrraaaaaaaa1 = 155 simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); 156 u8x16_t rrrrrrrraaaaaaaa2 = 157 simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); 158 u8x16_t aaaaaaaaaaaaaaaa = 159 simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); 160 161 simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa); 162 } 163 } 164 } 165 166 // This function calculates the result color values for four pixels, but for 167 // only two color 
channels - either b & r or g & a. However, the a result will 168 // not be used. 169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa. 170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the 171 // alpha of all four pixels (and both aaaa's are the same). 172 // blendendComponent1 and blendedComponent2 are the out parameters. 173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode> 174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha, 175 i16x8_t dest, 176 const i16x8_t& destAlpha, 177 i32x4_t& blendedComponent1, 178 i32x4_t& blendedComponent2) { 179 i16x8_t x255 = simd::FromI16<i16x8_t>(255); 180 181 switch (aBlendMode) { 182 case BLEND_MODE_MULTIPLY: { 183 // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * 184 // dest); 185 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); 186 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 187 i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = 188 simd::Add16(twoFiftyFiveMinusSourceAlpha, source); 189 190 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); 191 i16x8_t leftFactor1 = simd::InterleaveLo16( 192 twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); 193 blendedComponent1 = 194 simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1); 195 blendedComponent1 = simd::FastDivideBy255(blendedComponent1); 196 197 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); 198 i16x8_t leftFactor2 = simd::InterleaveHi16( 199 twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); 200 blendedComponent2 = 201 simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2); 202 blendedComponent2 = simd::FastDivideBy255(blendedComponent2); 203 204 break; 205 } 206 207 case BLEND_MODE_SCREEN: { 208 // val = 255 * (source + dest) + (0 - dest) * source; 209 i16x8_t sourcePlusDest = simd::Add16(source, dest); 210 
i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest); 211 212 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = 213 simd::InterleaveLo16(x255, zeroMinusDest); 214 i16x8_t sourcePlusDestInterleavedWithSource1 = 215 simd::InterleaveLo16(sourcePlusDest, source); 216 blendedComponent1 = 217 simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, 218 sourcePlusDestInterleavedWithSource1); 219 blendedComponent1 = simd::FastDivideBy255(blendedComponent1); 220 221 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = 222 simd::InterleaveHi16(x255, zeroMinusDest); 223 i16x8_t sourcePlusDestInterleavedWithSource2 = 224 simd::InterleaveHi16(sourcePlusDest, source); 225 blendedComponent2 = 226 simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, 227 sourcePlusDestInterleavedWithSource2); 228 blendedComponent2 = simd::FastDivideBy255(blendedComponent2); 229 230 break; 231 } 232 233 case BLEND_MODE_DARKEN: 234 case BLEND_MODE_LIGHTEN: { 235 // Darken: 236 // val = min((255 - destAlpha) * source + 255 * dest, 237 // 255 * source + (255 - sourceAlpha) * dest); 238 // 239 // Lighten: 240 // val = max((255 - destAlpha) * source + 255 * dest, 241 // 255 * source + (255 - sourceAlpha) * dest); 242 243 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); 244 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 245 246 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = 247 simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255); 248 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = 249 simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha); 250 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); 251 i32x4_t product1_1 = simd::MulAdd16x8x2To32x4( 252 twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, 253 sourceInterleavedWithDest1); 254 i32x4_t product1_2 = simd::MulAdd16x8x2To32x4( 255 twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, 256 
sourceInterleavedWithDest1); 257 blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN 258 ? simd::Min32(product1_1, product1_2) 259 : simd::Max32(product1_1, product1_2); 260 blendedComponent1 = simd::FastDivideBy255(blendedComponent1); 261 262 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = 263 simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255); 264 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = 265 simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha); 266 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); 267 i32x4_t product2_1 = simd::MulAdd16x8x2To32x4( 268 twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, 269 sourceInterleavedWithDest2); 270 i32x4_t product2_2 = simd::MulAdd16x8x2To32x4( 271 twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, 272 sourceInterleavedWithDest2); 273 blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN 274 ? simd::Min32(product2_1, product2_2) 275 : simd::Max32(product2_1, product2_2); 276 blendedComponent2 = simd::FastDivideBy255(blendedComponent2); 277 278 break; 279 } 280 } 281 } 282 283 // The alpha channel is subject to a different calculation than the RGB 284 // channels, and this calculation is the same for all blend modes: 285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha) 286 template <typename i16x8_t, typename i32x4_t> 287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, 288 i16x8_t d_rrrraaaa1234) { 289 // clang-format off 290 // We're using MulAdd16x8x2To32x4, so we need to interleave our factors 291 // appropriately. 
The calculation is rewritten as follows: 292 // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0]) 293 // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) 294 // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) 295 // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0] 296 // clang-format on 297 i16x8_t zeroInterleavedWithSourceAlpha = 298 simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234); 299 i16x8_t fiveTenInterleavedWithDestAlpha = 300 simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234); 301 i16x8_t f1 = 302 simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha); 303 i16x8_t f2 = 304 simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255)); 305 return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2)); 306 } 307 308 template <typename u8x16_t, typename i16x8_t> 309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234, 310 i16x8_t& bbbbgggg1234, 311 i16x8_t& rrrraaaa1234) { 312 // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234 313 i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234); 314 i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234); 315 i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34); 316 i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34); 317 bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24); 318 rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24); 319 } 320 321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t> 322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234, 323 i32x4_t rrrr1234, 324 const i32x4_t& aaaa1234) { 325 // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234 326 i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234); 327 i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234); 328 i16x8_t brbrbrbr1234 = 
simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234); 329 i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234); 330 i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234); 331 i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234); 332 return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34); 333 } 334 335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode> 336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1, 337 const DataSourceSurface::ScopedMap& aInputMap2, 338 const DataSourceSurface::ScopedMap& aOutputMap, 339 const IntSize& aSize) { 340 const uint8_t* source1Data = aInputMap1.GetData(); 341 const uint8_t* source2Data = aInputMap2.GetData(); 342 uint8_t* targetData = aOutputMap.GetData(); 343 int32_t targetStride = aOutputMap.GetStride(); 344 int32_t source1Stride = aInputMap1.GetStride(); 345 int32_t source2Stride = aInputMap2.GetStride(); 346 347 for (int32_t y = 0; y < aSize.height; y++) { 348 for (int32_t x = 0; x < aSize.width; x += 4) { 349 int32_t targetIndex = y * targetStride + 4 * x; 350 int32_t source1Index = y * source1Stride + 4 * x; 351 int32_t source2Index = y * source2Stride + 4 * x; 352 353 u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]); 354 u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]); 355 356 // The blending calculation for the RGB channels all need access to the 357 // alpha channel of their pixel, and the alpha calculation is different, 358 // so it makes sense to separate by channel. 
359 360 i16x8_t s_bbbbgggg1234, s_rrrraaaa1234; 361 i16x8_t d_bbbbgggg1234, d_rrrraaaa1234; 362 UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234); 363 UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234); 364 i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234); 365 i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234); 366 367 // We only use blendedB, blendedG and blendedR. 368 i32x4_t blendedB, blendedG, blendedR, blendedA; 369 BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>( 370 s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, 371 blendedB, blendedG); 372 BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>( 373 s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, 374 blendedR, blendedA); 375 376 // Throw away blendedA and overwrite it with the correct blended alpha. 377 blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234, 378 d_rrrraaaa1234); 379 380 u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>( 381 blendedB, blendedG, blendedR, blendedA); 382 simd::Store8(&targetData[targetIndex], result1234); 383 } 384 } 385 } 386 387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode> 388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD( 389 DataSourceSurface* aInput1, DataSourceSurface* aInput2) { 390 IntSize size = aInput1->GetSize(); 391 RefPtr<DataSourceSurface> target = 392 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 393 if (!target) { 394 return nullptr; 395 } 396 397 DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ); 398 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE); 399 if (aInput1->Equals(aInput2)) { 400 ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1, 401 outputMap, size); 402 } else { 403 DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ); 404 
ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2, 405 outputMap, size); 406 } 407 408 return target.forget(); 409 } 410 411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t> 412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD( 413 DataSourceSurface* aInput1, DataSourceSurface* aInput2, 414 BlendMode aBlendMode) { 415 switch (aBlendMode) { 416 case BLEND_MODE_MULTIPLY: 417 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>( 418 aInput1, aInput2); 419 case BLEND_MODE_SCREEN: 420 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>( 421 aInput1, aInput2); 422 case BLEND_MODE_DARKEN: 423 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>( 424 aInput1, aInput2); 425 case BLEND_MODE_LIGHTEN: 426 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>( 427 aInput1, aInput2); 428 default: 429 return nullptr; 430 } 431 } 432 433 template <MorphologyOperator Operator, typename u8x16_t> 434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) { 435 return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b) 436 : simd::Max8(a, b); 437 } 438 439 // Set every pixel to the per-component minimum or maximum of the pixels around 440 // it that are up to aRadius pixels away from it (horizontally). 
441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t> 442 inline void ApplyMorphologyHorizontal_SIMD( 443 const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData, 444 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) { 445 static_assert( 446 op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE, 447 "unexpected morphology operator"); 448 449 int32_t kernelSize = aRadius + 1 + aRadius; 450 MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0"); 451 MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3); 452 int32_t completeKernelSizeForFourPixels = kernelSize + 3; 453 MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 || 454 completeKernelSizeForFourPixels % 4 == 2); 455 456 // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just 457 // the way we need them to be. 458 459 IntRect sourceRect = aDestRect; 460 sourceRect.Inflate(aRadius, 0); 461 462 for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) { 463 int32_t kernelStartX = aDestRect.X() - aRadius; 464 for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); 465 x += 4, kernelStartX += 4) { 466 // We process four pixels (16 color values) at a time. 467 // aSourceData[0] points to the pixel located at aDestRect.TopLeft(); 468 // source values can be read beyond that because the source is extended 469 // by aRadius pixels. 470 471 int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX; 472 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); 473 u8x16_t m1234 = p1234; 474 475 for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) { 476 u8x16_t p5678 = 477 (kernelStartX + i < sourceRect.XMost()) 478 ? 
simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) 479 : simd::FromZero8<u8x16_t>(); 480 u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678); 481 u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678); 482 m1234 = Morph8<op, u8x16_t>(m1234, p2345); 483 m1234 = Morph8<op, u8x16_t>(m1234, p3456); 484 if (i + 2 < completeKernelSizeForFourPixels) { 485 u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678); 486 m1234 = Morph8<op, u8x16_t>(m1234, p4567); 487 m1234 = Morph8<op, u8x16_t>(m1234, p5678); 488 } 489 p1234 = p5678; 490 } 491 492 int32_t destIndex = y * aDestStride + 4 * x; 493 simd::Store8(&aDestData[destIndex], m1234); 494 } 495 } 496 } 497 498 template <typename i16x8_t, typename u8x16_t> 499 inline void ApplyMorphologyHorizontal_SIMD( 500 const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData, 501 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius, 502 MorphologyOperator aOp) { 503 if (aOp == MORPHOLOGY_OPERATOR_ERODE) { 504 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>( 505 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 506 } else { 507 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, 508 u8x16_t>( 509 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 510 } 511 } 512 513 // Set every pixel to the per-component minimum or maximum of the pixels around 514 // it that are up to aRadius pixels away from it (vertically). 
515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t> 516 static void ApplyMorphologyVertical_SIMD( 517 const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData, 518 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) { 519 static_assert( 520 op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE, 521 "unexpected morphology operator"); 522 523 int32_t startY = aDestRect.Y() - aRadius; 524 int32_t endY = aDestRect.Y() + aRadius; 525 for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); 526 y++, startY++, endY++) { 527 for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) { 528 int32_t sourceIndex = startY * aSourceStride + 4 * x; 529 u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); 530 sourceIndex += aSourceStride; 531 for (int32_t iy = startY + 1; iy <= endY; 532 iy++, sourceIndex += aSourceStride) { 533 u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); 534 u = Morph8<op, u8x16_t>(u, u2); 535 } 536 537 int32_t destIndex = y * aDestStride + 4 * x; 538 simd::Store8(&aDestData[destIndex], u); 539 } 540 } 541 } 542 543 template <typename i16x8_t, typename u8x16_t> 544 inline void ApplyMorphologyVertical_SIMD( 545 const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData, 546 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius, 547 MorphologyOperator aOp) { 548 if (aOp == MORPHOLOGY_OPERATOR_ERODE) { 549 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>( 550 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 551 } else { 552 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>( 553 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 554 } 555 } 556 557 template <typename i32x4_t, typename i16x8_t> 558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, 559 const i32x4_t& bias) { 560 // int16_t p[8] == { b, g, r, a, b, g, r, a }. 
561 // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }. 562 // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }. 563 // int32_t bias[4] == { _B, _G, _R, _A }. 564 565 i32x4_t sum = bias; 566 567 // int16_t bg[8] = { b, g, b, g, b, g, b, g }; 568 i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p)); 569 // int32_t prodsum_bg[4] = 570 // { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA } 571 i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg); 572 sum = simd::Add32(sum, prodsum_bg); 573 574 // uint16_t ra[8] = { r, a, r, a, r, a, r, a }; 575 i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p)); 576 // int32_t prodsum_ra[4] = 577 // { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA } 578 i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra); 579 sum = simd::Add32(sum, prodsum_ra); 580 581 // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }. 582 return sum; 583 } 584 585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t> 586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD( 587 DataSourceSurface* aInput, const Matrix5x4& aMatrix) { 588 IntSize size = aInput->GetSize(); 589 RefPtr<DataSourceSurface> target = 590 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 591 if (!target) { 592 return nullptr; 593 } 594 595 DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ); 596 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE); 597 598 const uint8_t* sourceData = inputMap.GetData(); 599 uint8_t* targetData = outputMap.GetData(); 600 int32_t sourceStride = inputMap.GetStride(); 601 int32_t targetStride = outputMap.GetStride(); 602 603 const int16_t factor = 128; 604 const Float floatElementMax = INT16_MAX / factor; // 255 605 MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, 606 "badly chosen float-to-int scale"); 607 608 const Float* floats = &aMatrix._11; 
609 610 ptrdiff_t componentOffsets[4] = { 611 B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G, 612 B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A}; 613 614 // We store the color matrix in rows_bgra in the following format: 615 // { bB, bG, bR, bA, gB, gG, gR, gA }. 616 // { bB, gB, bG, gG, bR, gR, bA, gA } 617 // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16 618 // which works especially well for our use case. 619 int16_t rows_bgra[2][8]; 620 for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) { 621 for (size_t colIndex = 0; colIndex < 4; colIndex++) { 622 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex]; 623 Float clampedFloatMatrixElement = 624 std::clamp(floatMatrixElement, -floatElementMax, floatElementMax); 625 int16_t scaledIntMatrixElement = 626 int16_t(floorf(clampedFloatMatrixElement * factor + 0.5)); 627 int8_t bg_or_ra = componentOffsets[rowIndex] / 2; 628 int8_t g_or_a = componentOffsets[rowIndex] % 2; 629 int8_t B_or_G_or_R_or_A = componentOffsets[colIndex]; 630 rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = 631 scaledIntMatrixElement; 632 } 633 } 634 635 int32_t rowBias[4]; 636 Float biasMax = 637 (INT32_MAX - 4 * 255 * INT16_MAX - (factor / 2)) / (factor * 255); 638 for (size_t colIndex = 0; colIndex < 4; colIndex++) { 639 size_t rowIndex = 4; 640 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex]; 641 Float clampedFloatMatrixElement = 642 std::clamp(floatMatrixElement, -biasMax, biasMax); 643 // Add 0.5 before multiplying by factor so that the later bitshift dividing 644 // by factor is rounding to nearest 645 Float scaledFloatMatrixElement = 646 (clampedFloatMatrixElement * 255 + 0.5) * factor; 647 int32_t scaledIntMatrixElement = 648 int32_t(floorf(scaledFloatMatrixElement + 0.5)); 649 rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement; 650 } 651 652 i16x8_t row_bg_v = simd::FromI16<i16x8_t>( 653 rows_bgra[0][0], rows_bgra[0][1], 
rows_bgra[0][2], rows_bgra[0][3], 654 rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]); 655 656 i16x8_t row_ra_v = simd::FromI16<i16x8_t>( 657 rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3], 658 rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]); 659 660 i32x4_t rowsBias_v = 661 simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]); 662 663 for (int32_t y = 0; y < size.height; y++) { 664 for (int32_t x = 0; x < size.width; x += 4) { 665 MOZ_ASSERT(sourceStride >= 4 * (x + 4), 666 "need to be able to read 4 pixels at this position"); 667 MOZ_ASSERT(targetStride >= 4 * (x + 4), 668 "need to be able to write 4 pixels at this position"); 669 int32_t sourceIndex = y * sourceStride + 4 * x; 670 int32_t targetIndex = y * targetStride + 4 * x; 671 672 // We load 4 pixels, unpack them, process them 1 pixel at a time, and 673 // finally pack and store the 4 result pixels. 674 675 u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); 676 677 // Splat needed to get each pixel twice into i16x8 678 i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234)); 679 i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234)); 680 i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234)); 681 i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234)); 682 683 i32x4_t result_p1 = 684 ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v); 685 i32x4_t result_p2 = 686 ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v); 687 i32x4_t result_p3 = 688 ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v); 689 i32x4_t result_p4 = 690 ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v); 691 692 static_assert(factor == 1 << 7, 693 "Please adapt the calculation in the lines below for a " 694 "different factor."); 695 u8x16_t result_p1234 = simd::PackAndSaturate32To8( 696 simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2), 697 
simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
      simd::Store8(&targetData[targetIndex], result_p1234);
    }
  }

  return target.forget();
}

// Composites two pixels, held as eight 16-bit lanes, using the operator
// selected at compile time by aCompositeOperator.
//
// source / dest: bgra bgra
// sourceAlpha / destAlpha: aaaa aaaa
// result: bgra bgra
//
// The "val = ..." comment in each case gives the per-lane expression that is
// computed before the result is passed through simd::FastDivideBy255 /
// simd::FastDivideBy255_16. An operator that matches none of the cases yields
// a vector built from simd::FromU16(0).
//
// NOTE(review): the two-term cases interleave (dest, source) with the matching
// pair of factors and feed them to simd::MulAdd16x8x2To32x4, which presumably
// multiplies lane pairs and adds adjacent products into 32-bit lanes - confirm
// against SIMD.h.
template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
                                         u16x8_t dest,
                                         const u16x8_t& destAlpha) {
  u16x8_t x255 = simd::FromU16<u16x8_t>(255);

  switch (aCompositeOperator) {
    case COMPOSITE_OPERATOR_OVER: {
      // val = dest * (255 - sourceAlpha) + source * 255;
      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);

      // Low half of the lanes.
      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
      u16x8_t rightFactor1 =
          simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
      i32x4_t result1 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);

      // High half of the lanes.
      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
      u16x8_t rightFactor2 =
          simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
      i32x4_t result2 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);

      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
                                          simd::FastDivideBy255(result2));
    }

    case COMPOSITE_OPERATOR_IN: {
      // val = source * destAlpha;
      return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
    }

    case COMPOSITE_OPERATOR_OUT: {
      // val = source * (255 - destAlpha);
      u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
      return simd::FastDivideBy255_16(prod);
    }

    case COMPOSITE_OPERATOR_ATOP: {
      // val = dest * (255 - sourceAlpha) + source * destAlpha;
      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);

      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
      u16x8_t rightFactor1 =
          simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
      i32x4_t result1 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);

      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
      u16x8_t rightFactor2 =
          simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
      i32x4_t result2 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);

      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
                                          simd::FastDivideBy255(result2));
    }

    case COMPOSITE_OPERATOR_XOR: {
      // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
      u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);

      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
      u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
                                                  twoFiftyFiveMinusDestAlpha);
      i32x4_t result1 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);

      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
      u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
                                                  twoFiftyFiveMinusDestAlpha);
      i32x4_t result2 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);

      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
                                          simd::FastDivideBy255(result2));
    }

    case COMPOSITE_OPERATOR_LIGHTER: {
      // val = dest * sourceAlpha + source * destAlpha;
      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
      u16x8_t rightFactor1 = simd::InterleaveLo16(sourceAlpha, destAlpha);
      i32x4_t result1 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);

      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
      u16x8_t rightFactor2 = simd::InterleaveHi16(sourceAlpha, destAlpha);
      i32x4_t result2 =
          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);

      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
                                          simd::FastDivideBy255(result2));
    }

    default:
      // No matching operator: return a vector of zeros.
      return simd::FromU16<u16x8_t>(0);
  }
}

// Composites aSource with aDest using the compile-time operator `op` and
// writes the result back into aDest. The iteration bounds come from
// aDest->GetSize(); four pixels (16 bytes) are processed per inner iteration.
//
// NOTE(review): a full vector is loaded and stored at every x that is a
// multiple of 4, even when fewer than four pixels remain in the row. This
// presumably relies on row strides being padded to a multiple of 16 bytes and
// on aSource being at least as large as aDest - confirm with the callers.
template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
static void ApplyComposition(DataSourceSurface* aSource,
                             DataSourceSurface* aDest) {
  IntSize size = aDest->GetSize();

  DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
  DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);

  const uint8_t* sourceData = input.GetData();
  uint8_t* destData = output.GetData();
  uint32_t sourceStride = input.GetStride();
  uint32_t destStride = output.GetStride();

  for (int32_t y = 0; y < size.height; y++) {
    for (int32_t x = 0; x < size.width; x += 4) {
      uint32_t sourceIndex = y * sourceStride + 4 * x;
      uint32_t destIndex = y * destStride + 4 * x;

      u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
      u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);

      // Pixels 1 and 2: widen to 16-bit lanes and build the alpha vectors.
      // Splat16<3, 3> presumably broadcasts lane 3 (the "a" of bgra) within
      // each pixel - see the lane layout documented on CompositeTwoPixels.
      u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
      u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
      u16x8_t sa12 = simd::Splat16<3, 3>(s12);
      u16x8_t da12 = simd::Splat16<3, 3>(d12);
      u16x8_t result12 =
          CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);

      // Pixels 3 and 4: same treatment for the high half of the vectors.
      u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
      u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
      u16x8_t sa34 = simd::Splat16<3, 3>(s34);
      u16x8_t da34 = simd::Splat16<3, 3>(d34);
      u16x8_t result34 =
          CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);

      // Narrow back to bytes and overwrite the destination pixels.
      u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
      simd::Store8(&destData[destIndex], result1234);
    }
  }
}

// Maps the runtime aOperator onto the matching compile-time instantiation of
// ApplyComposition. Note that this template's i16x8_t parameter is forwarded
// as ApplyComposition's u16x8_t parameter. An operator value without a case
// hits MOZ_CRASH.
template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
static void ApplyComposition_SIMD(DataSourceSurface* aSource,
                                  DataSourceSurface* aDest,
                                  CompositeOperator aOperator) {
  switch (aOperator) {
    case COMPOSITE_OPERATOR_OVER:
      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
          aSource, aDest);
      break;
    case COMPOSITE_OPERATOR_IN:
      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
          aSource, aDest);
      break;
    case COMPOSITE_OPERATOR_OUT:
      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
          aSource, aDest);
      break;
    case COMPOSITE_OPERATOR_ATOP:
      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
          aSource, aDest);
      break;
    case COMPOSITE_OPERATOR_XOR:
      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
          aSource, aDest);
      break;
    case COMPOSITE_OPERATOR_LIGHTER:
      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_LIGHTER>(
          aSource, aDest);
      break;
    default:
      MOZ_CRASH("GFX: Incomplete switch");
  }
}

// Splits interleaved 4-byte pixels from sourceData into four separate
// single-byte-per-pixel planes (channel0Data..channel3Data), 16 pixels per
// inner iteration. Going by the variable names, the source byte order is
// b, g, r, a, so channel0 receives b, channel1 g, channel2 r and channel3 a.
//
// sourceStride and channelStride are the row strides, in bytes, of the source
// and of each channel plane respectively.
//
// NOTE(review): every iteration stores a full 16 bytes to each channel plane,
// so the planes presumably need rows padded to a multiple of 16 bytes -
// confirm with the callers.
template <typename u8x16_t>
static void SeparateColorChannels_SIMD(
    const IntSize& size, const uint8_t* sourceData, int32_t sourceStride,
    uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
    uint8_t* channel3Data, int32_t channelStride) {
  for (int32_t y = 0; y < size.height; y++) {
    for (int32_t x = 0; x < size.width; x += 16) {
      // Process 16 pixels at a time.
      int32_t sourceIndex = y * sourceStride + 4 * x;
      int32_t targetIndex = y * channelStride + x;

      // Vectors 2..4 default to zero; each is only loaded when its first byte
      // lies within the row stride, so skipped groups contribute zeros.
      u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
      u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
      u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();

      u8x16_t bgrabgrabgrabgra1 =
          simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
      if (4 * (x + 4) < sourceStride) {
        bgrabgrabgrabgra2 =
            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
      }
      if (4 * (x + 8) < sourceStride) {
        bgrabgrabgrabgra3 =
            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
      }
      if (4 * (x + 12) < sourceStride) {
        bgrabgrabgrabgra4 =
            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
      }

      // Four rounds of byte interleaving regroup the data so that, as the
      // variable names track, each final vector holds a single channel.
      u8x16_t bbggrraabbggrraa1 =
          simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
      u8x16_t bbggrraabbggrraa2 =
          simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
      u8x16_t bbggrraabbggrraa3 =
          simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
      u8x16_t bbggrraabbggrraa4 =
          simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
      u8x16_t bbbbggggrrrraaaa1 =
          simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
      u8x16_t bbbbggggrrrraaaa2 =
          simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
      u8x16_t bbbbggggrrrraaaa3 =
          simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
      u8x16_t bbbbggggrrrraaaa4 =
          simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
      u8x16_t bbbbbbbbgggggggg1 =
          simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
      u8x16_t rrrrrrrraaaaaaaa1 =
          simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
      u8x16_t bbbbbbbbgggggggg2 =
          simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
      u8x16_t rrrrrrrraaaaaaaa2 =
          simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
      u8x16_t bbbbbbbbbbbbbbbb =
          simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
      u8x16_t gggggggggggggggg =
          simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
      u8x16_t rrrrrrrrrrrrrrrr =
          simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
      u8x16_t aaaaaaaaaaaaaaaa =
          simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);

      simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
      simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
      simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
      simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
    }
  }
}

// Interleaves four single-byte-per-pixel planes (channel0Data..channel3Data)
// into 4-byte pixels in resultData, 16 pixels per inner iteration. Going by
// the variable names, the output byte order is b, g, r, a with channel0
// supplying b, channel1 g, channel2 r and channel3 a.
//
// resultStride and channelStride are the row strides, in bytes, of the result
// and of each channel plane respectively.
//
// NOTE(review): every iteration loads a full 16 bytes from each channel
// plane, so the planes presumably need rows padded to a multiple of 16 bytes
// - confirm with the callers.
template <typename u8x16_t>
static void CombineColorChannels_SIMD(
    const IntSize& size, int32_t resultStride, uint8_t* resultData,
    int32_t channelStride, uint8_t* channel0Data, const uint8_t* channel1Data,
    const uint8_t* channel2Data, const uint8_t* channel3Data) {
  for (int32_t y = 0; y < size.height; y++) {
    for (int32_t x = 0; x < size.width; x += 16) {
      // Process 16 pixels at a time.
      int32_t resultIndex = y * resultStride + 4 * x;
      int32_t channelIndex = y * channelStride + x;

      u8x16_t bbbbbbbbbbbbbbbb =
          simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
      u8x16_t gggggggggggggggg =
          simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
      u8x16_t rrrrrrrrrrrrrrrr =
          simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
      u8x16_t aaaaaaaaaaaaaaaa =
          simd::Load8<u8x16_t>(&channel3Data[channelIndex]);

      // First round pairs b with r and g with a; the second round interleaves
      // those pairs, which the variable names track as b, g, r, a order.
      u8x16_t brbrbrbrbrbrbrbr1 =
          simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
      u8x16_t brbrbrbrbrbrbrbr2 =
          simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
      u8x16_t gagagagagagagaga1 =
          simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
      u8x16_t gagagagagagagaga2 =
          simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);

      u8x16_t bgrabgrabgrabgra1 =
          simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
      u8x16_t bgrabgrabgrabgra2 =
          simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
      u8x16_t bgrabgrabgrabgra3 =
          simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
      u8x16_t bgrabgrabgrabgra4 =
          simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);

      // The first group of four pixels is always stored; the remaining groups
      // are only stored when their first byte lies within the row stride.
      simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
      if (4 * (x + 4) < resultStride) {
        simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
      }
      if (4 * (x + 8) < resultStride) {
        simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
      }
      if (4 * (x + 12) < resultStride) {
        simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
      }
    }
  }
}

// Scales every byte of every source pixel (all four channels) by aOpacity and
// writes the result to aTargetData, four pixels per inner iteration.
//
// aOpacity is converted once to an 8-bit factor, roundf(255 * aOpacity).
// NOTE(review): this assumes aOpacity is within [0, 1]; values outside that
// range would not fit the uint8_t conversion - confirm with the callers.
//
// NOTE(review): a full vector is loaded and stored at every x that is a
// multiple of 4, which presumably relies on padded row strides - confirm.
template <typename u16x8_t, typename u8x16_t>
static void DoOpacityCalculation_SIMD(const IntSize& aSize,
                                      uint8_t* aTargetData,
                                      int32_t aTargetStride,
                                      const uint8_t* aSourceData,
                                      int32_t aSourceStride, Float aOpacity) {
  uint8_t alphaValue = uint8_t(roundf(255.f * aOpacity));
  u16x8_t alphaValues =
      simd::FromU16<u16x8_t>(alphaValue, alphaValue, alphaValue, alphaValue,
                             alphaValue, alphaValue, alphaValue, alphaValue);
  for (int32_t y = 0; y < aSize.height; y++) {
    for (int32_t x = 0; x < aSize.width; x += 4) {
      int32_t inputIndex = y * aSourceStride + 4 * x;
      int32_t targetIndex = y * aTargetStride + 4 * x;

      u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
      u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
      u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);

      // Multiply all components with alpha.
      p12 = simd::Mul16(p12, alphaValues);
      p34 = simd::Mul16(p34, alphaValues);

      // Scale back down and pack. Note that ShiftRight16<8> is a shift by 8
      // bits, i.e. a division by 256 rather than an exact division by 255, so
      // the result is an approximation (e.g. 255 * 255 >> 8 == 254).
      u8x16_t result = simd::PackAndSaturate16To8(simd::ShiftRight16<8>(p12),
                                                  simd::ShiftRight16<8>(p34));

      simd::Store8(&aTargetData[targetIndex], result);
    }
  }
}

// Picks the SVGTurbulenceRenderer instantiation that matches the runtime
// aType / aStitch pair and returns the surface produced by its Render(aSize,
// aOffset). Returns nullptr when aType matches neither case of the switch.
//
// RETURN_TURBULENCE declares a local renderer and returns from this function,
// so within each case the statement after the `if (aStitch)` block is only
// reached when aStitch is false.
template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
    const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
    int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
    const Rect& aTileRect) {
#define RETURN_TURBULENCE(Type, Stitch)                                    \
  SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
      aBaseFrequency, aSeed, aNumOctaves, aTileRect);                      \
  return renderer.Render(aSize, aOffset);

  switch (aType) {
    case TURBULENCE_TYPE_TURBULENCE: {
      if (aStitch) {
        RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
      }
      RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
    }
    case TURBULENCE_TYPE_FRACTAL_NOISE: {
      if (aStitch) {
        RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
      }
      RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
    }
  }
  return nullptr;
#undef RETURN_TURBULENCE
}

// k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
//
// Evaluates the arithmetic-combine expression above for two pixels held as
// eight 16-bit lanes (in1, in2). k1And4 and k2And3 are the coefficient
// vectors with k1/k4 and k2/k3 interleaved lane by lane, as built by the
// ScopedMap overload of ApplyArithmeticCombine_SIMD below: k1..k3 are scaled
// by 128 (7 fractional bits) and k4 by 255. The result is shifted right by 7
// to drop the fractional bits and packed to 16 bits with saturation.
template <typename i32x4_t, typename i16x8_t>
static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
    i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
  // Calculate input product: inProd = (in1 * in2) / 255.
  i32x4_t inProd_1, inProd_2;
  simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
  i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
                                               simd::FastDivideBy255(inProd_2));

  // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
  // inProd is interleaved with the constant 128 so that it lines up with the
  // (k1, k4) pairs in k1And4.
  i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
  i16x8_t inProd1AndOneTwentyEight =
      simd::InterleaveLo16(inProd, oneTwentyEight);
  i16x8_t inProd2AndOneTwentyEight =
      simd::InterleaveHi16(inProd, oneTwentyEight);
  i32x4_t inProdTimesK1PlusK4_1 =
      simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
  i32x4_t inProdTimesK1PlusK4_2 =
      simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);

  // Calculate k2 * in1 + k3 * in2
  i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
  i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
  i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
  i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);

  // Sum everything up and truncate the fractional part.
  i32x4_t result_1 =
      simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
  i32x4_t result_2 =
      simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
  return simd::PackAndSaturate32To16(result_1, result_2);
}

// Applies the arithmetic combine to two already-mapped inputs and writes the
// result through aOutputMap, four pixels per inner iteration over aSize.
// aK1..aK4 are the floating-point coefficients; they are clamped and
// converted to 16-bit fixed point once, before the pixel loop.
//
// NOTE(review): a full vector is loaded and stored at every x that is a
// multiple of 4, which presumably relies on padded row strides and on all
// three maps covering at least aSize - confirm with the callers.
template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
static void ApplyArithmeticCombine_SIMD(
    const DataSourceSurface::ScopedMap& aInputMap1,
    const DataSourceSurface::ScopedMap& aInputMap2,
    const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
    Float aK1, Float aK2, Float aK3, Float aK4) {
  const uint8_t* source1Data = aInputMap1.GetData();
  const uint8_t* source2Data = aInputMap2.GetData();
  uint8_t* targetData = aOutputMap.GetData();
  uint32_t source1Stride = aInputMap1.GetStride();
  uint32_t source2Stride = aInputMap2.GetStride();
  uint32_t targetStride = aOutputMap.GetStride();

  // The arithmetic combine filter does the following calculation:
  // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
  //
  // Or, with in1/2 integers between 0 and 255:
  // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
  //
  // We want the whole calculation to happen in integer, with 16-bit factors.
  // So we convert our factors to fixed-point with precision 1.8.7.
  // K4 is premultiplied with 255, and it will be multiplied with 128 later
  // during the actual calculation, because premultiplying it with 255 * 128
  // would overflow int16.

  // The clamp ranges keep the scaled values inside int16: 255 * 128 and
  // 128 * 255 are both 32640. Adding 0.5f before floorf rounds to nearest.
  i16x8_t k1 = simd::FromI16<i16x8_t>(
      int16_t(floorf(std::clamp(aK1, -255.0f, 255.0f) * 128 + 0.5f)));
  i16x8_t k2 = simd::FromI16<i16x8_t>(
      int16_t(floorf(std::clamp(aK2, -255.0f, 255.0f) * 128 + 0.5f)));
  i16x8_t k3 = simd::FromI16<i16x8_t>(
      int16_t(floorf(std::clamp(aK3, -255.0f, 255.0f) * 128 + 0.5f)));
  i16x8_t k4 = simd::FromI16<i16x8_t>(
      int16_t(floorf(std::clamp(aK4, -128.0f, 128.0f) * 255 + 0.5f)));

  // Pair the coefficients up in the lane layout that
  // ArithmeticCombineTwoPixels expects.
  i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
  i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);

  for (int32_t y = 0; y < aSize.height; y++) {
    for (int32_t x = 0; x < aSize.width; x += 4) {
      uint32_t source1Index = y * source1Stride + 4 * x;
      uint32_t source2Index = y * source2Stride + 4 * x;
      uint32_t targetIndex = y * targetStride + 4 * x;

      // Load and unpack.
      u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
      u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
      i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
      i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
      i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
      i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);

      // Multiply and add.
      i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
          in1_12, in2_12, k1And4, k2And3);
      i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
          in1_34, in2_34, k1And4, k2And3);

      // Pack and store.
      simd::Store8(&targetData[targetIndex],
                   simd::PackAndSaturate16To8(result_12, result_34));
    }
  }
}

// Surface-level entry point for the arithmetic combine. Creates a B8G8R8A8
// target with the size of aInput1, maps the inputs and the target, runs the
// ScopedMap overload above and returns the target. Returns nullptr when the
// target surface cannot be created.
//
// When aInput1->Equals(aInput2), the map of aInput1 is passed for both inputs
// instead of creating a second ScopedMap (presumably to avoid mapping the
// same surface twice - confirm).
template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
    DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
    Float aK2, Float aK3, Float aK4) {
  IntSize size = aInput1->GetSize();
  RefPtr<DataSourceSurface> target =
      Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
  if (!target) {
    return nullptr;
  }

  DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
  DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);

  if (aInput1->Equals(aInput2)) {
    ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
        inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
  } else {
    DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
    ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
        inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
  }

  return target.forget();
}

}  // namespace gfx
}  // namespace mozilla