SIMD.cpp (20310B)
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/SIMD.h"

#include <cstring>
#include <stdint.h>
#include <type_traits>

#include "mozilla/EndianUtils.h"
#include "mozilla/SSE.h"

#ifdef MOZILLA_PRESUME_SSE2

#  include <immintrin.h>

#endif

namespace mozilla {

template <typename TValue>
const TValue* FindInBufferNaive(const TValue* ptr, TValue value,
                                size_t length) {
  const TValue* end = ptr + length;
  while (ptr < end) {
    if (*ptr == value) {
      return ptr;
    }
    ptr++;
  }
  return nullptr;
}

#ifdef MOZILLA_PRESUME_SSE2

const __m128i* Cast128(uintptr_t ptr) {
  return reinterpret_cast<const __m128i*>(ptr);
}

template <typename T>
T GetAs(uintptr_t ptr) {
  return *reinterpret_cast<const T*>(ptr);
}

// Akin to ceil/floor, AlignDown/AlignUp will return the original pointer if it
// is already aligned.
uintptr_t AlignDown16(uintptr_t ptr) { return ptr & ~0xf; }

uintptr_t AlignUp16(uintptr_t ptr) { return AlignDown16(ptr + 0xf); }

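// For example: AlignDown16(0x1009) == 0x1000 and AlignUp16(0x1009) == 0x1010,
// while an already 16-byte-aligned address such as 0x1010 is returned
// unchanged by both helpers.
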
template <typename TValue>
__m128i CmpEq128(__m128i a, __m128i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
  if (sizeof(TValue) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
}

#  ifdef __GNUC__

// Earlier versions of GCC are missing the _mm_loadu_si32 instruction. This
// workaround from Peter Cordes (https://stackoverflow.com/a/72837992) compiles
// down to the same instructions. We could just replace _mm_loadu_si32
// everywhere, but here the substitution is only made under __GNUC__.
__m128i Load32BitsIntoXMM(uintptr_t ptr) {
  int tmp;
  memcpy(&tmp, reinterpret_cast<const void*>(ptr),
         sizeof(tmp));              // unaligned aliasing-safe load
  return _mm_cvtsi32_si128(tmp);    // efficient on GCC/clang/MSVC
}

#  else

__m128i Load32BitsIntoXMM(uintptr_t ptr) {
  return _mm_loadu_si32(Cast128(ptr));
}

#  endif

const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
                          uintptr_t d) {
  __m128i haystackA = Load32BitsIntoXMM(a);
  __m128i cmpA = CmpEq128<char>(needle, haystackA);
  __m128i haystackB = Load32BitsIntoXMM(b);
  __m128i cmpB = CmpEq128<char>(needle, haystackB);
  __m128i haystackC = Load32BitsIntoXMM(c);
  __m128i cmpC = CmpEq128<char>(needle, haystackC);
  __m128i haystackD = Load32BitsIntoXMM(d);
  __m128i cmpD = CmpEq128<char>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask & 0xf) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename TValue>
const TValue* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
  __m128i haystackA = _mm_loadu_si128(Cast128(a));
  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = _mm_loadu_si128(Cast128(b));
  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = _mm_loadu_si128(Cast128(c));
  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = _mm_loadu_si128(Cast128(d));
  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

enum class HaystackOverlap {
  Overlapping,
  Sequential,
};

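// Worked example of the movemask/ctz pattern in Check4x16Bytes above: if the
// needle byte matches only byte index 2 of haystackA, _mm_cmpeq_epi8 sets that
// byte of cmpA to 0xff, _mm_movemask_epi8(cmpA) is 0b100 (4),
// __builtin_ctz(4) is 2, and the function returns a + 2.
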
// Check two 16-byte chunks for the two-element sequence loaded into needle1
// followed by needle2. `carryOut` is an optional pointer which we will
// populate based on whether the last character of b matches needle1. This
// should be provided on subsequent calls via `carryIn` so we can detect cases
// where the last byte of b's 16-byte chunk is needle1 and the first byte of
// the next a's 16-byte chunk is needle2. `overlap` and whether
// `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid
// branching.
template <typename TValue>
const TValue* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
                               uintptr_t b, __m128i* carryIn, __m128i* carryOut,
                               HaystackOverlap overlap) {
  const int shiftRightAmount = 16 - sizeof(TValue);
  const int shiftLeftAmount = sizeof(TValue);
  __m128i haystackA = _mm_loadu_si128(Cast128(a));
  __m128i cmpA1 = CmpEq128<TValue>(needle1, haystackA);
  __m128i cmpA2 = CmpEq128<TValue>(needle2, haystackA);
  __m128i cmpA;
  if (carryIn) {
    cmpA = _mm_and_si128(
        _mm_or_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), *carryIn), cmpA2);
  } else {
    cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2);
  }
  __m128i haystackB = _mm_loadu_si128(Cast128(b));
  __m128i cmpB1 = CmpEq128<TValue>(needle1, haystackB);
  __m128i cmpB2 = CmpEq128<TValue>(needle2, haystackB);
  __m128i cmpB;
  if (overlap == HaystackOverlap::Overlapping) {
    cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2);
  } else {
    MOZ_ASSERT(overlap == HaystackOverlap::Sequential);
    __m128i carryAB = _mm_bsrli_si128(cmpA1, shiftRightAmount);
    cmpB = _mm_and_si128(
        _mm_or_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), carryAB), cmpB2);
  }
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  int orMask = _mm_movemask_epi8(or_ab);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask) -
                                             shiftLeftAmount);
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask) -
                                             shiftLeftAmount);
    }
  }

  if (carryOut) {
    _mm_store_si128(carryOut, _mm_bsrli_si128(cmpB1, shiftRightAmount));
  }

  return nullptr;
}

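// Worked example for the 8-bit case of Check2x2x16Bytes above (ignoring any
// carry-in): if needle1 matches byte 5 of haystackA and needle2 matches
// byte 6, shifting cmpA1 left by one byte moves its 0xff lane to byte 6, the
// AND with cmpA2 leaves a match at byte 6, movemask sets bit 6, and the
// returned pointer is a + 6 - 1 == a + 5, the position of the first element
// of the pair.
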
template <typename TValue>
const TValue* FindInBuffer(const TValue* ptr, TValue value, size_t length) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
  static_assert(std::is_unsigned<TValue>::value);
  uint64_t splat64;
  if (sizeof(TValue) == 1) {
    splat64 = 0x0101010101010101llu;
  } else {
    splat64 = 0x0001000100010001llu;
  }

  // Load our needle into a 16-byte register
  uint64_t u64_value = static_cast<uint64_t>(value) * splat64;
  int64_t i64_value = *reinterpret_cast<int64_t*>(&u64_value);
  __m128i needle = _mm_set_epi64x(i64_value, i64_value);

  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  if ((sizeof(TValue) > 1 && numBytes < 16) || numBytes < 4) {
    while (cur < end) {
      if (GetAs<TValue>(cur) == value) {
        return reinterpret_cast<const TValue*>(cur);
      }
      cur += sizeof(TValue);
    }
    return nullptr;
  }

  if (numBytes < 16) {
    // NOTE: here and below, we have some bit fiddling which could look a
    // little weird. The important thing to note though is that it's just a
    // trick for getting the number 4 if numBytes is greater than or equal to
    // 8, and 0 otherwise. This lets us fully cover the range without any
    // branching for the cases where numBytes is in [4,8) or [8,16). We get
    // four 4-byte windows from this - if numBytes >= 8, we check:
    //   [0,4), [4,8), [end - 8, end - 4), [end - 4, end)
    // and if numBytes < 8, we check:
    //   [0,4), [0,4), [end - 4, end), [end - 4, end)
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 8) >> 1);
    uintptr_t c = end - 4 - ((numBytes & 8) >> 1);
    uintptr_t d = end - 4;
    const char* charResult = Check4x4Chars(needle, a, b, c, d);
    // Note: we ensure above that sizeof(TValue) == 1 here, so this is
    // either char to char or char to something like a uint8_t.
    return reinterpret_cast<const TValue*>(charResult);
  }

  if (numBytes < 64) {
    // NOTE: see the above explanation of the similar chunk of code, but in
    // this case, replace 8 with 32 and 4 with 16.
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 32) >> 1);
    uintptr_t c = end - 16 - ((numBytes & 32) >> 1);
    uintptr_t d = end - 16;
    return Check4x16Bytes<TValue>(needle, a, b, c, d);
  }

  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m128i haystack = _mm_loadu_si128(Cast128(cur));
  __m128i cmp = CmpEq128<TValue>(needle, haystack);
  int cmpMask = _mm_movemask_epi8(cmp);
  if (cmpMask) {
    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp16(cur);

  // The address of the final 48-63 bytes. We overlap this with what we check
  // in our hot loop below to avoid branching. Again, the overlap should be
  // negligible compared with a branch mispredict.
  uintptr_t tailStartPtr = AlignDown16(end - 48);
  uintptr_t tailEndPtr = end - 16;

  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 16;
    uintptr_t c = cur + 32;
    uintptr_t d = cur + 48;
    const TValue* result = Check4x16Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
    cur += 64;
  }

  uintptr_t a = tailStartPtr;
  uintptr_t b = tailStartPtr + 16;
  uintptr_t c = tailStartPtr + 32;
  uintptr_t d = tailEndPtr;
  return Check4x16Bytes<TValue>(needle, a, b, c, d);
}

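// Worked example of the branch-free overlap trick in FindInBuffer above, for
// the 16 <= numBytes < 64 case: with numBytes == 40, (numBytes & 32) >> 1 is
// 16, so the four 16-byte loads start at offsets 0, 16, 8 (end - 32) and
// 24 (end - 16); with numBytes == 24 the term is 0, so the loads start at
// offsets 0, 0, 8 and 8. Either way every byte is covered, with overlapping
// loads standing in for branches.
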
template <typename TValue>
const TValue* TwoElementLoop(uintptr_t start, uintptr_t end, TValue v1,
                             TValue v2) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);

  const TValue* cur = reinterpret_cast<const TValue*>(start);
  const TValue* preEnd = reinterpret_cast<const TValue*>(end - sizeof(TValue));

  uint32_t expected = static_cast<uint32_t>(v1) |
                      (static_cast<uint32_t>(v2) << (sizeof(TValue) * 8));
  while (cur < preEnd) {
    // NOTE: this should only ever be called on little endian architectures.
    static_assert(MOZ_LITTLE_ENDIAN());
    // We or cur[0] and cur[1] together explicitly and compare to expected,
    // in order to avoid UB from just loading them as a uint16_t/uint32_t.
    // However, it will compile down to the same code after optimizations on
    // little endian systems which support unaligned loads. Comparing them
    // value-by-value, however, will not, and seems to perform worse in local
    // microbenchmarking. Even after bitwise or'ing the comparison values
    // together to avoid the short circuit, the compiler doesn't seem to get
    // the hint and creates two branches, the first of which might be
    // frequently mispredicted.
    uint32_t actual = static_cast<uint32_t>(cur[0]) |
                      (static_cast<uint32_t>(cur[1]) << (sizeof(TValue) * 8));
    if (actual == expected) {
      return cur;
    }
    cur++;
  }
  return nullptr;
}

template <typename TValue>
const TValue* FindTwoInBuffer(const TValue* ptr, TValue v1, TValue v2,
                              size_t length) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
  static_assert(std::is_unsigned<TValue>::value);
  uint64_t splat64;
  if (sizeof(TValue) == 1) {
    splat64 = 0x0101010101010101llu;
  } else {
    splat64 = 0x0001000100010001llu;
  }

  // Load our needles into 16-byte registers
  uint64_t u64_v1 = static_cast<uint64_t>(v1) * splat64;
  int64_t i64_v1 = *reinterpret_cast<int64_t*>(&u64_v1);
  __m128i needle1 = _mm_set_epi64x(i64_v1, i64_v1);
  uint64_t u64_v2 = static_cast<uint64_t>(v2) * splat64;
  int64_t i64_v2 = *reinterpret_cast<int64_t*>(&u64_v2);
  __m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2);

  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  if (numBytes < 16) {
    return TwoElementLoop<TValue>(cur, end, v1, v2);
  }

  if (numBytes < 32) {
    uintptr_t a = cur;
    uintptr_t b = end - 16;
    return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, nullptr, nullptr,
                                    HaystackOverlap::Overlapping);
  }

  // Get the initial unaligned load out of the way. This will likely overlap
  // with the aligned stuff below, but the overlapped part should effectively
  // be free.
  __m128i haystack = _mm_loadu_si128(Cast128(cur));
  __m128i cmp1 = CmpEq128<TValue>(needle1, haystack);
  __m128i cmp2 = CmpEq128<TValue>(needle2, haystack);
  int cmpMask1 = _mm_movemask_epi8(cmp1);
  int cmpMask2 = _mm_movemask_epi8(cmp2);
  int cmpMask = (cmpMask1 << sizeof(TValue)) & cmpMask2;
  if (cmpMask) {
    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask) -
                                           sizeof(TValue));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp16(cur);

  // The address of the final 16-31 bytes. We overlap this with what we check
  // in our hot loop below to avoid branching. Again, the overlap should be
  // negligible compared with a branch mispredict.
  uintptr_t tailEndPtr = end - 16;
  uintptr_t tailStartPtr = AlignDown16(tailEndPtr);

  __m128i cmpMaskCarry = _mm_set1_epi32(0);
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 16;
    const TValue* result =
        Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
                                 &cmpMaskCarry, HaystackOverlap::Sequential);
    if (result) {
      return result;
    }
    cur += 32;
  }

  uint32_t carry = (cur == tailStartPtr) ? 0xffffffff : 0;
  __m128i wideCarry = Load32BitsIntoXMM(reinterpret_cast<uintptr_t>(&carry));
  cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry);
  uintptr_t a = tailStartPtr;
  uintptr_t b = tailEndPtr;
  return Check2x2x16Bytes<TValue>(needle1, needle2, a, b, &cmpMaskCarry,
                                  nullptr, HaystackOverlap::Overlapping);
}

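// Note on the tail handling above: the carried-out comparison only describes
// the element immediately before the point where the main loop stopped. If
// the loop stopped exactly at tailStartPtr, that carry still applies to the
// tail block and is kept; otherwise the block at tailStartPtr was already
// examined with the correct carry inside the loop, so the stale carry is
// masked to zero to keep the overlapping re-check from reporting a spurious
// match.
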
const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
  // Signed chars are just really annoying to do bit logic with. Convert to
  // unsigned at the outermost scope so we don't have to worry about it.
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uvalue = static_cast<unsigned char>(value);
  const unsigned char* uresult =
      FindInBuffer<unsigned char>(uptr, uvalue, length);
  return reinterpret_cast<const char*>(uresult);
}

// So, this is a bit awkward. It generally simplifies things if we can just
// assume all the AVX2 code is 64-bit, so we have this preprocessor guard
// in SIMD_avx2 over all of its actual code, and it also defines versions
// of its endpoints that just assert false if the guard is not satisfied.
// A 32 bit processor could implement the AVX2 instruction set though, which
// would result in it passing the supports_avx2() check and landing in an
// assertion failure. Accordingly, we just don't allow that to happen. We
// are not particularly concerned about ensuring that newer 32 bit processors
// get access to the AVX2 functions exposed here.
#  if defined(MOZILLA_MAY_SUPPORT_AVX2) && defined(__x86_64__)

bool SupportsAVX2() { return supports_avx2(); }

#  else

bool SupportsAVX2() { return false; }

#  endif

const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
  if (SupportsAVX2()) {
    return memchr8AVX2(ptr, value, length);
  }
  return memchr8SSE2(ptr, value, length);
}

const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  return FindInBuffer<char16_t>(ptr, value, length);
}

const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                               size_t length) {
  if (SupportsAVX2()) {
    return memchr16AVX2(ptr, value, length);
  }
  return memchr16SSE2(ptr, value, length);
}

const uint32_t* SIMD::memchr32(const uint32_t* ptr, uint32_t value,
                               size_t length) {
  if (SupportsAVX2()) {
    return memchr32AVX2(ptr, value, length);
  }
  return FindInBufferNaive<uint32_t>(ptr, value, length);
}

const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
                               size_t length) {
  if (SupportsAVX2()) {
    return memchr64AVX2(ptr, value, length);
  }
  return FindInBufferNaive<uint64_t>(ptr, value, length);
}

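// Example (usage sketch) for the dispatching entry points above:
//   const char buf[] = "hello world";
//   const char* hit = mozilla::SIMD::memchr8(buf, 'w', sizeof(buf) - 1);
//   // `hit` points at the 'w' in buf; nullptr is returned when no byte
//   // matches.
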
const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
  // Signed chars are just really annoying to do bit logic with. Convert to
  // unsigned at the outermost scope so we don't have to worry about it.
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uv1 = static_cast<unsigned char>(v1);
  unsigned char uv2 = static_cast<unsigned char>(v2);
  const unsigned char* uresult =
      FindTwoInBuffer<unsigned char>(uptr, uv1, uv2, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr2x16(const char16_t* ptr, char16_t v1, char16_t v2,
                                 size_t length) {
  return FindTwoInBuffer<char16_t>(ptr, v1, v2, length);
}

#else

const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
  const void* result = ::memchr(reinterpret_cast<const void*>(ptr),
                                static_cast<int>(value), length);
  return reinterpret_cast<const char*>(result);
}

const char* SIMD::memchr8SSE2(const char* ptr, char value, size_t length) {
  return memchr8(ptr, value, length);
}

const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                               size_t length) {
  return FindInBufferNaive<char16_t>(ptr, value, length);
}

const char16_t* SIMD::memchr16SSE2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  return memchr16(ptr, value, length);
}

const uint32_t* SIMD::memchr32(const uint32_t* ptr, uint32_t value,
                               size_t length) {
  return FindInBufferNaive<uint32_t>(ptr, value, length);
}

const uint64_t* SIMD::memchr64(const uint64_t* ptr, uint64_t value,
                               size_t length) {
  return FindInBufferNaive<uint64_t>(ptr, value, length);
}

const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
  const char* end = ptr + length - 1;
  while (ptr < end) {
    ptr = memchr8(ptr, v1, end - ptr);
    if (!ptr) {
      return nullptr;
    }
    if (ptr[1] == v2) {
      return ptr;
    }
    ptr++;
  }
  return nullptr;
}

const char16_t* SIMD::memchr2x16(const char16_t* ptr, char16_t v1, char16_t v2,
                                 size_t length) {
  const char16_t* end = ptr + length - 1;
  while (ptr < end) {
    ptr = memchr16(ptr, v1, end - ptr);
    if (!ptr) {
      return nullptr;
    }
    if (ptr[1] == v2) {
      return ptr;
    }
    ptr++;
  }
  return nullptr;
}

#endif

}  // namespace mozilla