/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;

void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());

  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastb(Operand(output), output);
    return;
  }
  vpxor(scratch, scratch, scratch);
  vpshufb(scratch, output, output);
}

void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastw(Operand(output), output);
    return;
  }
  vpshuflw(0, output, output);
  vpshufd(0, output, output);
}

void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastd(Operand(output), output);
    return;
  }
  vpshufd(0, output, output);
}

void MacroAssemblerX86Shared::splatX4(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isSingle() && output.isSimd128());
  if (HasAVX2()) {
    vbroadcastss(Operand(input), output);
    return;
  }
  input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output);
  vshufps(0, input, input, output);
}

void MacroAssemblerX86Shared::splatX2(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isDouble() && output.isSimd128());
  vmovddup(Operand(input.asSimd128()), output);
}
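
// The SSSE3 path in splatX16 above relies on vpshufb: each output byte is
// src[idx & 15], or zero when the index byte's top bit is set, so an all-zero
// index vector broadcasts byte 0.  A scalar model (illustrative sketch, not
// part of this file):
//
//   for (int i = 0; i < 16; i++) {
//     out[i] = (idx[i] & 0x80) ? 0 : src[idx[i] & 15];
//   }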

void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
                                                 Register output,
                                                 unsigned lane) {
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    moveLowInt32(input, output);
  } else {
    vpextrd(lane, input, output);
  }
}

void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  MOZ_ASSERT(input.isSimd128() && output.isSingle());
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    if (input.asSingle() != output) {
      moveFloat32(input, output);
    }
  } else if (lane == 2) {
    moveHighPairToLowPairFloat32(input, output);
  } else {
    uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
    FloatRegister dest = output.asSimd128();
    input = moveSimd128FloatIfNotAVX(input, dest);
    vshufps(mask, input, input, dest);
  }
}

void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  MOZ_ASSERT(input.isSimd128() && output.isDouble());
  if (lane == 0) {
    // The value we want to extract is in the low quadword
    if (input.asDouble() != output) {
      moveDouble(input, output);
    }
  } else {
    vpalignr(Operand(input), output, output, 8);
  }
}

void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
                                                 Register output,
                                                 unsigned lane,
                                                 SimdSign sign) {
  vpextrw(lane, input, Operand(output));
  if (sign == SimdSign::Signed) {
    movswl(output, output);
  }
}

void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
                                                 Register output,
                                                 unsigned lane,
                                                 SimdSign sign) {
  vpextrb(lane, input, Operand(output));
  if (sign == SimdSign::Signed) {
    if (!AllocatableGeneralRegisterSet(Registers::SingleByteRegs).has(output)) {
      xchgl(eax, output);
      movsbl(eax, eax);
      xchgl(eax, output);
    } else {
      movsbl(output, output);
    }
  }
}

void MacroAssemblerX86Shared::replaceLaneFloat32x4(unsigned lane,
                                                   FloatRegister lhs,
                                                   FloatRegister rhs,
                                                   FloatRegister dest) {
  MOZ_ASSERT(lhs.isSimd128() && rhs.isSingle());

  if (lane == 0) {
    if (rhs.asSimd128() == lhs) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
      moveSimd128Float(lhs, dest);
    } else {
      // move low dword of value into low dword of output
      vmovss(rhs, lhs, dest);
    }
  } else {
    vinsertps(vinsertpsMask(0, lane), rhs, lhs, dest);
  }
}

void MacroAssemblerX86Shared::replaceLaneFloat64x2(unsigned lane,
                                                   FloatRegister lhs,
                                                   FloatRegister rhs,
                                                   FloatRegister dest) {
  MOZ_ASSERT(lhs.isSimd128() && rhs.isDouble());

  if (lane == 0) {
    if (rhs.asSimd128() == lhs) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
      moveSimd128Float(lhs, dest);
    } else {
      // move low qword of value into low qword of output
      vmovsd(rhs, lhs, dest);
    }
  } else {
    // move low qword of value into high qword of output
    vshufpd(0, rhs, lhs, dest);
  }
}

void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           FloatRegister temp,
                                           const uint8_t lanes[16]) {
  asMasm().loadConstantSimd128Int(
      SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
  vpblendvb(temp, rhs, lhs, output);
}

void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           const uint16_t lanes[8]) {
  uint32_t mask = 0;
  for (unsigned i = 0; i < 8; i++) {
    if (lanes[i]) {
      mask |= (1 << i);
    }
  }
  vpblendw(mask, rhs, lhs, output);
}

void MacroAssemblerX86Shared::laneSelectSimd128(FloatRegister mask,
                                                FloatRegister lhs,
                                                FloatRegister rhs,
                                                FloatRegister output) {
  vpblendvb(mask, lhs, rhs, output);
}

void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
                                             FloatRegister rhs,
                                             FloatRegister output,
                                             const uint8_t lanes[16]) {
  ScratchSimd128Scope scratch(asMasm());

  // Use pshufb instructions to gather the lanes from each source vector.
  // A negative index creates a zero lane, so the two vectors can be combined.

  // Set scratch = lanes from rhs.
  int8_t idx[16];
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
  }
  rhs = moveSimd128IntIfNotAVX(rhs, scratch);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), rhs, scratch);

  // Set output = lanes from lhs.
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] < 16 ? lanes[i] : -1;
  }
  lhs = moveSimd128IntIfNotAVX(lhs, output);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), lhs, output);

  // Combine.
  vpor(scratch, output, output);
}
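
// What shuffleInt8x16 computes, as a scalar sketch (illustrative only):
//
//   for (int i = 0; i < 16; i++) {
//     out[i] = lanes[i] < 16 ? lhs[lanes[i]] : rhs[lanes[i] - 16];
//   }
//
// Because an index byte with the top bit set makes pshufb produce a zero
// byte, each source is shuffled with -1 in the lanes it does not supply,
// and the two half-results can simply be OR'ed together.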

static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
  return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
}

void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtb(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqb(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtb(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqb(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtb(Operand(lhs), output, output);
    }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtb(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      break;
    case Assembler::Below:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpminub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpminub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareInt8x16(Assembler::Condition cond,
                                             FloatRegister lhs,
                                             const SimdConstant& rhs,
                                             FloatRegister dest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqb,
                    &MacroAssembler::vpcmpeqbSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtb,
                    &MacroAssembler::vpcmpgtbSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
  }
}
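
// SSE provides only signed vector compares (pcmpgt*), so the unsigned
// conditions above are synthesized from unsigned min/max plus equality.
// The identities used, as a sketch:
//
//   belowOrEqual(l, r) = (max(l, r) == r)    // l <= r, unsigned
//   above(l, r)        = ~belowOrEqual(l, r)
//   aboveOrEqual(l, r) = (min(l, r) == r)    // r <= l, unsigned
//   below(l, r)        = ~aboveOrEqual(l, r)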

void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtw(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqw(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtw(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqw(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtw(Operand(lhs), output, output);
    }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtw(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      break;
    case Assembler::Below:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareInt16x8(Assembler::Condition cond,
                                             FloatRegister lhs,
                                             const SimdConstant& rhs,
                                             FloatRegister dest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqw,
                    &MacroAssembler::vpcmpeqwSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtw,
                    &MacroAssembler::vpcmpgtwSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
  }
}

void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtd(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtd(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqd(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtd(Operand(lhs), output, output);
    }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtd(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      break;
    case Assembler::Below:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpminud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpminud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareInt32x4(Assembler::Condition cond,
                                             FloatRegister lhs,
                                             const SimdConstant& rhs,
                                             FloatRegister dest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqd,
                    &MacroAssembler::vpcmpeqdSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtd,
                    &MacroAssembler::vpcmpgtdSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
  }
}

void MacroAssemblerX86Shared::compareForEqualityInt64x2(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  switch (cond) {
    case Assembler::Condition::Equal:
      vpcmpeqq(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vpcmpeqq(rhs, lhs, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareForOrderingInt64x2(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister temp1, FloatRegister temp2, FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  // The pseudo code is for (e.g. > comparison):
  //   __m128i pcmpgtq_sse2 (__m128i a, __m128i b) {
  //     __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
  //     r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
  //     return _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1));
  //   }
  // Credits to https://stackoverflow.com/a/65175746
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpsubq(Operand(lhs), temp1, temp1);
      vpcmpeqd(rhs, temp2, temp2);
      vandpd(temp2, temp1, temp1);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpcmpgtd(rhs, lhs, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      break;
    case Assembler::Condition::LessThan:
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpcmpgtd(Operand(lhs), temp1, temp1);
      vpcmpeqd(Operand(rhs), temp2, temp2);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpsubq(rhs, lhs, output);
      vandpd(temp2, output, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpcmpgtd(Operand(lhs), temp1, temp1);
      vpcmpeqd(Operand(rhs), temp2, temp2);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpsubq(rhs, lhs, output);
      vandpd(temp2, output, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpsubq(Operand(lhs), temp1, temp1);
      vpcmpeqd(rhs, temp2, temp2);
      vandpd(temp2, temp1, temp1);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpcmpgtd(rhs, lhs, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
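
// A note on the SSE2 sequence above (sketch of the scalar equivalence):
// per 64-bit lane, a > b iff hi32(a) > hi32(b) as signed, or
// hi32(a) == hi32(b) and lo32(a) > lo32(b) as unsigned.  The
// (cmpeq32 & (b - a)) term feeds the borrow of the unsigned low-dword
// subtraction into the high dword's sign bit exactly when the high dwords
// are equal, and the final vpshufd(1, 1, 3, 3) copies each lane's high
// dword, which now holds the per-lane boolean, into both dwords of the
// lane.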

void MacroAssemblerX86Shared::compareForOrderingInt64x2AVX(
    FloatRegister lhs, FloatRegister rhs, Assembler::Condition cond,
    FloatRegister output) {
  MOZ_ASSERT(HasSSE42());
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtq(Operand(rhs), lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vpcmpgtq(Operand(lhs), rhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      vpcmpgtq(Operand(lhs), rhs, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vpcmpgtq(Operand(rhs), lhs, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  // TODO Can do better here with three-address compares

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!HasAVX() && !lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovaps(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovaps(lhs, output);
    lhs = output;
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmpleps(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqps(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have
      // to copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat32x4(Assembler::Condition cond,
                                               FloatRegister lhs,
                                               const SimdConstant& rhs,
                                               FloatRegister dest) {
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqps,
                    &MacroAssembler::vcmpeqpsSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltps,
                    &MacroAssembler::vcmpltpsSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpleps,
                    &MacroAssembler::vcmplepsSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqps,
                    &MacroAssembler::vcmpneqpsSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  // TODO Can do better here with three-address compares

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!HasAVX() && !lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovapd(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovapd(lhs, output);
    lhs = output;
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmplepd(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have
      // to copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

void MacroAssemblerX86Shared::compareFloat64x2(Assembler::Condition cond,
                                               FloatRegister lhs,
                                               const SimdConstant& rhs,
                                               FloatRegister dest) {
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqpd,
                    &MacroAssembler::vcmpeqpdSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltpd,
                    &MacroAssembler::vcmpltpdSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmplepd,
                    &MacroAssembler::vcmplepdSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqpd,
                    &MacroAssembler::vcmpneqpdSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Semantics of wasm max and min.
//
//  * -0 < 0
//  * If one input is NaN then that NaN is the output
//  * If both inputs are NaN then the output is selected nondeterministically
//  * Any returned NaN is always made quiet
//  * The MVP spec 2.2.3 says "No distinction is made between signalling and
//    quiet NaNs", suggesting SNaN inputs are allowed and should not fault
//
// Semantics of maxps/minps/maxpd/minpd:
//
//  * If the values are both +/-0 the rhs is returned
//  * If the rhs is SNaN then the rhs is returned
//  * If either value is NaN then the rhs is returned
//  * An SNaN operand does not appear to give rise to an exception, at least
//    not in the JS shell on Linux, though the Intel spec lists Invalid as one
//    of the possible exceptions
//
// Various unaddressed considerations:
//
// It's pretty insane for this to take an Operand rhs - it really needs to be
// a register, given the number of times we access it.
//
// Constant load can be folded into the ANDPS. Do we care? It won't save us
// any registers, since output/temp1/temp2/scratch are all live at the same
// time after the first instruction of the slow path.
//
// Can we use blend for the NaN extraction/insertion? We'd need xmm0 for the
// mask, which is no fun. But it would be lhs UNORD lhs -> mask, blend;
// rhs UNORD rhs -> mask; blend. Better than the mess we have below. But
// we'd still need to setup the QNaN bits, unless we can blend those too
// with the lhs UNORD rhs mask?
//
// If we could determine that both input lanes are NaN then the result of the
// fast path should be fine modulo the QNaN bits, but it's not obvious this is
// much of an advantage.
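
// As a scalar reference for the wasm min semantics above (illustrative
// sketch; wasm_fmin_ref is a hypothetical name, not part of this file):
//
//   float wasm_fmin_ref(float a, float b) {
//     if (std::isnan(a) || std::isnan(b)) {
//       return std::numeric_limits<float>::quiet_NaN();  // one allowed result
//     }
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? a : b;                  // -0 < 0
//     }
//     return a < b ? a : b;
//   }
//
// The max case is symmetric: the signbit test is inverted and the final
// comparison becomes a > b ? a : b.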

void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs,
                                              Operand rhs, FloatRegister temp1,
                                              FloatRegister temp2,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));

  /* clang-format off */ /* leave my comments alone */
  lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output);
  if (isMin) {
    vmovaps(lhs, output);                         // compute
    vminps(rhs, output, output);                  //   min lhs, rhs
    vmovaps(rhs, temp1);                          // compute
    vminps(Operand(lhs), temp1, temp1);           //   min rhs, lhs
    vorps(temp1, output, output);                 // fix min(-0, 0) with OR
  } else {
    vmovaps(lhs, output);                         // compute
    vmaxps(rhs, output, output);                  //   max lhs, rhs
    vmovaps(rhs, temp1);                          // compute
    vmaxps(Operand(lhs), temp1, temp1);           //   max rhs, lhs
    vandps(temp1, output, output);                // fix max(-0, 0) with AND
  }
  vmovaps(lhs, temp1);                            // compute
  vcmpunordps(rhs, temp1, temp1);                 //   lhs UNORD rhs
  vptest(temp1, temp1);                           // check if any unordered
  j(Assembler::Equal, &l);                        //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.

  vmovaps(temp1, temp2);                          // clear NaN lanes of result
  vpandn(output, temp2, temp2);                   //   result now in temp2
  asMasm().vpandSimd128(quietBits, temp1, temp1); // setup QNaN bits in NaN lanes
  vorps(temp1, temp2, temp2);                     //   and OR into result
  vmovaps(lhs, temp1);                            // find NaN lanes
  vcmpunordps(Operand(temp1), temp1, temp1);      //   in lhs
  vmovaps(temp1, output);                         //   (and save them for later)
  vandps(lhs, temp1, temp1);                      //   and extract the NaNs
  vorps(temp1, temp2, temp2);                     //   and add to the result
  vmovaps(rhs, temp1);                            // find NaN lanes
  vcmpunordps(Operand(temp1), temp1, temp1);      //   in rhs
  vpandn(temp1, output, output);                  //   except if they were in lhs
  vandps(rhs, output, output);                    //   and extract the NaNs
  vorps(temp2, output, output);                   //   and add to the result

  bind(&l);
  /* clang-format on */
}
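
// Why the OR/AND in the fast path fixes the zero cases (sketch): minps and
// maxps return the second operand when both inputs are +/-0, so min(l, r)
// and min(r, l) can disagree only in the sign of the zero they return.
// OR-ing the two merges the sign bits, so -0 wins for min; AND-ing keeps a
// sign bit only if set in both, so +0 wins for max:
//
//   bits(-0.0f) = 0x80000000, bits(+0.0f) = 0x00000000
//   min: 0x80000000 | 0x00000000 = 0x80000000   // -0.0f
//   max: 0x80000000 & 0x00000000 = 0x00000000   // +0.0f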

void MacroAssemblerX86Shared::minMaxFloat32x4AVX(bool isMin, FloatRegister lhs,
                                                 FloatRegister rhs,
                                                 FloatRegister temp1,
                                                 FloatRegister temp2,
                                                 FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));

  /* clang-format off */ /* leave my comments alone */
  FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output);
  // Allow rhs to be assigned to scratch when rhs == lhs and rhs == output --
  // don't make a special case, since the semantics require setting up the
  // QNaN bits.
  FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output);
  if (isMin) {
    vminps(Operand(rhs), lhs, temp2);              // min lhs, rhs
    vminps(Operand(lhs), rhs, temp1);              // min rhs, lhs
    vorps(temp1, temp2, output);                   // fix min(-0, 0) with OR
  } else {
    vmaxps(Operand(rhs), lhs, temp2);              // max lhs, rhs
    vmaxps(Operand(lhs), rhs, temp1);              // max rhs, lhs
    vandps(temp1, temp2, output);                  // fix max(-0, 0) with AND
  }
  vcmpunordps(Operand(rhsCopy), lhsCopy, temp1);   // lhs UNORD rhs
  vptest(temp1, temp1);                            // check if any unordered
  j(Assembler::Equal, &l);                         //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.
  vcmpunordps(Operand(lhsCopy), lhsCopy, temp2);   // find NaN lanes in lhs
  vblendvps(temp2, lhsCopy, rhsCopy, temp2);       // fill other lanes from rhs
  asMasm().vporSimd128(quietBits, temp2, temp2);   // setup QNaN bits in NaN lanes
  vblendvps(temp1, temp2, output, output);         // replace NaN lanes from temp2

  bind(&l);
  /* clang-format on */
}

// Exactly as above.
void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs,
                                              Operand rhs, FloatRegister temp1,
                                              FloatRegister temp2,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));

  /* clang-format off */ /* leave my comments alone */
  lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output);
  if (isMin) {
    vmovapd(lhs, output);                         // compute
    vminpd(rhs, output, output);                  //   min lhs, rhs
    vmovapd(rhs, temp1);                          // compute
    vminpd(Operand(lhs), temp1, temp1);           //   min rhs, lhs
    vorpd(temp1, output, output);                 // fix min(-0, 0) with OR
  } else {
    vmovapd(lhs, output);                         // compute
    vmaxpd(rhs, output, output);                  //   max lhs, rhs
    vmovapd(rhs, temp1);                          // compute
    vmaxpd(Operand(lhs), temp1, temp1);           //   max rhs, lhs
    vandpd(temp1, output, output);                // fix max(-0, 0) with AND
  }
  vmovapd(lhs, temp1);                            // compute
  vcmpunordpd(rhs, temp1, temp1);                 //   lhs UNORD rhs
  vptest(temp1, temp1);                           // check if any unordered
  j(Assembler::Equal, &l);                        //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.

  vmovapd(temp1, temp2);                          // clear NaN lanes of result
  vpandn(output, temp2, temp2);                   //   result now in temp2
  asMasm().vpandSimd128(quietBits, temp1, temp1); // setup QNaN bits in NaN lanes
  vorpd(temp1, temp2, temp2);                     //   and OR into result
  vmovapd(lhs, temp1);                            // find NaN lanes
  vcmpunordpd(Operand(temp1), temp1, temp1);      //   in lhs
  vmovapd(temp1, output);                         //   (and save them for later)
  vandpd(lhs, temp1, temp1);                      //   and extract the NaNs
  vorpd(temp1, temp2, temp2);                     //   and add to the result
  vmovapd(rhs, temp1);                            // find NaN lanes
  vcmpunordpd(Operand(temp1), temp1, temp1);      //   in rhs
  vpandn(temp1, output, output);                  //   except if they were in lhs
  vandpd(rhs, output, output);                    //   and extract the NaNs
  vorpd(temp2, output, output);                   //   and add to the result

  bind(&l);
  /* clang-format on */
}

void MacroAssemblerX86Shared::minMaxFloat64x2AVX(bool isMin, FloatRegister lhs,
                                                 FloatRegister rhs,
                                                 FloatRegister temp1,
                                                 FloatRegister temp2,
                                                 FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));

  /* clang-format off */ /* leave my comments alone */
  FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output);
  // Allow rhs to be assigned to scratch when rhs == lhs and rhs == output --
  // don't make a special case, since the semantics require setting up the
  // QNaN bits.
  FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output);
  if (isMin) {
    vminpd(Operand(rhs), lhs, temp2);              // min lhs, rhs
    vminpd(Operand(lhs), rhs, temp1);              // min rhs, lhs
    vorpd(temp1, temp2, output);                   // fix min(-0, 0) with OR
  } else {
    vmaxpd(Operand(rhs), lhs, temp2);              // max lhs, rhs
    vmaxpd(Operand(lhs), rhs, temp1);              // max rhs, lhs
    vandpd(temp1, temp2, output);                  // fix max(-0, 0) with AND
  }
  vcmpunordpd(Operand(rhsCopy), lhsCopy, temp1);   // lhs UNORD rhs
  vptest(temp1, temp1);                            // check if any unordered
  j(Assembler::Equal, &l);                         //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.
  vcmpunordpd(Operand(lhsCopy), lhsCopy, temp2);   // find NaN lanes in lhs
  vblendvpd(temp2, lhsCopy, rhsCopy, temp2);       // fill other lanes from rhs
  asMasm().vporSimd128(quietBits, temp2, temp2);   // setup QNaN bits in NaN lanes
  vblendvpd(temp1, temp2, output, output);         // replace NaN lanes from temp2

  bind(&l);
  /* clang-format on */
}

void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  if (HasAVX()) {
    minMaxFloat32x4AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
    return;
  }
  minMaxFloat32x4(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output);
}

void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  if (HasAVX()) {
    minMaxFloat32x4AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
    return;
  }
  minMaxFloat32x4(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output);
}

void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  if (HasAVX()) {
    minMaxFloat64x2AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
    return;
  }
  minMaxFloat64x2(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output);
}

void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister temp1,
                                           FloatRegister temp2,
                                           FloatRegister output) {
  if (HasAVX()) {
    minMaxFloat64x2AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
    return;
  }
  minMaxFloat64x2(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output);
}

void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest,
    void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
                                           FloatRegister),
    void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);

  // High bytes
  vpalignr(Operand(in), xtmp, xtmp, 8);
  (this->*extend)(Operand(xtmp), xtmp);
  (this->*shift)(scratch, xtmp, xtmp);

  // Low bytes
  (this->*extend)(Operand(dest), dest);
  (this->*shift)(scratch, dest, dest);

  // Mask off garbage to avoid saturation during packing
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
                                  scratch);
  vpand(Operand(scratch), xtmp, xtmp);
  vpand(Operand(scratch), dest, dest);

  vpackuswb(Operand(xtmp), dest, dest);
}
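
// x86 has no per-byte vector shifts, so the helper above widens each byte to
// a 16-bit lane (with the caller-supplied vpmovzxbw/vpmovsxbw extend), shifts
// whole words, masks every word back to 0..255 so that vpackuswb cannot
// saturate, and packs the two halves back together.  Per input byte this
// computes, as a sketch:
//
//   out = uint8_t(word_shift(extend16(in), count) & 0xFF);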

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsllw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  if (MOZ_UNLIKELY(count.value == 0)) {
    moveSimd128Int(src, dest);
    return;
  }
  src = asMasm().moveSimd128IntIfNotAVX(src, dest);
  // Use the doubling trick for low shift counts, otherwise mask off the bits
  // that are shifted out of the low byte of each word and use word shifts. The
  // optimal cutoff remains to be explored.
  if (count.value <= 3) {
    vpaddb(Operand(src), src, dest);
    for (int32_t shift = count.value - 1; shift > 0; --shift) {
      vpaddb(Operand(dest), dest, dest);
    }
  } else {
    asMasm().bitwiseAndSimd128(src,
                               SimdConstant::SplatX16(0xFF >> count.value),
                               dest);
    vpsllw(count, dest, dest);
  }
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsraw,
                             &MacroAssemblerX86Shared::vpmovsxbw);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  ScratchSimd128Scope scratch(asMasm());

  vpunpckhbw(src, scratch, scratch);
  vpunpcklbw(src, dest, dest);
  vpsraw(Imm32(count.value + 8), scratch, scratch);
  vpsraw(Imm32(count.value + 8), dest, dest);
  vpacksswb(Operand(scratch), dest, dest);
}
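
// The Imm32 arithmetic-shift variant above never widens properly: the
// vpunpck{h,l}bw steps place each source byte in the high byte of a word
// (the low byte is garbage), and shifting by count + 8 both sign-extends
// the byte and discards the garbage.  Scalar model (illustrative):
//
//   int8_t sar8_ref(int8_t b, int count) {
//     int16_t w = int16_t(uint16_t(uint8_t(b)) << 8);  // byte in high half
//     return int8_t(w >> (count + 8));                 // shift out the rest
//   }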

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
  packedShiftByScalarInt8x16(in, count, xtmp, dest,
                             &MacroAssemblerX86Shared::vpsrlw,
                             &MacroAssemblerX86Shared::vpmovzxbw);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  src = asMasm().moveSimd128IntIfNotAVX(src, dest);
  asMasm().bitwiseAndSimd128(
      src, SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest);
  vpsrlw(count, dest, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsllw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsraw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsrlw(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpslld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsrad(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsrld(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsllq(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, temp);
  asMasm().signReplicationInt64x2(in, scratch);
  in = asMasm().moveSimd128FloatIfNotAVX(in, dest);
  // Invert if negative, shift all, invert back if negative.
  vpxor(Operand(scratch), in, dest);
  vpsrlq(temp, dest, dest);
  vpxor(Operand(scratch), dest, dest);
}

void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, scratch);
  vpsrlq(scratch, in, dest);
}

void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().signReplicationInt64x2(src, scratch);
  // Invert if negative, shift all, invert back if negative.
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
  vpxor(Operand(scratch), src, dest);
  vpsrlq(Imm32(count.value & 63), dest, dest);
  vpxor(Operand(scratch), dest, dest);
}
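
// There is no vpsraq before AVX-512, so the 64-bit arithmetic right shifts
// above flip negative lanes, shift logically, and flip back.  Scalar model
// (illustrative, for count in 0..63):
//
//   int64_t sar64_ref(int64_t x, int count) {
//     uint64_t m = x < 0 ? ~uint64_t(0) : 0;          // sign replication
//     return int64_t(((uint64_t(x) ^ m) >> count) ^ m);
//   }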

void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
                                            FloatRegister onTrue,
                                            FloatRegister onFalse,
                                            FloatRegister temp,
                                            FloatRegister output) {
  // Normally the codegen will attempt to enforce these register assignments so
  // that the moves are avoided.

  onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output);
  if (MOZ_UNLIKELY(mask == onTrue)) {
    vpor(Operand(onFalse), onTrue, output);
    return;
  }

  mask = asMasm().moveSimd128IntIfNotAVX(mask, temp);

  vpand(Operand(mask), onTrue, output);
  vpandn(Operand(onFalse), mask, temp);
  vpor(Operand(temp), output, output);
}

// Code sequences for int32x4<->float32x4 culled from v8; commentary added.

void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  src = asMasm().moveSimd128IntIfNotAVX(src, dest);
  vpxor(Operand(scratch), scratch, scratch);  // extract low bits
  vpblendw(0x55, src, scratch, scratch);      //   into scratch
  vpsubd(Operand(scratch), src, dest);        //   and high bits into dest
  vcvtdq2ps(scratch, scratch);                // convert low bits
  vpsrld(Imm32(1), dest, dest);               // get high into unsigned range
  vcvtdq2ps(dest, dest);                      //   convert
  vaddps(Operand(dest), dest, dest);          //   and back into signed
  vaddps(Operand(scratch), dest, dest);       // combine high+low: may round
}

void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
                                                         FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to. We want to saturate too-large positive
  // values to 7FFFFFFFh and too-large negative values to 80000000h. NaN and -0
  // become 0.

  // Convert NaN to 0 by masking away values that compare unordered to
  // themselves.
  if (HasAVX()) {
    vcmpeqps(Operand(src), src, scratch);
    vpand(Operand(scratch), src, dest);
  } else {
    vmovaps(src, scratch);
    vcmpeqps(Operand(scratch), scratch, scratch);
    moveSimd128Float(src, dest);
    vpand(Operand(scratch), dest, dest);
  }

  // Make lanes in scratch == 0xFFFFFFFF if dest overflows during cvttps2dq,
  // otherwise 0.
  static const SimdConstant minOverflowedInt =
      SimdConstant::SplatX4(2147483648.f);
  if (HasAVX()) {
    asMasm().vcmpgepsSimd128(minOverflowedInt, dest, scratch);
  } else {
    asMasm().loadConstantSimd128Float(minOverflowedInt, scratch);
    vcmpleps(Operand(dest), scratch, scratch);
  }

  // Convert. This will make the output 80000000h if the input is out of range.
  vcvttps2dq(dest, dest);

  // Convert overflow lanes to 0x7FFFFFFF.
  vpxor(Operand(scratch), dest, dest);
}

void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to. We want to saturate too-large positive
  // values to FFFFFFFFh and negative values to zero. NaN and -0 become 0.

  // Convert NaN and negative values to zeroes in dest.
  vxorps(Operand(scratch), scratch, scratch);
  vmaxps(Operand(scratch), src, dest);

  // Place the largest positive signed integer in all lanes in scratch.
  // We use it to bias the conversion to handle edge cases.
  asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(2147483647.f),
                                    scratch);

  // temp = dest - 7FFFFFFFh (as floating); this brings integers in the
  // unsigned range but above the signed range into the signed range;
  // 0 => -7FFFFFFFh.
  vmovaps(dest, temp);
  vsubps(Operand(scratch), temp, temp);

  // scratch = mask of biased values that are greater than 7FFFFFFFh.
  vcmpleps(Operand(temp), scratch, scratch);

  // Convert the biased values to integer. Positive values above 7FFFFFFFh will
  // have been converted to 80000000h; all others become the expected integer.
  vcvttps2dq(temp, temp);

  // As lanes of scratch are ~0 where the result overflows, this computes
  // 7FFFFFFFh in lanes of temp that are 80000000h, and leaves other lanes
  // untouched as the biased integer.
  vpxor(Operand(scratch), temp, temp);

  // Convert negative biased lanes in temp to zero. After this, temp will be
  // zero where the result should be zero or is less than 80000000h, 7FFFFFFFh
  // where the result overflows, and will have the converted biased result in
  // other lanes (for input values >= 80000000h).
  vpxor(Operand(scratch), scratch, scratch);
  vpmaxsd(Operand(scratch), temp, temp);

  // Convert. Overflow lanes above 7FFFFFFFh will be 80000000h; other lanes
  // will be what they should be.
  vcvttps2dq(dest, dest);

  // Add temp to the result. Overflow lanes with 80000000h become FFFFFFFFh,
  // biased high-value unsigned lanes become unbiased, and everything else is
  // left unchanged.
  vpaddd(Operand(temp), dest, dest);
}
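
// The lane semantics the unsigned saturating truncation above implements, as
// a scalar reference (illustrative sketch; f32_to_u32_sat_ref is a
// hypothetical name, not part of this file):
//
//   uint32_t f32_to_u32_sat_ref(float x) {
//     if (std::isnan(x) || x <= 0.0f) return 0;        // NaN, -0, negatives
//     if (x >= 4294967296.0f) return 0xFFFFFFFFu;      // saturate high
//     return uint32_t(int64_t(x));                     // truncate toward zero
//   }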

void MacroAssemblerX86Shared::unsignedTruncFloat32x4ToInt32x4Relaxed(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);

  // Place lanes below 80000000h into dest, otherwise into scratch.
  // Keep dest or scratch 0 as default.
  asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch);
  vcmpltps(Operand(src), scratch, scratch);
  vpand(Operand(src), scratch, scratch);
  vpxor(Operand(scratch), src, dest);

  // Convert lanes below 80000000h into unsigned int without issues.
  vcvttps2dq(dest, dest);
  // Knowing the IEEE-754 number representation: for lanes above 7FFFFFFFh,
  // multiply by 2 (to add 1 to the exponent) and shift to the left by 8 bits.
  vaddps(Operand(scratch), scratch, scratch);
  vpslld(Imm32(8), scratch, scratch);

  // Combine the results.
  vpaddd(Operand(scratch), dest, dest);
}
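
// Why the exponent trick in the relaxed truncation works (sketch): a lane x
// in [2^31, 2^32) has f32 bits [sign=0][exp=158][m:23].  vaddps doubles the
// lane, bumping the exponent field to 159, so bits(2x) = (159 << 23) | m;
// shifting those bits left by 8 then yields, modulo 2^32,
//
//   (159 << 31) + (m << 8) = 0x80000000 + (m << 8) = 2^31 + m * 2^8,
//
// which is exactly uint32_t(x), since x = (2^23 + m) * 2^(158 - 127 - 23).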

void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat64x2(
    FloatRegister src, FloatRegister dest) {
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
  // Interleave the u32 lanes with 0x43300000 so that each double becomes
  // 2^52 + u32 (0x43300000'00000000 is 2^52), then subtract 2^52.
  asMasm().vunpcklpsSimd128(SimdConstant::SplatX4(0x43300000), src, dest);
  asMasm().vsubpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest,
                         dest);
}

void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src,
                                                         FloatRegister temp,
                                                         FloatRegister dest) {
  FloatRegister srcForTemp = asMasm().moveSimd128FloatIfNotAVX(src, temp);
  vcmpeqpd(Operand(srcForTemp), srcForTemp, temp);

  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
  asMasm().vandpdSimd128(SimdConstant::SplatX2(2147483647.0), temp, temp);
  vminpd(Operand(temp), src, dest);
  vcvttpd2dq(dest, dest);
}

void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
    FloatRegister src, FloatRegister temp, FloatRegister dest) {
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);

  vxorpd(temp, temp, temp);
  vmaxpd(Operand(temp), src, dest);

  asMasm().vminpdSimd128(SimdConstant::SplatX2(4294967295.0), dest, dest);
  vroundpd(SSERoundingMode::Trunc, Operand(dest), dest);
  asMasm().vaddpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest,
                         dest);

  // temp == 0
  vshufps(0x88, temp, dest, dest);
}

void MacroAssemblerX86Shared::unsignedTruncFloat64x2ToInt32x4Relaxed(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());

  // The same as unsignedTruncSatFloat64x2ToInt32x4, but without the NaN
  // and out-of-bounds checks.
  vroundpd(SSERoundingMode::Trunc, Operand(src), dest);
  asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0),
                                    scratch);
  vaddpd(Operand(scratch), dest, dest);
  // The scratch register has zeros in the f32x4 lanes with indices 0 and 2:
  // the in-memory representation of the splatted double constant contains
  // zero in its low bits.
  vshufps(0x88, scratch, dest, dest);
}

void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src,
                                            FloatRegister temp,
                                            FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch);
  FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp);
  vpand(scratch, srcForTemp, temp);
  vpandn(src, scratch, scratch);
  int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
  asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output);
  vpsrlw(Imm32(4), scratch, scratch);
  vpshufb(temp, output, output);
  asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), temp);
  vpshufb(scratch, temp, temp);
  vpaddb(Operand(temp), output, output);
}
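
// The byte popcount above is the classic SSSE3 nibble-LUT technique: split
// each byte into its low and high nibble, use vpshufb as a 16-entry table
// lookup on each, and add the two counts.  Scalar model (illustrative):
//
//   uint8_t popcnt8_ref(uint8_t b) {
//     static const uint8_t nib[16] = {0, 1, 1, 2, 1, 2, 2, 3,
//                                     1, 2, 2, 3, 2, 3, 3, 4};
//     return nib[b & 0x0F] + nib[b >> 4];
//   }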