tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

MacroAssembler-x86-shared.cpp (76480B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "jit/x86-shared/MacroAssembler-x86-shared.h"
      8 
      9 #include "mozilla/Casting.h"
     10 
     11 #include "jsmath.h"
     12 
     13 #include "jit/JitFrames.h"
     14 #include "jit/MacroAssembler.h"
     15 #include "js/ScalarType.h"  // js::Scalar::Type
     16 
     17 #include "jit/MacroAssembler-inl.h"
     18 
     19 using namespace js;
     20 using namespace js::jit;
     21 
// Clamp a double into the uint8 range [0, 255], rounding to nearest with
// ties-to-even (the semantics needed for Uint8ClampedArray-style stores).
// Note: this function clobbers the input register.
void MacroAssembler::clampDoubleToUint8(FloatRegister input, Register output) {
  ScratchDoubleScope scratch(*this);
  MOZ_ASSERT(input != scratch);
  Label positive, done;

  // <= 0 or NaN --> 0
  zeroDouble(scratch);
  branchDouble(DoubleGreaterThan, input, scratch, &positive);
  {
    move32(Imm32(0), output);
    jump(&done);
  }

  bind(&positive);

  if (HasRoundInstruction(RoundingMode::NearestTiesToEven)) {
    // Round input to nearest integer.
    nearbyIntDouble(RoundingMode::NearestTiesToEven, input, input);

    // Truncate to int32 and ensure the result <= 255. This relies on the
    // processor setting output to a value > 255 for doubles outside the int32
    // range (for instance 0x80000000).
    vcvttsd2si(input, output);
    branch32(Assembler::BelowOrEqual, output, Imm32(255), &done);
    move32(Imm32(255), output);
  } else {
    // No hardware round instruction available: truncate, then decide
    // manually whether to round the truncated value up.
    Label outOfRange;

    // Truncate to int32 and ensure the result <= 255. This relies on the
    // processor setting output to a value > 255 for doubles outside the int32
    // range (for instance 0x80000000).
    vcvttsd2si(input, output);
    branch32(Assembler::AboveOrEqual, output, Imm32(255), &outOfRange);
    {
      // Check if we had a tie.
      // input <- input - trunc(input): the fractional part, in [0, 1).
      convertInt32ToDouble(output, scratch);
      subDouble(scratch, input);

      loadConstantDouble(0.5, scratch);

      Label roundUp;
      vucomisd(scratch, input);
      j(Above, &roundUp);  // fraction > 0.5: round up.
      j(NotEqual, &done);  // fraction < 0.5: keep the truncated value.

      // It was a tie (fraction == 0.5). Round up only if the output is odd,
      // which yields the even neighbour (ties-to-even).
      branchTest32(Zero, output, Imm32(1), &done);

      bind(&roundUp);
      add32(Imm32(1), output);
      jump(&done);
    }

    // > 255 --> 255
    bind(&outOfRange);
    move32(Imm32(255), output);
  }

  bind(&done);
}
     83 
     84 bool MacroAssemblerX86Shared::buildOOLFakeExitFrame(void* fakeReturnAddr) {
     85  asMasm().Push(FrameDescriptor(FrameType::IonJS));
     86  asMasm().Push(ImmPtr(fakeReturnAddr));
     87  asMasm().Push(FramePointer);
     88  return true;
     89 }
     90 
// Jump to |label| if the low double in |reg| is -0.0. |scratch| is
// clobbered. If |maybeNonZero| is false, the caller has already
// established that |reg| holds 0 or -0 (x86 path only).
void MacroAssemblerX86Shared::branchNegativeZero(FloatRegister reg,
                                                 Register scratch, Label* label,
                                                 bool maybeNonZero) {
  // Determines whether the low double contained in the XMM register reg
  // is equal to -0.0.

#if defined(JS_CODEGEN_X86)
  Label nonZero;

  // if not already compared to zero
  if (maybeNonZero) {
    ScratchDoubleScope scratchDouble(asMasm());

    // Compare to zero. Lets through {0, -0}.
    zeroDouble(scratchDouble);

    // If reg is non-zero, jump to nonZero.
    asMasm().branchDouble(DoubleNotEqual, reg, scratchDouble, &nonZero);
  }
  // Input register is either zero or negative zero. Retrieve sign of input.
  vmovmskpd(reg, scratch);

  // If reg is 1 or 3, input is negative zero.
  // If reg is 0 or 2, input is a normal zero.
  asMasm().branchTest32(NonZero, scratch, Imm32(1), label);

  bind(&nonZero);
#elif defined(JS_CODEGEN_X64)
  // Move the double's bit pattern to scratch. Only -0.0 has the pattern
  // 0x8000000000000000 (INT64_MIN), and INT64_MIN is the only value for
  // which subtracting 1 overflows, so cmp+jo detects exactly -0.0.
  vmovq(reg, scratch);
  cmpq(Imm32(1), scratch);
  j(Overflow, label);
#endif
}
    124 
// Jump to |label| if the float32 in |reg| is -0.0f. |scratch| is clobbered.
void MacroAssemblerX86Shared::branchNegativeZeroFloat32(FloatRegister reg,
                                                        Register scratch,
                                                        Label* label) {
  // Only -0.0f has the bit pattern 0x80000000 (INT32_MIN), and INT32_MIN
  // is the only value for which subtracting 1 overflows, so cmp+jo
  // detects exactly -0.0f.
  vmovd(reg, scratch);
  cmp32(scratch, Imm32(1));
  j(Overflow, label);
}
    132 
    133 MacroAssembler& MacroAssemblerX86Shared::asMasm() {
    134  return *static_cast<MacroAssembler*>(this);
    135 }
    136 
    137 const MacroAssembler& MacroAssemblerX86Shared::asMasm() const {
    138  return *static_cast<const MacroAssembler*>(this);
    139 }
    140 
    141 template <class T, class Map>
    142 T* MacroAssemblerX86Shared::getConstant(const typename T::Pod& value, Map& map,
    143                                        Vector<T, 0, SystemAllocPolicy>& vec) {
    144  using AddPtr = typename Map::AddPtr;
    145  size_t index;
    146  if (AddPtr p = map.lookupForAdd(value)) {
    147    index = p->value();
    148  } else {
    149    index = vec.length();
    150    enoughMemory_ &= vec.append(T(value));
    151    if (!enoughMemory_) {
    152      return nullptr;
    153    }
    154    enoughMemory_ &= map.add(p, value, index);
    155    if (!enoughMemory_) {
    156      return nullptr;
    157    }
    158  }
    159  return &vec[index];
    160 }
    161 
// Get (or lazily create) the pooled float constant for |f|.
MacroAssemblerX86Shared::Float* MacroAssemblerX86Shared::getFloat(float f) {
  return getConstant<Float, FloatMap>(f, floatMap_, floats_);
}
    165 
// Get (or lazily create) the pooled double constant for |d|.
MacroAssemblerX86Shared::Double* MacroAssemblerX86Shared::getDouble(double d) {
  return getConstant<Double, DoubleMap>(d, doubleMap_, doubles_);
}
    169 
// Get (or lazily create) the pooled SIMD constant for |v|.
MacroAssemblerX86Shared::SimdData* MacroAssemblerX86Shared::getSimdData(
    const SimdConstant& v) {
  return getConstant<SimdData, SimdMap>(v, simdMap_, simds_);
}
    174 
    175 void MacroAssemblerX86Shared::binarySimd128(
    176    const SimdConstant& rhs, FloatRegister lhsDest,
    177    void (MacroAssembler::*regOp)(const Operand&, FloatRegister, FloatRegister),
    178    void (MacroAssembler::*constOp)(const SimdConstant&, FloatRegister)) {
    179  ScratchSimd128Scope scratch(asMasm());
    180  if (maybeInlineSimd128Int(rhs, scratch)) {
    181    (asMasm().*regOp)(Operand(scratch), lhsDest, lhsDest);
    182  } else {
    183    (asMasm().*constOp)(rhs, lhsDest);
    184  }
    185 }
    186 
    187 void MacroAssemblerX86Shared::binarySimd128(
    188    FloatRegister lhs, const SimdConstant& rhs, FloatRegister dest,
    189    void (MacroAssembler::*regOp)(const Operand&, FloatRegister, FloatRegister),
    190    void (MacroAssembler::*constOp)(const SimdConstant&, FloatRegister,
    191                                    FloatRegister)) {
    192  ScratchSimd128Scope scratch(asMasm());
    193  if (maybeInlineSimd128Int(rhs, scratch)) {
    194    (asMasm().*regOp)(Operand(scratch), lhs, dest);
    195  } else {
    196    (asMasm().*constOp)(rhs, lhs, dest);
    197  }
    198 }
    199 
    200 void MacroAssemblerX86Shared::binarySimd128(
    201    const SimdConstant& rhs, FloatRegister lhs,
    202    void (MacroAssembler::*regOp)(const Operand&, FloatRegister),
    203    void (MacroAssembler::*constOp)(const SimdConstant&, FloatRegister)) {
    204  ScratchSimd128Scope scratch(asMasm());
    205  if (maybeInlineSimd128Int(rhs, scratch)) {
    206    (asMasm().*regOp)(Operand(scratch), lhs);
    207  } else {
    208    (asMasm().*constOp)(rhs, lhs);
    209  }
    210 }
    211 
    212 void MacroAssemblerX86Shared::bitwiseTestSimd128(const SimdConstant& rhs,
    213                                                 FloatRegister lhs) {
    214  ScratchSimd128Scope scratch(asMasm());
    215  if (maybeInlineSimd128Int(rhs, scratch)) {
    216    vptest(scratch, lhs);
    217  } else {
    218    asMasm().vptestSimd128(rhs, lhs);
    219  }
    220 }
    221 
// Compute isMax ? max(first, second) : min(first, second) into |first|,
// preserving |second|. When canBeNaN, a NaN in either operand yields a
// NaN result. Equal operands have their sign bits merged, so min picks
// -0 over +0 and max picks +0 over -0.
void MacroAssemblerX86Shared::minMaxDouble(FloatRegister first,
                                           FloatRegister second, bool canBeNaN,
                                           bool isMax) {
  Label done, nan, minMaxInst;

  // Do a vucomisd to catch equality and NaNs, which both require special
  // handling. If the operands are ordered and inequal, we branch straight to
  // the min/max instruction. If we wanted, we could also branch for less-than
  // or greater-than here instead of using min/max, however these conditions
  // will sometimes be hard on the branch predictor.
  vucomisd(second, first);
  j(Assembler::NotEqual, &minMaxInst);
  if (canBeNaN) {
    // An unordered (NaN) comparison sets the parity flag.
    j(Assembler::Parity, &nan);
  }

  // Ordered and equal. The operands are bit-identical unless they are zero
  // and negative zero. These instructions merge the sign bits in that
  // case, and are no-ops otherwise.
  if (isMax) {
    vandpd(second, first, first);
  } else {
    vorpd(second, first, first);
  }
  jump(&done);

  // x86's min/max are not symmetric; if either operand is a NaN, they return
  // the read-only operand. We need to return a NaN if either operand is a
  // NaN, so we explicitly check for a NaN in the read-write operand.
  if (canBeNaN) {
    bind(&nan);
    vucomisd(first, first);
    j(Assembler::Parity, &done);
  }

  // When the values are inequal, or second is NaN, x86's min and max will
  // return the value we need.
  bind(&minMaxInst);
  if (isMax) {
    vmaxsd(second, first, first);
  } else {
    vminsd(second, first, first);
  }

  bind(&done);
}
    268 
// Float32 analogue of minMaxDouble: compute isMax ? max : min of |first|
// and |second| into |first|, preserving |second|. When canBeNaN, a NaN in
// either operand yields a NaN result; equal operands have their sign bits
// merged so min picks -0 and max picks +0.
void MacroAssemblerX86Shared::minMaxFloat32(FloatRegister first,
                                            FloatRegister second, bool canBeNaN,
                                            bool isMax) {
  Label done, nan, minMaxInst;

  // Do a vucomiss to catch equality and NaNs, which both require special
  // handling. If the operands are ordered and inequal, we branch straight to
  // the min/max instruction. If we wanted, we could also branch for less-than
  // or greater-than here instead of using min/max, however these conditions
  // will sometimes be hard on the branch predictor.
  vucomiss(second, first);
  j(Assembler::NotEqual, &minMaxInst);
  if (canBeNaN) {
    // An unordered (NaN) comparison sets the parity flag.
    j(Assembler::Parity, &nan);
  }

  // Ordered and equal. The operands are bit-identical unless they are zero
  // and negative zero. These instructions merge the sign bits in that
  // case, and are no-ops otherwise.
  if (isMax) {
    vandps(second, first, first);
  } else {
    vorps(second, first, first);
  }
  jump(&done);

  // x86's min/max are not symmetric; if either operand is a NaN, they return
  // the read-only operand. We need to return a NaN if either operand is a
  // NaN, so we explicitly check for a NaN in the read-write operand.
  if (canBeNaN) {
    bind(&nan);
    vucomiss(first, first);
    j(Assembler::Parity, &done);
  }

  // When the values are inequal, or second is NaN, x86's min and max will
  // return the value we need.
  bind(&minMaxInst);
  if (isMax) {
    vmaxss(second, first, first);
  } else {
    vminss(second, first, first);
  }

  bind(&done);
}
    315 
    316 #ifdef ENABLE_WASM_SIMD
    317 bool MacroAssembler::MustMaskShiftCountSimd128(wasm::SimdOp op, int32_t* mask) {
    318  switch (op) {
    319    case wasm::SimdOp::I8x16Shl:
    320    case wasm::SimdOp::I8x16ShrU:
    321    case wasm::SimdOp::I8x16ShrS:
    322      *mask = 7;
    323      break;
    324    case wasm::SimdOp::I16x8Shl:
    325    case wasm::SimdOp::I16x8ShrU:
    326    case wasm::SimdOp::I16x8ShrS:
    327      *mask = 15;
    328      break;
    329    case wasm::SimdOp::I32x4Shl:
    330    case wasm::SimdOp::I32x4ShrU:
    331    case wasm::SimdOp::I32x4ShrS:
    332      *mask = 31;
    333      break;
    334    case wasm::SimdOp::I64x2Shl:
    335    case wasm::SimdOp::I64x2ShrU:
    336    case wasm::SimdOp::I64x2ShrS:
    337      *mask = 63;
    338      break;
    339    default:
    340      MOZ_CRASH("Unexpected shift operation");
    341  }
    342  return true;
    343 }
    344 #endif
    345 
    346 //{{{ check_macroassembler_style
    347 // ===============================================================
    348 // MacroAssembler high-level usage.
    349 
// No-op on x86-shared: nothing to flush after code emission here.
void MacroAssembler::flush() {}
    351 
// Forward a listing/debugging comment to the underlying assembler.
void MacroAssembler::comment(const char* msg) { masm.comment(msg); }
    353 
    354 // This operation really consists of five phases, in order to enforce the
    355 // restriction that on x86_shared, the dividend must be eax and both eax and edx
    356 // will be clobbered.
    357 //
    358 //     Input: { lhs, rhs }
    359 //
    360 //  [PUSH] Preserve registers
    361 //  [MOVE] Generate moves to specific registers
    362 //
    363 //  [DIV] Input: { regForRhs, EAX }
    364 //  [DIV] extend EAX into EDX
    365 //  [DIV] x86 Division operator
    366 //  [DIV] Output: { EAX, EDX }
    367 //
    368 //  [MOVE] Move specific registers to outputs
    369 //  [POP] Restore registers
    370 //
    371 //    Output: { quotientOutput, remainderOutput }
static void EmitDivMod32(MacroAssembler& masm, Register lhs, Register rhs,
                         Register divOutput, Register remOutput,
                         bool isUnsigned) {
  // Identical operands: emit the constant results (quotient 1, remainder 0)
  // directly and skip the whole push/divide/pop sequence.
  if (lhs == rhs) {
    if (divOutput != Register::Invalid()) {
      masm.movl(Imm32(1), divOutput);
    }
    if (remOutput != Register::Invalid()) {
      masm.movl(Imm32(0), remOutput);
    }
    return;
  }

  // Choose a register that is not edx or eax to hold the rhs;
  // ebx is chosen arbitrarily, and will be preserved if necessary.
  Register regForRhs = (rhs == eax || rhs == edx) ? ebx : rhs;

  // Add registers we will be clobbering as live, but
  // also remove the set we do not restore.
  LiveRegisterSet preserve;
  preserve.add(edx);
  preserve.add(eax);
  if (rhs != regForRhs) {
    preserve.add(regForRhs);
  }

  // Output registers receive results, so they must not be restored.
  if (divOutput != Register::Invalid()) {
    preserve.takeUnchecked(divOutput);
  }
  if (remOutput != Register::Invalid()) {
    preserve.takeUnchecked(remOutput);
  }

  masm.PushRegsInMask(preserve);

  // Shuffle input into place.
  masm.moveRegPair(lhs, rhs, eax, regForRhs);

  // Sign extend eax into edx to make (edx:eax): idiv/udiv are 64-bit.
  if (isUnsigned) {
    masm.mov(ImmWord(0), edx);
    masm.udiv(regForRhs);
  } else {
    masm.cdq();
    masm.idiv(regForRhs);
  }

  // Move results (quotient in eax, remainder in edx) to the requested
  // outputs; moveRegPair handles the case where the outputs overlap
  // eax/edx.
  if (divOutput != Register::Invalid() && remOutput != Register::Invalid()) {
    masm.moveRegPair(eax, edx, divOutput, remOutput);
  } else {
    if (divOutput != Register::Invalid() && divOutput != eax) {
      masm.mov(eax, divOutput);
    }
    if (remOutput != Register::Invalid() && remOutput != edx) {
      masm.mov(edx, remOutput);
    }
  }

  masm.PopRegsInMask(preserve);
}
    432 
// Division with arbitrary (flexible) register assignment; inputs are
// preserved. The LiveRegisterSet parameter is unused on x86-shared:
// EmitDivMod32 saves and restores whatever it clobbers itself.
void MacroAssembler::flexibleDivMod32(Register lhs, Register rhs,
                                      Register divOutput, Register remOutput,
                                      bool isUnsigned, const LiveRegisterSet&) {
  MOZ_ASSERT(lhs != divOutput && lhs != remOutput, "lhs is preserved");
  MOZ_ASSERT(rhs != divOutput && rhs != remOutput, "rhs is preserved");

  EmitDivMod32(*this, lhs, rhs, divOutput, remOutput, isUnsigned);
}
    441 
// Quotient only: remainder output is suppressed. volatileLiveRegs is
// unused on x86-shared (see EmitDivMod32).
void MacroAssembler::flexibleQuotient32(
    Register lhs, Register rhs, Register dest, bool isUnsigned,
    const LiveRegisterSet& volatileLiveRegs) {
  EmitDivMod32(*this, lhs, rhs, dest, Register::Invalid(), isUnsigned);
}
    447 
// Remainder only: quotient output is suppressed. volatileLiveRegs is
// unused on x86-shared (see EmitDivMod32).
void MacroAssembler::flexibleRemainder32(
    Register lhs, Register rhs, Register dest, bool isUnsigned,
    const LiveRegisterSet& volatileLiveRegs) {
  EmitDivMod32(*this, lhs, rhs, Register::Invalid(), dest, isUnsigned);
}
    453 
    454 // ===============================================================
    455 // Stack manipulation functions.
    456 
    457 size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
    458  FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
    459  return set.gprs().size() * sizeof(intptr_t) + fpuSet.getPushSizeInBytes();
    460 }
    461 
// Push every register in |set|: GPRs first (individual pushes), then the
// reduced FP set spilled into one reserved stack area. The total frame
// adjustment equals PushRegsInMaskSizeInBytes(set).
void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
  mozilla::DebugOnly<size_t> framePushedInitial = framePushed();

  FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
  unsigned numFpu = fpuSet.size();
  int32_t diffF = fpuSet.getPushSizeInBytes();
  int32_t diffG = set.gprs().size() * sizeof(intptr_t);

  // On x86, always use push to push the integer registers, as it's fast
  // on modern hardware and it's a small instruction.
  for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
    diffG -= sizeof(intptr_t);
    Push(*iter);
  }
  MOZ_ASSERT(diffG == 0);
  (void)diffG;

  // Reserve one contiguous chunk for all FP registers, then store each at
  // its own offset (diffF counts down to 0 as registers are stored).
  reserveStack(diffF);
  for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
    FloatRegister reg = *iter;
    diffF -= reg.size();
    numFpu -= 1;
    Address spillAddress(StackPointer, diffF);
    if (reg.isDouble()) {
      storeDouble(reg, spillAddress);
    } else if (reg.isSingle()) {
      storeFloat32(reg, spillAddress);
    } else if (reg.isSimd128()) {
      storeUnalignedSimd128(reg, spillAddress);
    } else {
      MOZ_CRASH("Unknown register type.");
    }
  }
  MOZ_ASSERT(numFpu == 0);
  (void)numFpu;

  // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
  // GetPushSizeInBytes.
  size_t alignExtra = ((size_t)diffF) % sizeof(uintptr_t);
  MOZ_ASSERT_IF(sizeof(uintptr_t) == 8, alignExtra == 0 || alignExtra == 4);
  MOZ_ASSERT_IF(sizeof(uintptr_t) == 4, alignExtra == 0);
  diffF -= alignExtra;
  MOZ_ASSERT(diffF == 0);

  // The macroassembler will keep the stack sizeof(uintptr_t)-aligned, so
  // we don't need to take into account `alignExtra` here.
  MOZ_ASSERT(framePushed() - framePushedInitial ==
             PushRegsInMaskSizeInBytes(set));
}
    511 
// Like PushRegsInMask, but store the registers to memory at |dest|
// (working downward from dest.offset) without touching the stack pointer.
// |dest| is taken by value; only the local copy's offset is decremented.
// The scratch Register parameter is unused on x86-shared.
void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
                                     Register) {
  mozilla::DebugOnly<size_t> offsetInitial = dest.offset;

  FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
  unsigned numFpu = fpuSet.size();
  int32_t diffF = fpuSet.getPushSizeInBytes();
  int32_t diffG = set.gprs().size() * sizeof(intptr_t);

  // The save area must be large enough for everything we will store.
  MOZ_ASSERT(dest.offset >= diffG + diffF);

  for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
    diffG -= sizeof(intptr_t);
    dest.offset -= sizeof(intptr_t);
    storePtr(*iter, dest);
  }
  MOZ_ASSERT(diffG == 0);
  (void)diffG;

  for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
    FloatRegister reg = *iter;
    diffF -= reg.size();
    numFpu -= 1;
    dest.offset -= reg.size();
    if (reg.isDouble()) {
      storeDouble(reg, dest);
    } else if (reg.isSingle()) {
      storeFloat32(reg, dest);
    } else if (reg.isSimd128()) {
      storeUnalignedSimd128(reg, dest);
    } else {
      MOZ_CRASH("Unknown register type.");
    }
  }
  MOZ_ASSERT(numFpu == 0);
  (void)numFpu;

  // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
  // GetPushSizeInBytes.
  size_t alignExtra = ((size_t)diffF) % sizeof(uintptr_t);
  MOZ_ASSERT_IF(sizeof(uintptr_t) == 8, alignExtra == 0 || alignExtra == 4);
  MOZ_ASSERT_IF(sizeof(uintptr_t) == 4, alignExtra == 0);
  diffF -= alignExtra;
  MOZ_ASSERT(diffF == 0);

  // What this means is: if `alignExtra` is nonzero, then the save area size
  // actually used is `alignExtra` bytes smaller than what
  // PushRegsInMaskSizeInBytes claims.  Hence we need to compensate for that.
  MOZ_ASSERT(alignExtra + offsetInitial - dest.offset ==
             PushRegsInMaskSizeInBytes(set));
}
    563 
// Inverse of PushRegsInMask: reload every register in |set| from the
// stack and free the reserved space. Registers also present in |ignore|
// keep their current values, but their stack slots are still accounted
// for (and freed).
void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
                                         LiveRegisterSet ignore) {
  mozilla::DebugOnly<size_t> framePushedInitial = framePushed();

  FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
  unsigned numFpu = fpuSet.size();
  int32_t diffG = set.gprs().size() * sizeof(intptr_t);
  int32_t diffF = fpuSet.getPushSizeInBytes();
  const int32_t reservedG = diffG;
  const int32_t reservedF = diffF;

  for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
    FloatRegister reg = *iter;
    diffF -= reg.size();
    numFpu -= 1;
    if (ignore.has(reg)) {
      continue;
    }

    Address spillAddress(StackPointer, diffF);
    if (reg.isDouble()) {
      loadDouble(spillAddress, reg);
    } else if (reg.isSingle()) {
      loadFloat32(spillAddress, reg);
    } else if (reg.isSimd128()) {
      loadUnalignedSimd128(spillAddress, reg);
    } else {
      MOZ_CRASH("Unknown register type.");
    }
  }
  freeStack(reservedF);
  MOZ_ASSERT(numFpu == 0);
  (void)numFpu;
  // x64 padding to keep the stack aligned on uintptr_t. Keep in sync with
  // GetPushBytesInSize.
  diffF -= diffF % sizeof(uintptr_t);
  MOZ_ASSERT(diffF == 0);

  // On x86, use pop to pop the integer registers, if we're not going to
  // ignore any slots, as it's fast on modern hardware and it's a small
  // instruction.
  if (ignore.emptyGeneral()) {
    for (GeneralRegisterForwardIterator iter(set.gprs()); iter.more(); ++iter) {
      diffG -= sizeof(intptr_t);
      Pop(*iter);
    }
  } else {
    // Some GPRs are skipped: load the wanted ones directly from their
    // slots, then free the whole reserved area at once.
    for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more();
         ++iter) {
      diffG -= sizeof(intptr_t);
      if (!ignore.has(*iter)) {
        loadPtr(Address(StackPointer, diffG), *iter);
      }
    }
    freeStack(reservedG);
  }
  MOZ_ASSERT(diffG == 0);

  MOZ_ASSERT(framePushedInitial - framePushed() ==
             PushRegsInMaskSizeInBytes(set));
}
    625 
// Push a memory/register operand and record the frame-size change.
void MacroAssembler::Push(const Operand op) {
  push(op);
  adjustFrame(sizeof(intptr_t));
}
    630 
// Push a GPR and record the frame-size change.
void MacroAssembler::Push(Register reg) {
  push(reg);
  adjustFrame(sizeof(intptr_t));
}
    635 
// Push a 32-bit immediate and record the frame-size change.
void MacroAssembler::Push(const Imm32 imm) {
  push(imm);
  adjustFrame(sizeof(intptr_t));
}
    640 
// Push a word-sized immediate and record the frame-size change.
void MacroAssembler::Push(const ImmWord imm) {
  push(imm);
  adjustFrame(sizeof(intptr_t));
}
    645 
// Push a pointer immediate, delegating to the ImmWord overload.
void MacroAssembler::Push(const ImmPtr imm) {
  Push(ImmWord(uintptr_t(imm.value)));
}
    649 
// Push a GC-thing pointer immediate and record the frame-size change.
void MacroAssembler::Push(const ImmGCPtr ptr) {
  push(ptr);
  adjustFrame(sizeof(intptr_t));
}
    654 
// Push a floating-point register; the slot is always double-sized.
void MacroAssembler::Push(FloatRegister t) {
  push(t);
  // See Assembler::push(FloatRegister) for why we use sizeof(double).
  adjustFrame(sizeof(double));
}
    660 
// Push the CPU flags register and record the frame-size change.
void MacroAssembler::PushFlags() {
  pushFlags();
  adjustFrame(sizeof(intptr_t));
}
    665 
// Pop into a memory/register operand and record the frame-size change.
void MacroAssembler::Pop(const Operand op) {
  pop(op);
  implicitPop(sizeof(intptr_t));
}
    670 
// Pop into a GPR and record the frame-size change.
void MacroAssembler::Pop(Register reg) {
  pop(reg);
  implicitPop(sizeof(intptr_t));
}
    675 
// Pop into a floating-point register; the slot is always double-sized.
void MacroAssembler::Pop(FloatRegister reg) {
  pop(reg);
  // See Assembler::pop(FloatRegister) for why we use sizeof(double).
  implicitPop(sizeof(double));
}
    681 
// Pop a boxed Value into |val| and record the frame-size change.
void MacroAssembler::Pop(const ValueOperand& val) {
  popValue(val);
  implicitPop(sizeof(Value));
}
    686 
// Pop the CPU flags register and record the frame-size change.
void MacroAssembler::PopFlags() {
  popFlags();
  implicitPop(sizeof(intptr_t));
}
    691 
// Pop the top stack slot into the stack pointer itself.
void MacroAssembler::PopStackPtr() { Pop(StackPointer); }
    693 
// Reset the stack pointer to frame-pointer-relative depth |framePushed|,
// which must not exceed the current depth. Uses lea, so the CPU flags are
// left untouched.
void MacroAssembler::freeStackTo(uint32_t framePushed) {
  MOZ_ASSERT(framePushed <= framePushed_);
  lea(Operand(FramePointer, -int32_t(framePushed)), StackPointer);
  framePushed_ = framePushed;
}
    699 
    700 // ===============================================================
    701 // Simple call functions.
    702 
// Indirect call through a register; returns the call's code offset.
CodeOffset MacroAssembler::call(Register reg) { return Assembler::call(reg); }
    704 
// Direct call to a label; returns the call's code offset.
CodeOffset MacroAssembler::call(Label* label) { return Assembler::call(label); }
    706 
// Indirect call through a memory address; returns the offset just past
// the call instruction.
CodeOffset MacroAssembler::call(const Address& addr) {
  Assembler::call(Operand(addr.base, addr.offset));
  return CodeOffset(currentOffset());
}
    711 
// Call a wasm symbolic address by materializing it in eax first
// (clobbering eax); returns the call's code offset.
CodeOffset MacroAssembler::call(wasm::SymbolicAddress target) {
  mov(target, eax);
  return Assembler::call(eax);
}
    716 
// Call an absolute immediate address.
void MacroAssembler::call(ImmWord target) { Assembler::call(target); }
    718 
// Call an absolute pointer immediate.
void MacroAssembler::call(ImmPtr target) { Assembler::call(target); }
    720 
// Call into a JitCode object's entry point.
void MacroAssembler::call(JitCode* target) { Assembler::call(target); }
    722 
// Emit a call whose target will be filled in later via patchCall.
CodeOffset MacroAssembler::callWithPatch() {
  return Assembler::callWithPatch();
}
// Patch a call previously emitted by callWithPatch to target calleeOffset.
void MacroAssembler::patchCall(uint32_t callerOffset, uint32_t calleeOffset) {
  Assembler::patchCall(callerOffset, calleeOffset);
}
    729 
// On x86 the call instruction itself pushes the return address.
void MacroAssembler::callAndPushReturnAddress(Register reg) { call(reg); }
    731 
// On x86 the call instruction itself pushes the return address.
void MacroAssembler::callAndPushReturnAddress(Label* label) { call(label); }
    733 
    734 // ===============================================================
    735 // Patchable near/far jumps.
    736 
// Emit a far jump whose target will be filled in later via patchFarJump.
CodeOffset MacroAssembler::farJumpWithPatch() {
  return Assembler::farJumpWithPatch();
}
    740 
// Patch a far jump (by code offset) emitted by farJumpWithPatch.
void MacroAssembler::patchFarJump(CodeOffset farJump, uint32_t targetOffset) {
  Assembler::patchFarJump(farJump, targetOffset);
}
    744 
// Patch a far jump (by raw code pointer) emitted by farJumpWithPatch.
void MacroAssembler::patchFarJump(uint8_t* farJump, uint8_t* target) {
  Assembler::patchFarJump(farJump, target);
}
    748 
// Emit a five-byte nop that can later be rewritten into a call (see
// patchNopToCall); returns the offset just past the nop.
CodeOffset MacroAssembler::nopPatchableToCall() {
  masm.nop_five();
  return CodeOffset(currentOffset());
}
    753 
// Rewrite a five-byte nop (from nopPatchableToCall) into a call to target.
void MacroAssembler::patchNopToCall(uint8_t* callsite, uint8_t* target) {
  Assembler::patchFiveByteNopToCall(callsite, target);
}
    757 
// Rewrite a patched call back into a five-byte nop.
void MacroAssembler::patchCallToNop(uint8_t* callsite) {
  Assembler::patchCallToFiveByteNop(callsite);
}
    761 
// Emit a movl with a placeholder immediate (-1) to be overwritten by
// patchMove32; returns the code offset used for that patching.
CodeOffset MacroAssembler::move32WithPatch(Register dest) {
  movl(Imm32(-1), dest);
  return CodeOffset(currentOffset());
}
    766 
// Overwrite the placeholder immediate emitted by move32WithPatch with |n|.
void MacroAssembler::patchMove32(CodeOffset offset, Imm32 n) {
  X86Encoding::SetInt32(masm.data() + offset.offset(), n.value);
}
    770 
    771 // ===============================================================
    772 // Jit Frames.
    773 
// Push a code address that looks like a return address — the address of
// the instruction immediately after the push — without performing a call.
// Returns that address's offset within the code.
uint32_t MacroAssembler::pushFakeReturnAddress(Register scratch) {
  CodeLabel cl;

  // Load the (not-yet-bound) code address into scratch and push it.
  mov(&cl, scratch);
  Push(scratch);
  // Bind the label here, so the pushed value points just past the push.
  bind(&cl);
  uint32_t retAddr = currentOffset();

  addCodeLabel(cl);
  return retAddr;
}
    785 
    786 // ===============================================================
    787 // WebAssembly
    788 
// Emit ud2 as the wasm trap instruction and report its code offset for
// trap-site metadata.
FaultingCodeOffset MacroAssembler::wasmTrapInstruction() {
  return FaultingCodeOffset(ud2().offset());
}
    792 
// Compare |index| against the limit register and branch to |label| on
// |cond| (the failure path). With Spectre index masking enabled, also
// conditionally overwrite |index| with the limit so that speculative
// execution past the branch cannot use an out-of-bounds index.
void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
                                       Register boundsCheckLimit,
                                       Label* label) {
  cmp32(index, boundsCheckLimit);
  j(cond, label);
  if (JitOptions.spectreIndexMasking) {
    cmovCCl(cond, Operand(boundsCheckLimit), index);
  }
}
    802 
// As above, but with the limit loaded from memory. With Spectre index
// masking enabled, conditionally overwrite |index| with the limit so that
// speculative execution past the branch cannot use an out-of-bounds index.
void MacroAssembler::wasmBoundsCheck32(Condition cond, Register index,
                                       Address boundsCheckLimit, Label* label) {
  cmp32(index, Operand(boundsCheckLimit));
  j(cond, label);
  if (JitOptions.spectreIndexMasking) {
    cmovCCl(cond, Operand(boundsCheckLimit), index);
  }
}
    811 
// RAII class that generates the jumps to traps when it's destructed, to
// prevent some code duplication in the outOfLineWasmTruncateXtoY methods.
struct MOZ_RAII AutoHandleWasmTruncateToIntErrors {
  MacroAssembler& masm;
  // Branch target for NaN inputs (invalid conversion trap).
  Label inputIsNaN;
  // Branch target for out-of-range inputs (integer overflow trap).
  Label intOverflow;
  // Source location recorded with the emitted traps.
  const wasm::TrapSiteDesc& trapSiteDesc;

  explicit AutoHandleWasmTruncateToIntErrors(
      MacroAssembler& masm, const wasm::TrapSiteDesc& trapSiteDesc)
      : masm(masm), trapSiteDesc(trapSiteDesc) {}

  ~AutoHandleWasmTruncateToIntErrors() {
    // Handle errors.  These cases are not in arbitrary order: code will
    // fall through to intOverflow.
    masm.bind(&intOverflow);
    masm.wasmTrap(wasm::Trap::IntegerOverflow, trapSiteDesc);

    masm.bind(&inputIsNaN);
    masm.wasmTrap(wasm::Trap::InvalidConversionToInteger, trapSiteDesc);
  }
};
    834 
// Truncate a double to int32, branching to `oolEntry` on failure.
void MacroAssembler::wasmTruncateDoubleToInt32(FloatRegister input,
                                               Register output,
                                               bool isSaturating,
                                               Label* oolEntry) {
  // vcvttsd2si produces the sentinel INT32_MIN (0x80000000) for NaN and for
  // inputs outside the int32 range.
  vcvttsd2si(input, output);
  // `output - 1` sets OF only for output == INT32_MIN, so this branches to
  // the out-of-line path exactly when the sentinel was produced. (A genuine
  // INT32_MIN result also goes out of line and is disambiguated there.)
  cmp32(output, Imm32(1));
  j(Assembler::Overflow, oolEntry);
}
    843 
// Truncate a float32 to int32, branching to `oolEntry` on failure.
void MacroAssembler::wasmTruncateFloat32ToInt32(FloatRegister input,
                                                Register output,
                                                bool isSaturating,
                                                Label* oolEntry) {
  // vcvttss2si produces the sentinel INT32_MIN (0x80000000) for NaN and for
  // inputs outside the int32 range.
  vcvttss2si(input, output);
  // `output - 1` overflows (sets OF) only for output == INT32_MIN; branch to
  // the out-of-line path exactly for the sentinel.
  cmp32(output, Imm32(1));
  j(Assembler::Overflow, oolEntry);
}
    852 
// Out-of-line path for double -> int32 truncation: `output` currently holds
// the sentinel INT32_MIN produced by vcvttsd2si. Either saturate per the
// wasm saturating-truncation rules and jump back to `rejoin`, or trap.
void MacroAssembler::oolWasmTruncateCheckF64ToI32(
    FloatRegister input, Register output, TruncFlags flags,
    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
  bool isUnsigned = flags & TRUNC_UNSIGNED;
  bool isSaturating = flags & TRUNC_SATURATING;

  if (isSaturating) {
    if (isUnsigned) {
      // Negative overflow and NaN both are converted to 0, and the only
      // other case is positive overflow which is converted to
      // UINT32_MAX.
      Label nonNegative;
      ScratchDoubleScope fpscratch(*this);
      loadConstantDouble(0.0, fpscratch);
      branchDouble(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
                   &nonNegative);
      move32(Imm32(0), output);
      jump(rejoin);

      bind(&nonNegative);
      move32(Imm32(UINT32_MAX), output);
    } else {
      // Negative overflow is already saturated to INT32_MIN, so we only
      // have to handle NaN and positive overflow here.
      Label notNaN;
      branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
      move32(Imm32(0), output);
      jump(rejoin);

      bind(&notNaN);
      // input < 0 is negative overflow: output already holds INT32_MIN.
      ScratchDoubleScope fpscratch(*this);
      loadConstantDouble(0.0, fpscratch);
      branchDouble(Assembler::DoubleLessThan, input, fpscratch, rejoin);
      // Positive overflow: INT32_MIN - 1 wraps to INT32_MAX, the saturated
      // result.
      sub32(Imm32(1), output);
    }
    jump(rejoin);
    return;
  }

  // Non-saturating: every remaining case traps. The trap stubs are emitted
  // by the destructor of `traps` at the end of this function.
  AutoHandleWasmTruncateToIntErrors traps(*this, trapSiteDesc);

  // Eagerly take care of NaNs.
  branchDouble(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);

  // For unsigned, fall through to intOverflow failure case.
  if (isUnsigned) {
    return;
  }

  // Handle special values.

  // We've used vcvttsd2si. The only valid double values that can
  // truncate to INT32_MIN are in ]INT32_MIN - 1; INT32_MIN].
  ScratchDoubleScope fpscratch(*this);
  loadConstantDouble(double(INT32_MIN) - 1.0, fpscratch);
  branchDouble(Assembler::DoubleLessThanOrEqual, input, fpscratch,
               &traps.intOverflow);

  // Any remaining positive input is out of range.
  loadConstantDouble(0.0, fpscratch);
  branchDouble(Assembler::DoubleGreaterThan, input, fpscratch,
               &traps.intOverflow);
  // Otherwise the input truly truncates to INT32_MIN; the sentinel already
  // in `output` is the correct answer.
  jump(rejoin);
}
    916 
// Out-of-line path for float32 -> int32 truncation; the float analogue of
// oolWasmTruncateCheckF64ToI32 above.
void MacroAssembler::oolWasmTruncateCheckF32ToI32(
    FloatRegister input, Register output, TruncFlags flags,
    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
  bool isUnsigned = flags & TRUNC_UNSIGNED;
  bool isSaturating = flags & TRUNC_SATURATING;

  if (isSaturating) {
    if (isUnsigned) {
      // Negative overflow and NaN both are converted to 0, and the only
      // other case is positive overflow which is converted to
      // UINT32_MAX.
      Label nonNegative;
      ScratchFloat32Scope fpscratch(*this);
      loadConstantFloat32(0.0f, fpscratch);
      branchFloat(Assembler::DoubleGreaterThanOrEqual, input, fpscratch,
                  &nonNegative);
      move32(Imm32(0), output);
      jump(rejoin);

      bind(&nonNegative);
      move32(Imm32(UINT32_MAX), output);
    } else {
      // Negative overflow is already saturated to INT32_MIN, so we only
      // have to handle NaN and positive overflow here.
      Label notNaN;
      branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
      move32(Imm32(0), output);
      jump(rejoin);

      bind(&notNaN);
      // input < 0 is negative overflow: output already holds INT32_MIN.
      ScratchFloat32Scope fpscratch(*this);
      loadConstantFloat32(0.0f, fpscratch);
      branchFloat(Assembler::DoubleLessThan, input, fpscratch, rejoin);
      // Positive overflow: INT32_MIN - 1 wraps to INT32_MAX.
      sub32(Imm32(1), output);
    }
    jump(rejoin);
    return;
  }

  // Non-saturating: every remaining case traps (stubs emitted by `traps`'
  // destructor).
  AutoHandleWasmTruncateToIntErrors traps(*this, trapSiteDesc);

  // Eagerly take care of NaNs.
  branchFloat(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);

  // For unsigned, fall through to intOverflow failure case.
  if (isUnsigned) {
    return;
  }

  // Handle special values.

  // We've used vcvttss2si. Check that the input wasn't
  // float(INT32_MIN), which is the only legimitate input that
  // would truncate to INT32_MIN.
  ScratchFloat32Scope fpscratch(*this);
  loadConstantFloat32(float(INT32_MIN), fpscratch);
  branchFloat(Assembler::DoubleNotEqual, input, fpscratch, &traps.intOverflow);
  jump(rejoin);
}
    976 
// Out-of-line path for double -> int64 truncation: `output` holds the
// 64-bit failure sentinel produced by the truncating conversion. Saturate
// per the wasm rules and jump back to `rejoin`, or trap.
void MacroAssembler::oolWasmTruncateCheckF64ToI64(
    FloatRegister input, Register64 output, TruncFlags flags,
    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
  bool isUnsigned = flags & TRUNC_UNSIGNED;
  bool isSaturating = flags & TRUNC_SATURATING;

  if (isSaturating) {
    if (isUnsigned) {
      // Negative overflow and NaN both are converted to 0, and the only
      // other case is positive overflow which is converted to
      // UINT64_MAX.
      Label positive;
      ScratchDoubleScope fpscratch(*this);
      loadConstantDouble(0.0, fpscratch);
      branchDouble(Assembler::DoubleGreaterThan, input, fpscratch, &positive);
      move64(Imm64(0), output);
      jump(rejoin);

      bind(&positive);
      move64(Imm64(UINT64_MAX), output);
    } else {
      // Negative overflow is already saturated to INT64_MIN, so we only
      // have to handle NaN and positive overflow here.
      Label notNaN;
      branchDouble(Assembler::DoubleOrdered, input, input, &notNaN);
      move64(Imm64(0), output);
      jump(rejoin);

      bind(&notNaN);
      // input < 0 is negative overflow: output already holds INT64_MIN.
      ScratchDoubleScope fpscratch(*this);
      loadConstantDouble(0.0, fpscratch);
      branchDouble(Assembler::DoubleLessThan, input, fpscratch, rejoin);
      // Positive overflow: INT64_MIN - 1 wraps to INT64_MAX.
      sub64(Imm64(1), output);
    }
    jump(rejoin);
    return;
  }

  // Non-saturating: remaining cases trap (stubs emitted by `traps`' dtor).
  AutoHandleWasmTruncateToIntErrors traps(*this, trapSiteDesc);

  // Eagerly take care of NaNs.
  branchDouble(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);

  // Handle special values.
  if (isUnsigned) {
    // For unsigned truncation only values in ]-1; 0] can legitimately reach
    // here (they truncate to 0); anything > 0 or <= -1 is out of range.
    ScratchDoubleScope fpscratch(*this);
    loadConstantDouble(0.0, fpscratch);
    branchDouble(Assembler::DoubleGreaterThan, input, fpscratch,
                 &traps.intOverflow);
    loadConstantDouble(-1.0, fpscratch);
    branchDouble(Assembler::DoubleLessThanOrEqual, input, fpscratch,
                 &traps.intOverflow);
    jump(rejoin);
    return;
  }

  // We've used vcvttsd2sq. The only legit value whose i64
  // truncation is INT64_MIN is double(INT64_MIN): exponent is so
  // high that the highest resolution around is much more than 1.
  ScratchDoubleScope fpscratch(*this);
  loadConstantDouble(double(int64_t(INT64_MIN)), fpscratch);
  branchDouble(Assembler::DoubleNotEqual, input, fpscratch, &traps.intOverflow);
  jump(rejoin);
}
   1041 
// Out-of-line path for float32 -> int64 truncation; the float analogue of
// oolWasmTruncateCheckF64ToI64 above.
void MacroAssembler::oolWasmTruncateCheckF32ToI64(
    FloatRegister input, Register64 output, TruncFlags flags,
    const wasm::TrapSiteDesc& trapSiteDesc, Label* rejoin) {
  bool isUnsigned = flags & TRUNC_UNSIGNED;
  bool isSaturating = flags & TRUNC_SATURATING;

  if (isSaturating) {
    if (isUnsigned) {
      // Negative overflow and NaN both are converted to 0, and the only
      // other case is positive overflow which is converted to
      // UINT64_MAX.
      Label positive;
      ScratchFloat32Scope fpscratch(*this);
      loadConstantFloat32(0.0f, fpscratch);
      branchFloat(Assembler::DoubleGreaterThan, input, fpscratch, &positive);
      move64(Imm64(0), output);
      jump(rejoin);

      bind(&positive);
      move64(Imm64(UINT64_MAX), output);
    } else {
      // Negative overflow is already saturated to INT64_MIN, so we only
      // have to handle NaN and positive overflow here.
      Label notNaN;
      branchFloat(Assembler::DoubleOrdered, input, input, &notNaN);
      move64(Imm64(0), output);
      jump(rejoin);

      bind(&notNaN);
      // input < 0 is negative overflow: output already holds INT64_MIN.
      ScratchFloat32Scope fpscratch(*this);
      loadConstantFloat32(0.0f, fpscratch);
      branchFloat(Assembler::DoubleLessThan, input, fpscratch, rejoin);
      // Positive overflow: INT64_MIN - 1 wraps to INT64_MAX.
      sub64(Imm64(1), output);
    }
    jump(rejoin);
    return;
  }

  // Non-saturating: remaining cases trap (stubs emitted by `traps`' dtor).
  AutoHandleWasmTruncateToIntErrors traps(*this, trapSiteDesc);

  // Eagerly take care of NaNs.
  branchFloat(Assembler::DoubleUnordered, input, input, &traps.inputIsNaN);

  // Handle special values.
  if (isUnsigned) {
    // Only values in ]-1; 0] legitimately reach here (they truncate to 0);
    // anything > 0 or <= -1 is out of range.
    ScratchFloat32Scope fpscratch(*this);
    loadConstantFloat32(0.0f, fpscratch);
    branchFloat(Assembler::DoubleGreaterThan, input, fpscratch,
                &traps.intOverflow);
    loadConstantFloat32(-1.0f, fpscratch);
    branchFloat(Assembler::DoubleLessThanOrEqual, input, fpscratch,
                &traps.intOverflow);
    jump(rejoin);
    return;
  }

  // We've used vcvttss2sq. See comment in oolWasmTruncateCheckF64ToI64.
  ScratchFloat32Scope fpscratch(*this);
  loadConstantFloat32(float(int64_t(INT64_MIN)), fpscratch);
  branchFloat(Assembler::DoubleNotEqual, input, fpscratch, &traps.intOverflow);
  jump(rejoin);
}
   1104 
// On x86-shared there is nothing wasm-specific to do here: the generic fake
// exit frame layout is used as-is.
void MacroAssembler::enterFakeExitFrameForWasm(Register cxreg, Register scratch,
                                               ExitFrameType type) {
  enterFakeExitFrame(cxreg, scratch, type);
}
   1109 
// Emit `subl $imm8, *address; js label` with a placeholder immediate; the
// real decrement is written later by patchSub32FromMemAndBranchIfNegative.
// Returns the offset the patcher needs.
CodeOffset MacroAssembler::sub32FromMemAndBranchIfNegativeWithPatch(
    Address address, Label* label) {
  // -128 is arbitrary, but makes `*address` count upwards, which may help
  // to identify cases where the subsequent ::patch..() call was forgotten.
  int numImmBytes = subl(Imm32(-128), Operand(address));
  // This is vitally important for patching: the immediate must occupy
  // exactly one byte, since the patcher rewrites the byte just before the
  // returned offset.
  MOZ_RELEASE_ASSERT(numImmBytes == 1);
  // Points immediately after the location to patch
  CodeOffset patchPoint = CodeOffset(currentOffset());
  jSrc(Condition::Signed, label);
  return patchPoint;
}
   1122 
// Overwrite the placeholder imm8 emitted by
// sub32FromMemAndBranchIfNegativeWithPatch with the real decrement value.
void MacroAssembler::patchSub32FromMemAndBranchIfNegative(CodeOffset offset,
                                                          Imm32 imm) {
  int32_t val = imm.value;
  // Patching it to zero would make the insn pointless; larger values would
  // not fit in the single immediate byte reserved above.
  MOZ_RELEASE_ASSERT(val >= 1 && val <= 127);
  // `offset` points just past the one-byte immediate of the subl.
  uint8_t* ptr = (uint8_t*)masm.data() + offset.offset() - 1;
  MOZ_RELEASE_ASSERT(*ptr == uint8_t(-128));  // as created above
  *ptr = uint8_t(val) & 0x7F;
}
   1132 
   1133 // ========================================================================
   1134 // Primitive atomic operations.
   1135 
   1136 static void ExtendTo32(MacroAssembler& masm, Scalar::Type type, Register r) {
   1137  switch (type) {
   1138    case Scalar::Int8:
   1139      masm.movsbl(r, r);
   1140      break;
   1141    case Scalar::Uint8:
   1142      masm.movzbl(r, r);
   1143      break;
   1144    case Scalar::Int16:
   1145      masm.movswl(r, r);
   1146      break;
   1147    case Scalar::Uint16:
   1148      masm.movzwl(r, r);
   1149      break;
   1150    case Scalar::Int32:
   1151    case Scalar::Uint32:
   1152      break;
   1153    default:
   1154      MOZ_CRASH("unexpected type");
   1155  }
   1156 }
   1157 
   1158 #ifdef DEBUG
   1159 static inline bool IsByteReg(Register r) {
   1160  AllocatableGeneralRegisterSet byteRegs(Registers::SingleByteRegs);
   1161  return byteRegs.has(r);
   1162 }
   1163 
static inline bool IsByteReg(Imm32 r) {
  // Immediates have no register constraint; accept unconditionally so the
  // debug assertions below can be written generically over Register/Imm32.
  return true;
}
   1168 #endif
   1169 
// Emit a sized LOCK CMPXCHG on `mem`: if *mem == oldval, store newval,
// otherwise leave memory unchanged. The previous memory value is returned
// in `output` (which must be eax), widened to 32 bits.
template <typename T>
static void CompareExchange(MacroAssembler& masm,
                            const wasm::MemoryAccessDesc* access,
                            Scalar::Type type, const T& mem, Register oldval,
                            Register newval, Register output) {
  // CMPXCHG implicitly uses eax as the comparand and result register.
  MOZ_ASSERT(output == eax);

  if (oldval != output) {
    masm.movl(oldval, output);
  }

  // For wasm, record a trap site at the potentially-faulting instruction.
  if (access) {
    masm.append(*access, wasm::TrapMachineInsn::Atomic,
                FaultingCodeOffset(masm.currentOffset()));
  }

  // NOTE: the generated code must match the assembly code in gen_cmpxchg in
  // GenerateAtomicOperations.py
  switch (Scalar::byteSize(type)) {
    case 1:
      // Byte-sized ops need a register with an 8-bit encoding.
      MOZ_ASSERT(IsByteReg(newval));
      masm.lock_cmpxchgb(newval, Operand(mem));
      break;
    case 2:
      masm.lock_cmpxchgw(newval, Operand(mem));
      break;
    case 4:
      masm.lock_cmpxchgl(newval, Operand(mem));
      break;
    default:
      MOZ_CRASH("Invalid");
  }

  // Widen the loaded value per the element type's signedness.
  ExtendTo32(masm, type, output);
}
   1205 
// JS compareExchange entry points. The Synchronization argument is unnamed
// and unused: the LOCK-prefixed instruction provides the ordering on x86.
void MacroAssembler::compareExchange(Scalar::Type type, Synchronization,
                                     const Address& mem, Register oldval,
                                     Register newval, Register output) {
  CompareExchange(*this, nullptr, type, mem, oldval, newval, output);
}

void MacroAssembler::compareExchange(Scalar::Type type, Synchronization,
                                     const BaseIndex& mem, Register oldval,
                                     Register newval, Register output) {
  CompareExchange(*this, nullptr, type, mem, oldval, newval, output);
}
   1217 
// Wasm compareExchange entry points: same codegen as compareExchange, but
// pass `access` so a trap site is recorded for the faulting instruction.
void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
                                         const Address& mem, Register oldval,
                                         Register newval, Register output) {
  CompareExchange(*this, &access, access.type(), mem, oldval, newval, output);
}

void MacroAssembler::wasmCompareExchange(const wasm::MemoryAccessDesc& access,
                                         const BaseIndex& mem, Register oldval,
                                         Register newval, Register output) {
  CompareExchange(*this, &access, access.type(), mem, oldval, newval, output);
}
   1229 
// Emit a sized XCHG on `mem`: atomically store `value` and return the old
// memory contents in `output`, widened to 32 bits. XCHG with a memory
// operand asserts the bus lock implicitly, so no LOCK prefix is needed.
template <typename T>
static void AtomicExchange(MacroAssembler& masm,
                           const wasm::MemoryAccessDesc* access,
                           Scalar::Type type, const T& mem, Register value,
                           Register output)
// NOTE: the generated code must match the assembly code in gen_exchange in
// GenerateAtomicOperations.py
{
  if (value != output) {
    masm.movl(value, output);
  }

  // For wasm, record a trap site at the potentially-faulting instruction.
  if (access) {
    masm.append(*access, wasm::TrapMachineInsn::Atomic,
                FaultingCodeOffset(masm.currentOffset()));
  }

  switch (Scalar::byteSize(type)) {
    case 1:
      // Byte-sized ops need a register with an 8-bit encoding.
      MOZ_ASSERT(IsByteReg(output));
      masm.xchgb(output, Operand(mem));
      break;
    case 2:
      masm.xchgw(output, Operand(mem));
      break;
    case 4:
      masm.xchgl(output, Operand(mem));
      break;
    default:
      MOZ_CRASH("Invalid");
  }
  // Widen the loaded value per the element type's signedness.
  ExtendTo32(masm, type, output);
}
   1263 
// JS atomicExchange entry points; Synchronization is unused (XCHG is
// implicitly locked on x86).
void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization,
                                    const Address& mem, Register value,
                                    Register output) {
  AtomicExchange(*this, nullptr, type, mem, value, output);
}

void MacroAssembler::atomicExchange(Scalar::Type type, Synchronization,
                                    const BaseIndex& mem, Register value,
                                    Register output) {
  AtomicExchange(*this, nullptr, type, mem, value, output);
}
   1275 
// Wasm atomicExchange entry points: record a trap site via `access`.
void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
                                        const Address& mem, Register value,
                                        Register output) {
  AtomicExchange(*this, &access, access.type(), mem, value, output);
}

void MacroAssembler::wasmAtomicExchange(const wasm::MemoryAccessDesc& access,
                                        const BaseIndex& mem, Register value,
                                        Register output) {
  AtomicExchange(*this, &access, access.type(), mem, value, output);
}
   1287 
   1288 static void SetupValue(MacroAssembler& masm, AtomicOp op, Imm32 src,
   1289                       Register output) {
   1290  if (op == AtomicOp::Sub) {
   1291    masm.movl(Imm32(-src.value), output);
   1292  } else {
   1293    masm.movl(src, output);
   1294  }
   1295 }
   1296 
   1297 static void SetupValue(MacroAssembler& masm, AtomicOp op, Register src,
   1298                       Register output) {
   1299  if (src != output) {
   1300    masm.movl(src, output);
   1301  }
   1302  if (op == AtomicOp::Sub) {
   1303    masm.negl(output);
   1304  }
   1305 }
   1306 
   1307 static auto WasmTrapMachineInsn(Scalar::Type arrayType, AtomicOp op) {
   1308  switch (op) {
   1309    case AtomicOp::Add:
   1310    case AtomicOp::Sub:
   1311      return wasm::TrapMachineInsn::Atomic;
   1312    case AtomicOp::And:
   1313    case AtomicOp::Or:
   1314    case AtomicOp::Xor:
   1315      switch (arrayType) {
   1316        case Scalar::Int8:
   1317        case Scalar::Uint8:
   1318          return wasm::TrapMachineInsn::Load8;
   1319        case Scalar::Int16:
   1320        case Scalar::Uint16:
   1321          return wasm::TrapMachineInsn::Load16;
   1322        case Scalar::Int32:
   1323        case Scalar::Uint32:
   1324          return wasm::TrapMachineInsn::Load32;
   1325        default:
   1326          break;
   1327      }
   1328      [[fallthrough]];
   1329    default:
   1330      break;
   1331  }
   1332  MOZ_CRASH();
   1333 }
   1334 
// Emit an atomic fetch-<op> on `mem`: atomically apply `op` with `value`
// and return the OLD memory contents in `output`, widened to 32 bits.
// Add/Sub compile to a single LOCK XADD (no temp); And/Or/Xor compile to a
// LOCK CMPXCHG loop with eax as the accumulator and `temp` as the proposed
// new value.
template <typename T, typename V>
static void AtomicFetchOp(MacroAssembler& masm,
                          const wasm::MemoryAccessDesc* access,
                          Scalar::Type arrayType, AtomicOp op, V value,
                          const T& mem, Register temp, Register output) {
  // Note value can be an Imm or a Register.

  // NOTE: the generated code must match the assembly code in gen_fetchop in
  // GenerateAtomicOperations.py

  // Setup the output register.
  switch (op) {
    case AtomicOp::Add:
    case AtomicOp::Sub:
      MOZ_ASSERT(temp == InvalidReg);
      MOZ_ASSERT_IF(Scalar::byteSize(arrayType) == 1,
                    IsByteReg(output) && IsByteReg(value));

      // XADD only adds; Sub is handled by negating the value here.
      SetupValue(masm, op, value, output);
      break;
    case AtomicOp::And:
    case AtomicOp::Or:
    case AtomicOp::Xor:
      // The CAS loop below implicitly uses eax as comparand and result.
      MOZ_ASSERT(output != temp && output == eax);
      MOZ_ASSERT_IF(Scalar::byteSize(arrayType) == 1,
                    IsByteReg(output) && IsByteReg(temp));

      // Bitwise operations don't require any additional setup.
      break;
    default:
      MOZ_CRASH();
  }

  // Sized LOCK XADD: old *mem -> output, *mem += output (atomically).
  auto lock_xadd = [&]() {
    switch (arrayType) {
      case Scalar::Int8:
      case Scalar::Uint8:
        masm.lock_xaddb(output, Operand(mem));
        break;
      case Scalar::Int16:
      case Scalar::Uint16:
        masm.lock_xaddw(output, Operand(mem));
        break;
      case Scalar::Int32:
      case Scalar::Uint32:
        masm.lock_xaddl(output, Operand(mem));
        break;
      default:
        MOZ_CRASH();
    }
  };

  // Sized zero-extending load of *mem into eax.
  auto load = [&]() {
    switch (arrayType) {
      case Scalar::Int8:
      case Scalar::Uint8:
        masm.movzbl(Operand(mem), eax);
        break;
      case Scalar::Int16:
      case Scalar::Uint16:
        masm.movzwl(Operand(mem), eax);
        break;
      case Scalar::Int32:
      case Scalar::Uint32:
        masm.movl(Operand(mem), eax);
        break;
      default:
        MOZ_CRASH();
    }
  };

  // temp = temp <op> value, for the bitwise operations.
  auto bitwiseOp = [&]() {
    switch (op) {
      case AtomicOp::And:
        masm.andl(value, temp);
        break;
      case AtomicOp::Or:
        masm.orl(value, temp);
        break;
      case AtomicOp::Xor:
        masm.xorl(value, temp);
        break;
      default:
        MOZ_CRASH();
    }
  };

  // Sized LOCK CMPXCHG of `temp` against *mem, comparand/result in eax.
  auto lock_cmpxchg = [&]() {
    switch (arrayType) {
      case Scalar::Int8:
      case Scalar::Uint8:
        masm.lock_cmpxchgb(temp, Operand(mem));
        break;
      case Scalar::Int16:
      case Scalar::Uint16:
        masm.lock_cmpxchgw(temp, Operand(mem));
        break;
      case Scalar::Int32:
      case Scalar::Uint32:
        masm.lock_cmpxchgl(temp, Operand(mem));
        break;
      default:
        MOZ_CRASH();
    }
  };

  // Add trap instruction directly before the load.
  if (access) {
    masm.append(*access, WasmTrapMachineInsn(arrayType, op),
                FaultingCodeOffset(masm.currentOffset()));
  }

  switch (op) {
    case AtomicOp::Add:
    case AtomicOp::Sub:
      // `add` and `sub` operations can be optimized with XADD.
      lock_xadd();

      ExtendTo32(masm, arrayType, output);
      break;

    case AtomicOp::And:
    case AtomicOp::Or:
    case AtomicOp::Xor: {
      // Bitwise operations need a CAS loop.

      // Load memory into eax.
      load();

      // Loop.
      Label again;
      masm.bind(&again);
      masm.movl(eax, temp);

      // temp = temp <op> value.
      bitwiseOp();

      // Compare and swap `temp` with memory.
      lock_cmpxchg();

      // Repeat if the comparison failed.
      masm.j(MacroAssembler::NonZero, &again);

      // Sign-extend the zero-extended load.
      if (Scalar::isSignedIntType(arrayType)) {
        ExtendTo32(masm, arrayType, eax);
      }
      break;
    }

    default:
      MOZ_CRASH();
  }
}
   1489 
// JS atomic fetch-op entry points; Synchronization is unnamed and unused
// (the LOCK-prefixed instructions provide the ordering on x86).
void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
                                   AtomicOp op, Register value,
                                   const BaseIndex& mem, Register temp,
                                   Register output) {
  AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
}

void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
                                   AtomicOp op, Register value,
                                   const Address& mem, Register temp,
                                   Register output) {
  AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
}

void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
                                   AtomicOp op, Imm32 value,
                                   const BaseIndex& mem, Register temp,
                                   Register output) {
  AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
}

void MacroAssembler::atomicFetchOp(Scalar::Type arrayType, Synchronization,
                                   AtomicOp op, Imm32 value, const Address& mem,
                                   Register temp, Register output) {
  AtomicFetchOp(*this, nullptr, arrayType, op, value, mem, temp, output);
}
   1516 
// Wasm atomic fetch-op entry points: same codegen, but pass `access` so a
// trap site is recorded for the faulting instruction.
void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
                                       AtomicOp op, Register value,
                                       const Address& mem, Register temp,
                                       Register output) {
  AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
}

void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
                                       AtomicOp op, Imm32 value,
                                       const Address& mem, Register temp,
                                       Register output) {
  AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
}

void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
                                       AtomicOp op, Register value,
                                       const BaseIndex& mem, Register temp,
                                       Register output) {
  AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
}

void MacroAssembler::wasmAtomicFetchOp(const wasm::MemoryAccessDesc& access,
                                       AtomicOp op, Imm32 value,
                                       const BaseIndex& mem, Register temp,
                                       Register output) {
  AtomicFetchOp(*this, &access, access.type(), op, value, mem, temp, output);
}
   1544 
// Emit an atomic read-modify-write on `mem` whose result value is not
// needed: a single sized LOCK ADD/SUB/AND/OR/XOR suffices, so no XADD or
// CAS loop is required. `value` can be an Imm or a Register.
template <typename T, typename V>
static void AtomicEffectOp(MacroAssembler& masm,
                           const wasm::MemoryAccessDesc* access,
                           Scalar::Type arrayType, AtomicOp op, V value,
                           const T& mem) {
  // For wasm, record a trap site at the potentially-faulting instruction.
  if (access) {
    masm.append(*access, wasm::TrapMachineInsn::Atomic,
                FaultingCodeOffset(masm.currentOffset()));
  }

  // Dispatch on operand size first, then on the operation.
  switch (Scalar::byteSize(arrayType)) {
    case 1:
      switch (op) {
        case AtomicOp::Add:
          masm.lock_addb(value, Operand(mem));
          break;
        case AtomicOp::Sub:
          masm.lock_subb(value, Operand(mem));
          break;
        case AtomicOp::And:
          masm.lock_andb(value, Operand(mem));
          break;
        case AtomicOp::Or:
          masm.lock_orb(value, Operand(mem));
          break;
        case AtomicOp::Xor:
          masm.lock_xorb(value, Operand(mem));
          break;
        default:
          MOZ_CRASH();
      }
      break;
    case 2:
      switch (op) {
        case AtomicOp::Add:
          masm.lock_addw(value, Operand(mem));
          break;
        case AtomicOp::Sub:
          masm.lock_subw(value, Operand(mem));
          break;
        case AtomicOp::And:
          masm.lock_andw(value, Operand(mem));
          break;
        case AtomicOp::Or:
          masm.lock_orw(value, Operand(mem));
          break;
        case AtomicOp::Xor:
          masm.lock_xorw(value, Operand(mem));
          break;
        default:
          MOZ_CRASH();
      }
      break;
    case 4:
      switch (op) {
        case AtomicOp::Add:
          masm.lock_addl(value, Operand(mem));
          break;
        case AtomicOp::Sub:
          masm.lock_subl(value, Operand(mem));
          break;
        case AtomicOp::And:
          masm.lock_andl(value, Operand(mem));
          break;
        case AtomicOp::Or:
          masm.lock_orl(value, Operand(mem));
          break;
        case AtomicOp::Xor:
          masm.lock_xorl(value, Operand(mem));
          break;
        default:
          MOZ_CRASH();
      }
      break;
    default:
      MOZ_CRASH();
  }
}
   1623 
   1624 void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
   1625                                        AtomicOp op, Register value,
   1626                                        const Address& mem, Register temp) {
   1627  MOZ_ASSERT(temp == InvalidReg);
   1628  AtomicEffectOp(*this, &access, access.type(), op, value, mem);
   1629 }
   1630 
   1631 void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
   1632                                        AtomicOp op, Imm32 value,
   1633                                        const Address& mem, Register temp) {
   1634  MOZ_ASSERT(temp == InvalidReg);
   1635  AtomicEffectOp(*this, &access, access.type(), op, value, mem);
   1636 }
   1637 
// Wasm atomic read-modify-write (by register value, BaseIndex-addressed)
// whose result is unused. `temp` must be InvalidReg; no temporary is
// required.
void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
                                        AtomicOp op, Register value,
                                        const BaseIndex& mem, Register temp) {
  MOZ_ASSERT(temp == InvalidReg);
  AtomicEffectOp(*this, &access, access.type(), op, value, mem);
}
   1644 
// Wasm atomic read-modify-write (by immediate value, BaseIndex-addressed)
// whose result is unused. `temp` must be InvalidReg; no temporary is
// required.
void MacroAssembler::wasmAtomicEffectOp(const wasm::MemoryAccessDesc& access,
                                        AtomicOp op, Imm32 value,
                                        const BaseIndex& mem, Register temp) {
  MOZ_ASSERT(temp == InvalidReg);
  AtomicEffectOp(*this, &access, access.type(), op, value, mem);
}
   1651 
   1652 // ========================================================================
   1653 // JS atomic operations.
   1654 
   1655 template <typename T>
   1656 static void CompareExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
   1657                              Synchronization sync, const T& mem,
   1658                              Register oldval, Register newval, Register temp,
   1659                              AnyRegister output) {
   1660  if (arrayType == Scalar::Uint32) {
   1661    masm.compareExchange(arrayType, sync, mem, oldval, newval, temp);
   1662    masm.convertUInt32ToDouble(temp, output.fpu());
   1663  } else {
   1664    masm.compareExchange(arrayType, sync, mem, oldval, newval, output.gpr());
   1665  }
   1666 }
   1667 
// JS typed-array compareExchange on an Address-addressed element; forwards to
// the shared CompareExchangeJS helper (which handles the Uint32-to-double
// result conversion).
void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
                                       Synchronization sync, const Address& mem,
                                       Register oldval, Register newval,
                                       Register temp, AnyRegister output) {
  CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, temp, output);
}
   1674 
// JS typed-array compareExchange on a BaseIndex-addressed element; forwards
// to the shared CompareExchangeJS helper (which handles the Uint32-to-double
// result conversion).
void MacroAssembler::compareExchangeJS(Scalar::Type arrayType,
                                       Synchronization sync,
                                       const BaseIndex& mem, Register oldval,
                                       Register newval, Register temp,
                                       AnyRegister output) {
  CompareExchangeJS(*this, arrayType, sync, mem, oldval, newval, temp, output);
}
   1682 
   1683 template <typename T>
   1684 static void AtomicExchangeJS(MacroAssembler& masm, Scalar::Type arrayType,
   1685                             Synchronization sync, const T& mem, Register value,
   1686                             Register temp, AnyRegister output) {
   1687  if (arrayType == Scalar::Uint32) {
   1688    masm.atomicExchange(arrayType, sync, mem, value, temp);
   1689    masm.convertUInt32ToDouble(temp, output.fpu());
   1690  } else {
   1691    masm.atomicExchange(arrayType, sync, mem, value, output.gpr());
   1692  }
   1693 }
   1694 
// JS typed-array atomicExchange on an Address-addressed element; forwards to
// the shared AtomicExchangeJS helper.
void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
                                      Synchronization sync, const Address& mem,
                                      Register value, Register temp,
                                      AnyRegister output) {
  AtomicExchangeJS(*this, arrayType, sync, mem, value, temp, output);
}
   1701 
// JS typed-array atomicExchange on a BaseIndex-addressed element; forwards to
// the shared AtomicExchangeJS helper.
void MacroAssembler::atomicExchangeJS(Scalar::Type arrayType,
                                      Synchronization sync,
                                      const BaseIndex& mem, Register value,
                                      Register temp, AnyRegister output) {
  AtomicExchangeJS(*this, arrayType, sync, mem, value, temp, output);
}
   1708 
   1709 template <typename T>
   1710 static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
   1711                            Synchronization sync, AtomicOp op, Register value,
   1712                            const T& mem, Register temp1, Register temp2,
   1713                            AnyRegister output) {
   1714  if (arrayType == Scalar::Uint32) {
   1715    masm.atomicFetchOp(arrayType, sync, op, value, mem, temp2, temp1);
   1716    masm.convertUInt32ToDouble(temp1, output.fpu());
   1717  } else {
   1718    masm.atomicFetchOp(arrayType, sync, op, value, mem, temp1, output.gpr());
   1719  }
   1720 }
   1721 
// JS typed-array fetch-and-op (register operand, Address-addressed); forwards
// to the shared AtomicFetchOpJS helper.
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
                                     Synchronization sync, AtomicOp op,
                                     Register value, const Address& mem,
                                     Register temp1, Register temp2,
                                     AnyRegister output) {
  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
}
   1729 
// JS typed-array fetch-and-op (register operand, BaseIndex-addressed);
// forwards to the shared AtomicFetchOpJS helper.
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
                                     Synchronization sync, AtomicOp op,
                                     Register value, const BaseIndex& mem,
                                     Register temp1, Register temp2,
                                     AnyRegister output) {
  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
}
   1737 
// JS typed-array atomic op with unused result (register operand,
// BaseIndex-addressed). Passes nullptr as the access descriptor since there
// is no wasm memory-access metadata for JS arrays; `temp` must be InvalidReg.
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType, Synchronization,
                                      AtomicOp op, Register value,
                                      const BaseIndex& mem, Register temp) {
  MOZ_ASSERT(temp == InvalidReg);
  AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
}
   1744 
// JS typed-array atomic op with unused result (register operand,
// Address-addressed). Passes nullptr as the access descriptor since there is
// no wasm memory-access metadata for JS arrays; `temp` must be InvalidReg.
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType, Synchronization,
                                      AtomicOp op, Register value,
                                      const Address& mem, Register temp) {
  MOZ_ASSERT(temp == InvalidReg);
  AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
}
   1751 
// JS typed-array atomic op with unused result (immediate operand,
// Address-addressed). Passes nullptr as the access descriptor since there is
// no wasm memory-access metadata for JS arrays; `temp` must be InvalidReg.
void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType, Synchronization,
                                      AtomicOp op, Imm32 value,
                                      const Address& mem, Register temp) {
  MOZ_ASSERT(temp == InvalidReg);
  AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
}
   1758 
   1759 void MacroAssembler::atomicEffectOpJS(Scalar::Type arrayType,
   1760                                      Synchronization sync, AtomicOp op,
   1761                                      Imm32 value, const BaseIndex& mem,
   1762                                      Register temp) {
   1763  MOZ_ASSERT(temp == InvalidReg);
   1764  AtomicEffectOp(*this, nullptr, arrayType, op, value, mem);
   1765 }
   1766 
   1767 template <typename T>
   1768 static void AtomicFetchOpJS(MacroAssembler& masm, Scalar::Type arrayType,
   1769                            Synchronization sync, AtomicOp op, Imm32 value,
   1770                            const T& mem, Register temp1, Register temp2,
   1771                            AnyRegister output) {
   1772  if (arrayType == Scalar::Uint32) {
   1773    masm.atomicFetchOp(arrayType, sync, op, value, mem, temp2, temp1);
   1774    masm.convertUInt32ToDouble(temp1, output.fpu());
   1775  } else {
   1776    masm.atomicFetchOp(arrayType, sync, op, value, mem, temp1, output.gpr());
   1777  }
   1778 }
   1779 
// JS typed-array fetch-and-op (immediate operand, Address-addressed);
// forwards to the shared AtomicFetchOpJS helper.
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
                                     Synchronization sync, AtomicOp op,
                                     Imm32 value, const Address& mem,
                                     Register temp1, Register temp2,
                                     AnyRegister output) {
  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
}
   1787 
// JS typed-array fetch-and-op (immediate operand, BaseIndex-addressed);
// forwards to the shared AtomicFetchOpJS helper.
void MacroAssembler::atomicFetchOpJS(Scalar::Type arrayType,
                                     Synchronization sync, AtomicOp op,
                                     Imm32 value, const BaseIndex& mem,
                                     Register temp1, Register temp2,
                                     AnyRegister output) {
  AtomicFetchOpJS(*this, arrayType, sync, op, value, mem, temp1, temp2, output);
}
   1795 
   1796 void MacroAssembler::atomicPause() { masm.pause(); }
   1797 
   1798 // ========================================================================
   1799 // Spectre Mitigations.
   1800 
// Emit a barrier that stops speculative execution at this point (Spectre
// mitigation).
void MacroAssembler::speculationBarrier() {
  // Spectre mitigation recommended by Intel and AMD suggest to use lfence as
  // a way to force all speculative execution of instructions to end.
  // LFENCE is only serializing in this sense on SSE2-capable hardware.
  MOZ_ASSERT(HasSSE2());
  masm.lfence();
}
   1807 
// Compute floor(src) into dest as an int32, branching to `fail` when the
// result is not representable as an int32 (NaN, out of range) or is -0
// (which JS must preserve as a double). `src` is left unmodified.
void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest,
                                         Label* fail) {
  if (HasSSE41()) {
    // Fail on negative-zero.
    branchNegativeZeroFloat32(src, dest, fail);

    // Round toward -Infinity.
    {
      ScratchFloat32Scope scratch(*this);
      vroundss(X86Encoding::RoundDown, src, scratch);
      truncateFloat32ToInt32(scratch, dest, fail);
    }
  } else {
    Label negative, end;

    // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
    {
      ScratchFloat32Scope scratch(*this);
      zeroFloat32(scratch);
      branchFloat(Assembler::DoubleLessThan, src, scratch, &negative);
    }

    // Fail on negative-zero.
    branchNegativeZeroFloat32(src, dest, fail);

    // Input is non-negative, so truncation correctly rounds.
    // (truncateFloat32ToInt32 handles the NaN/overflow failure cases.)
    truncateFloat32ToInt32(src, dest, fail);
    jump(&end);

    // Input is negative, but isn't -0.
    // Negative values go on a comparatively expensive path, since no
    // native rounding mode matches JS semantics. Still better than callVM.
    bind(&negative);
    {
      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttss2si instead of truncateFloat32ToInt32 because we
      // want to perform failure handling ourselves.
      vcvttss2si(src, dest);

      // Test whether the input double was integer-valued.
      {
        ScratchFloat32Scope scratch(*this);
        convertInt32ToFloat32(dest, scratch);
        branchFloat(Assembler::DoubleEqualOrUnordered, src, scratch, &end);
      }

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttss2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }

    bind(&end);
  }
}
   1866 
// Compute floor(src) into dest as an int32, branching to `fail` when the
// result is not representable as an int32 (NaN, out of range) or is -0
// (which JS must preserve as a double). `src` is left unmodified.
// Double-precision counterpart of floorFloat32ToInt32.
void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest,
                                        Label* fail) {
  if (HasSSE41()) {
    // Fail on negative-zero.
    branchNegativeZero(src, dest, fail);

    // Round toward -Infinity.
    {
      ScratchDoubleScope scratch(*this);
      vroundsd(X86Encoding::RoundDown, src, scratch);
      truncateDoubleToInt32(scratch, dest, fail);
    }
  } else {
    Label negative, end;

    // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
    {
      ScratchDoubleScope scratch(*this);
      zeroDouble(scratch);
      branchDouble(Assembler::DoubleLessThan, src, scratch, &negative);
    }

    // Fail on negative-zero.
    branchNegativeZero(src, dest, fail);

    // Input is non-negative, so truncation correctly rounds.
    // (truncateDoubleToInt32 handles the NaN/overflow failure cases.)
    truncateDoubleToInt32(src, dest, fail);
    jump(&end);

    // Input is negative, but isn't -0.
    // Negative values go on a comparatively expensive path, since no
    // native rounding mode matches JS semantics. Still better than callVM.
    bind(&negative);
    {
      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttsd2si instead of truncateDoubleToInt32 because we
      // want to perform failure handling ourselves.
      vcvttsd2si(src, dest);

      // Test whether the input double was integer-valued.
      {
        ScratchDoubleScope scratch(*this);
        convertInt32ToDouble(dest, scratch);
        branchDouble(Assembler::DoubleEqualOrUnordered, src, scratch, &end);
      }

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttsd2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }

    bind(&end);
  }
}
   1925 
// Compute ceil(src) into dest as an int32, branching to `fail` when the
// result is not representable as an int32 (NaN, out of range) or is -0
// (which JS must preserve as a double). `src` is left unmodified.
void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest,
                                        Label* fail) {
  ScratchFloat32Scope scratch(*this);

  Label lessThanOrEqualMinusOne;

  // If x is in ]-1,0], ceil(x) is -0, which cannot be represented as an int32.
  // Fail if x > -1 and the sign bit is set.
  loadConstantFloat32(-1.f, scratch);
  branchFloat(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
              &lessThanOrEqualMinusOne);
  // vmovmskps extracts the sign bit into bit 0 of dest; a set sign bit here
  // means x is in ]-1, -0] and must fail.
  vmovmskps(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  if (HasSSE41()) {
    // x <= -1 or x > -0
    bind(&lessThanOrEqualMinusOne);
    // Round toward +Infinity.
    vroundss(X86Encoding::RoundUp, src, scratch);
    truncateFloat32ToInt32(scratch, dest, fail);
    return;
  }

  // No SSE4.1
  Label end;

  // x >= 0 and x is not -0.0. We can truncate integer values, and truncate and
  // add 1 to non-integer values. This will also work for values >= INT_MAX + 1,
  // as the truncate operation will return INT_MIN and we'll fail.
  truncateFloat32ToInt32(src, dest, fail);
  convertInt32ToFloat32(dest, scratch);
  branchFloat(Assembler::DoubleEqualOrUnordered, src, scratch, &end);

  // Input is not integer-valued, add 1 to obtain the ceiling value.
  // If input > INT_MAX, output == INT_MAX so adding 1 will overflow.
  branchAdd32(Assembler::Overflow, Imm32(1), dest, fail);
  jump(&end);

  // x <= -1, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateFloat32ToInt32(src, dest, fail);

  bind(&end);
}
   1970 
// Compute ceil(src) into dest as an int32, branching to `fail` when the
// result is not representable as an int32 (NaN, out of range) or is -0
// (which JS must preserve as a double). `src` is left unmodified.
// Double-precision counterpart of ceilFloat32ToInt32.
void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest,
                                       Label* fail) {
  ScratchDoubleScope scratch(*this);

  Label lessThanOrEqualMinusOne;

  // If x is in ]-1,0], ceil(x) is -0, which cannot be represented as an int32.
  // Fail if x > -1 and the sign bit is set.
  loadConstantDouble(-1.0, scratch);
  branchDouble(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
               &lessThanOrEqualMinusOne);
  // vmovmskpd extracts the sign bit into bit 0 of dest; a set sign bit here
  // means x is in ]-1, -0] and must fail.
  vmovmskpd(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  if (HasSSE41()) {
    // x <= -1 or x > -0
    bind(&lessThanOrEqualMinusOne);
    // Round toward +Infinity.
    vroundsd(X86Encoding::RoundUp, src, scratch);
    truncateDoubleToInt32(scratch, dest, fail);
    return;
  }

  // No SSE4.1
  Label end;

  // x >= 0 and x is not -0.0. We can truncate integer values, and truncate and
  // add 1 to non-integer values. This will also work for values >= INT_MAX + 1,
  // as the truncate operation will return INT_MIN and we'll fail.
  truncateDoubleToInt32(src, dest, fail);
  convertInt32ToDouble(dest, scratch);
  branchDouble(Assembler::DoubleEqualOrUnordered, src, scratch, &end);

  // Input is not integer-valued, add 1 to obtain the ceiling value.
  // If input > INT_MAX, output == INT_MAX so adding 1 will overflow.
  branchAdd32(Assembler::Overflow, Imm32(1), dest, fail);
  jump(&end);

  // x <= -1, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateDoubleToInt32(src, dest, fail);

  bind(&end);
}
   2015 
// Compute trunc(src) (round toward zero) into dest as an int32, branching to
// `fail` when the result is not representable as an int32 or is -0 (inputs
// in ]-1, -0], which JS must preserve as a double).
void MacroAssembler::truncDoubleToInt32(FloatRegister src, Register dest,
                                        Label* fail) {
  Label lessThanOrEqualMinusOne;

  // Bail on ]-1; -0] range
  {
    ScratchDoubleScope scratch(*this);
    loadConstantDouble(-1, scratch);
    branchDouble(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
                 &lessThanOrEqualMinusOne);
  }

  // Test for remaining values with the sign bit set, i.e. ]-1; -0]
  // (vmovmskpd extracts the sign bit into bit 0 of dest).
  vmovmskpd(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  // x <= -1 or x >= +0, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateDoubleToInt32(src, dest, fail);
}
   2036 
// Compute trunc(src) (round toward zero) into dest as an int32, branching to
// `fail` when the result is not representable as an int32 or is -0 (inputs
// in ]-1, -0], which JS must preserve as a double).
// Single-precision counterpart of truncDoubleToInt32.
void MacroAssembler::truncFloat32ToInt32(FloatRegister src, Register dest,
                                         Label* fail) {
  Label lessThanOrEqualMinusOne;

  // Bail on ]-1; -0] range
  {
    ScratchFloat32Scope scratch(*this);
    loadConstantFloat32(-1.f, scratch);
    branchFloat(Assembler::DoubleLessThanOrEqualOrUnordered, src, scratch,
                &lessThanOrEqualMinusOne);
  }

  // Test for remaining values with the sign bit set, i.e. ]-1; -0]
  // (vmovmskps extracts the sign bit into bit 0 of dest).
  vmovmskps(src, dest);
  branchTest32(Assembler::NonZero, dest, Imm32(1), fail);

  // x <= -1 or x >= +0, truncation is the way to go.
  bind(&lessThanOrEqualMinusOne);
  truncateFloat32ToInt32(src, dest, fail);
}
   2057 
// Compute round(src) (JS Math.round: round half toward +Infinity) into dest
// as an int32, branching to `fail` when the result is not representable as
// an int32 or is -0. `src` is left unmodified; `temp` is clobbered.
void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest,
                                         FloatRegister temp, Label* fail) {
  ScratchFloat32Scope scratch(*this);

  Label negativeOrZero, negative, end;

  // Branch to a slow path for non-positive inputs. Doesn't catch NaN.
  zeroFloat32(scratch);
  loadConstantFloat32(GetBiggestNumberLessThan(0.5f), temp);
  branchFloat(Assembler::DoubleLessThanOrEqual, src, scratch, &negativeOrZero);
  {
    // Input is strictly positive or NaN. Add the biggest float less than 0.5
    // and truncate, rounding down (because if the input is the biggest float
    // less than 0.5, adding 0.5 would undesirably round up to 1). Note that we
    // have to add the input to the temp register because we're not allowed to
    // modify the input register.
    addFloat32(src, temp);
    truncateFloat32ToInt32(temp, dest, fail);
    jump(&end);
  }

  // Input is negative, +0 or -0.
  bind(&negativeOrZero);
  {
    // Branch on negative input (reuses the flags from the comparison above).
    j(Assembler::NotEqual, &negative);

    // Fail on negative-zero.
    branchNegativeZeroFloat32(src, dest, fail);

    // Input is +0.
    xor32(dest, dest);
    jump(&end);
  }

  // Input is negative.
  bind(&negative);
  {
    // Inputs in [-0.5, 0) are rounded to -0. Fail.
    loadConstantFloat32(-0.5f, scratch);
    branchFloat(Assembler::DoubleGreaterThanOrEqual, src, scratch, fail);

    // Other negative inputs need the biggest float less than 0.5 added.
    //
    // The result is stored in the temp register (currently contains the biggest
    // float less than 0.5).
    addFloat32(src, temp);

    if (HasSSE41()) {
      // Round toward -Infinity.
      vroundss(X86Encoding::RoundDown, temp, scratch);

      // Truncate.
      truncateFloat32ToInt32(scratch, dest, fail);
    } else {
      // Round toward -Infinity without the benefit of ROUNDSS.

      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttss2si instead of truncateFloat32ToInt32 because we
      // want to perform failure handling ourselves.
      vcvttss2si(temp, dest);

      // Test whether the truncated float was integer-valued.
      convertInt32ToFloat32(dest, scratch);
      branchFloat(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttss2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }
  }

  bind(&end);
}
   2136 
// Compute round(src) (JS Math.round: round half toward +Infinity) into dest
// as an int32, branching to `fail` when the result is not representable as
// an int32 or is -0. `src` is left unmodified; `temp` is clobbered.
// Double-precision counterpart of roundFloat32ToInt32.
void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest,
                                        FloatRegister temp, Label* fail) {
  ScratchDoubleScope scratch(*this);

  Label negativeOrZero, negative, end;

  // Branch to a slow path for non-positive inputs. Doesn't catch NaN.
  zeroDouble(scratch);
  loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
  branchDouble(Assembler::DoubleLessThanOrEqual, src, scratch, &negativeOrZero);
  {
    // Input is strictly positive or NaN. Add the biggest double less than 0.5
    // and truncate, rounding down (because if the input is the biggest double
    // less than 0.5, adding 0.5 would undesirably round up to 1). Note that we
    // have to add the input to the temp register because we're not allowed to
    // modify the input register.
    addDouble(src, temp);
    truncateDoubleToInt32(temp, dest, fail);
    jump(&end);
  }

  // Input is negative, +0 or -0.
  bind(&negativeOrZero);
  {
    // Branch on negative input (reuses the flags from the comparison above).
    j(Assembler::NotEqual, &negative);

    // Fail on negative-zero.
    branchNegativeZero(src, dest, fail, /* maybeNonZero = */ false);

    // Input is +0
    xor32(dest, dest);
    jump(&end);
  }

  // Input is negative.
  bind(&negative);
  {
    // Inputs in [-0.5, 0) are rounded to -0. Fail.
    loadConstantDouble(-0.5, scratch);
    branchDouble(Assembler::DoubleGreaterThanOrEqual, src, scratch, fail);

    // Other negative inputs need the biggest double less than 0.5 added.
    //
    // The result is stored in the temp register (currently contains the biggest
    // double less than 0.5).
    addDouble(src, temp);

    if (HasSSE41()) {
      // Round toward -Infinity.
      vroundsd(X86Encoding::RoundDown, temp, scratch);

      // Truncate.
      truncateDoubleToInt32(scratch, dest, fail);
    } else {
      // Round toward -Infinity without the benefit of ROUNDSD.

      // Truncate and round toward zero.
      // This is off-by-one for everything but integer-valued inputs.
      //
      // Directly call vcvttsd2si instead of truncateDoubleToInt32 because we
      // want to perform failure handling ourselves.
      vcvttsd2si(temp, dest);

      // Test whether the truncated double was integer-valued.
      convertInt32ToDouble(dest, scratch);
      branchDouble(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

      // Input is not integer-valued, so we rounded off-by-one in the
      // wrong direction. Correct by subtraction.
      //
      // Overflows if vcvttsd2si returned the failure return value INT_MIN.
      branchSub32(Assembler::Overflow, Imm32(1), dest, fail);
    }
  }

  bind(&end);
}
   2215 
// Round src into dest according to `mode` with a single ROUNDSD. Only valid
// when HasRoundInstruction(mode) is true.
void MacroAssembler::nearbyIntDouble(RoundingMode mode, FloatRegister src,
                                     FloatRegister dest) {
  MOZ_ASSERT(HasRoundInstruction(mode));
  vroundsd(Assembler::ToX86RoundingMode(mode), src, dest);
}
   2221 
// Round src into dest according to `mode` with a single ROUNDSS. Only valid
// when HasRoundInstruction(mode) is true.
void MacroAssembler::nearbyIntFloat32(RoundingMode mode, FloatRegister src,
                                      FloatRegister dest) {
  MOZ_ASSERT(HasRoundInstruction(mode));
  vroundss(Assembler::ToX86RoundingMode(mode), src, dest);
}
   2227 
// output = copysign(lhs, rhs): the magnitude of lhs combined with the sign
// bit of rhs, computed as (lhs & ~signMask) | (rhs & signMask). The two AND
// results land in `output` and `scratch` (in an order chosen to tolerate rhs
// aliasing output) and are OR'd together at the end. lhs must differ from
// rhs when rhs aliases output.
void MacroAssembler::copySignDouble(FloatRegister lhs, FloatRegister rhs,
                                    FloatRegister output) {
  ScratchDoubleScope scratch(*this);

  // Bit masks over the double's sign bit (INT64_MIN = only the top bit set)
  // and its magnitude bits (INT64_MAX = all but the top bit).
  double keepSignMask = mozilla::BitwiseCast<double>(INT64_MIN);
  double clearSignMask = mozilla::BitwiseCast<double>(INT64_MAX);

  if (HasAVX()) {
    if (rhs == output) {
      MOZ_ASSERT(lhs != rhs);
      // Consume rhs (== output) first so lhs's masked bits survive in scratch.
      vandpdSimd128(SimdConstant::SplatX2(keepSignMask), rhs, output);
      vandpdSimd128(SimdConstant::SplatX2(clearSignMask), lhs, scratch);
    } else {
      vandpdSimd128(SimdConstant::SplatX2(clearSignMask), lhs, output);
      vandpdSimd128(SimdConstant::SplatX2(keepSignMask), rhs, scratch);
    }
  } else {
    if (rhs == output) {
      MOZ_ASSERT(lhs != rhs);
      loadConstantDouble(keepSignMask, scratch);
      vandpd(scratch, rhs, output);

      loadConstantDouble(clearSignMask, scratch);
      vandpd(lhs, scratch, scratch);
    } else {
      loadConstantDouble(clearSignMask, scratch);
      vandpd(scratch, lhs, output);

      loadConstantDouble(keepSignMask, scratch);
      vandpd(rhs, scratch, scratch);
    }
  }

  // Combine magnitude and sign halves.
  vorpd(scratch, output, output);
}
   2263 
   2264 void MacroAssembler::copySignFloat32(FloatRegister lhs, FloatRegister rhs,
   2265                                     FloatRegister output) {
   2266  ScratchFloat32Scope scratch(*this);
   2267 
   2268  float keepSignMask = mozilla::BitwiseCast<float>(INT32_MIN);
   2269  float clearSignMask = mozilla::BitwiseCast<float>(INT32_MAX);
   2270 
   2271  if (HasAVX()) {
   2272    if (rhs == output) {
   2273      MOZ_ASSERT(lhs != rhs);
   2274      vandpsSimd128(SimdConstant::SplatX4(keepSignMask), rhs, output);
   2275      vandpsSimd128(SimdConstant::SplatX4(clearSignMask), lhs, scratch);
   2276    } else {
   2277      vandpsSimd128(SimdConstant::SplatX4(clearSignMask), lhs, output);
   2278      vandpsSimd128(SimdConstant::SplatX4(keepSignMask), rhs, scratch);
   2279    }
   2280  } else {
   2281    if (rhs == output) {
   2282      MOZ_ASSERT(lhs != rhs);
   2283      loadConstantFloat32(keepSignMask, scratch);
   2284      vandps(scratch, output, output);
   2285 
   2286      loadConstantFloat32(clearSignMask, scratch);
   2287      vandps(lhs, scratch, scratch);
   2288    } else {
   2289      loadConstantFloat32(clearSignMask, scratch);
   2290      vandps(scratch, lhs, output);
   2291 
   2292      loadConstantFloat32(keepSignMask, scratch);
   2293      vandps(rhs, scratch, scratch);
   2294    }
   2295  }
   2296 
   2297  vorps(scratch, output, output);
   2298 }
   2299 
   2300 void MacroAssembler::shiftIndex32AndAdd(Register indexTemp32, int shift,
   2301                                        Register pointer) {
   2302  if (IsShiftInScaleRange(shift)) {
   2303    computeEffectiveAddress(
   2304        BaseIndex(pointer, indexTemp32, ShiftToScale(shift)), pointer);
   2305    return;
   2306  }
   2307  lshift32(Imm32(shift), indexTemp32);
   2308  addPtr(indexTemp32, pointer);
   2309 }
   2310 
   2311 CodeOffset MacroAssembler::wasmMarkedSlowCall(const wasm::CallSiteDesc& desc,
   2312                                              const Register reg) {
   2313  CodeOffset offset = call(desc, reg);
   2314  wasmMarkCallAsSlow();
   2315  return offset;
   2316 }
   2317 
   2318 //}}} check_macroassembler_style