tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

MacroAssembler-x86-shared-SIMD.cpp (57385B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "jit/MacroAssembler.h"
      8 #include "jit/x86-shared/MacroAssembler-x86-shared.h"
      9 
     10 #include "jit/MacroAssembler-inl.h"
     11 
     12 using namespace js;
     13 using namespace js::jit;
     14 
     15 using mozilla::DebugOnly;
     16 using mozilla::FloatingPoint;
     17 using mozilla::Maybe;
     18 using mozilla::SpecificNaN;
     19 
     20 void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
     21  ScratchSimd128Scope scratch(asMasm());
     22 
     23  vmovd(input, output);
     24  if (HasAVX2()) {
     25    vbroadcastb(Operand(output), output);
     26    return;
     27  }
     28  vpxor(scratch, scratch, scratch);
     29  vpshufb(scratch, output, output);
     30 }
     31 
     32 void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
     33  vmovd(input, output);
     34  if (HasAVX2()) {
     35    vbroadcastw(Operand(output), output);
     36    return;
     37  }
     38  vpshuflw(0, output, output);
     39  vpshufd(0, output, output);
     40 }
     41 
     42 void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
     43  vmovd(input, output);
     44  if (HasAVX2()) {
     45    vbroadcastd(Operand(output), output);
     46    return;
     47  }
     48  vpshufd(0, output, output);
     49 }
     50 
     51 void MacroAssemblerX86Shared::splatX4(FloatRegister input,
     52                                      FloatRegister output) {
     53  MOZ_ASSERT(input.isSingle() && output.isSimd128());
     54  if (HasAVX2()) {
     55    vbroadcastss(Operand(input), output);
     56    return;
     57  }
     58  input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output);
     59  vshufps(0, input, input, output);
     60 }
     61 
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
                                      FloatRegister output) {
  // Broadcast the scalar double in `input` to both 64-bit lanes of `output`;
  // movddup duplicates the low quadword in a single instruction.
  MOZ_ASSERT(input.isDouble() && output.isSimd128());
  vmovddup(Operand(input.asSimd128()), output);
}
     67 
     68 void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
     69                                                 Register output,
     70                                                 unsigned lane) {
     71  if (lane == 0) {
     72    // The value we want to extract is in the low double-word
     73    moveLowInt32(input, output);
     74  } else {
     75    vpextrd(lane, input, output);
     76  }
     77 }
     78 
void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  // Extract float32 `lane` of `input` into the scalar single `output`.
  MOZ_ASSERT(input.isSimd128() && output.isSingle());
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    if (input.asSingle() != output) {
      moveFloat32(input, output);
    }
  } else if (lane == 2) {
    // Lane 2 is the low lane of the high 64-bit half; use the dedicated
    // high-pair-to-low-pair move.
    moveHighPairToLowPairFloat32(input, output);
  } else {
    // General case: shuffle the requested lane down into lane 0 of output's
    // full vector alias.
    uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
    FloatRegister dest = output.asSimd128();
    input = moveSimd128FloatIfNotAVX(input, dest);
    vshufps(mask, input, input, dest);
  }
}
     97 
void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  // Extract float64 `lane` (0 or 1) of `input` into the scalar double
  // `output`.
  MOZ_ASSERT(input.isSimd128() && output.isDouble());
  if (lane == 0) {
    // The value we want to extract is in the low quadword
    if (input.asDouble() != output) {
      moveDouble(input, output);
    }
  } else {
    // palignr by 8 bytes shifts input's high quadword down into output's low
    // quadword.
    vpalignr(Operand(input), output, output, 8);
  }
}
    111 
    112 void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
    113                                                 Register output, unsigned lane,
    114                                                 SimdSign sign) {
    115  vpextrw(lane, input, Operand(output));
    116  if (sign == SimdSign::Signed) {
    117    movswl(output, output);
    118  }
    119 }
    120 
void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
                                                 Register output, unsigned lane,
                                                 SimdSign sign) {
  // pextrb zero-extends the byte lane into the GPR.
  vpextrb(lane, input, Operand(output));
  if (sign == SimdSign::Signed) {
    if (!AllocatableGeneralRegisterSet(Registers::SingleByteRegs).has(output)) {
      // `output` has no byte subregister (possible on x86-32 — TODO confirm),
      // so bounce the value through eax to sign-extend it, then swap back.
      xchgl(eax, output);
      movsbl(eax, eax);
      xchgl(eax, output);
    } else {
      movsbl(output, output);
    }
  }
}
    135 
void MacroAssemblerX86Shared::replaceLaneFloat32x4(unsigned lane,
                                                   FloatRegister lhs,
                                                   FloatRegister rhs,
                                                   FloatRegister dest) {
  // dest = lhs with float32 `lane` replaced by the scalar single `rhs`.
  MOZ_ASSERT(lhs.isSimd128() && rhs.isSingle());

  if (lane == 0) {
    if (rhs.asSimd128() == lhs) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
      moveSimd128Float(lhs, dest);
    } else {
      // move low dword of value into low dword of output
      vmovss(rhs, lhs, dest);
    }
  } else {
    // insertps can place the scalar's lane 0 directly into any dest lane.
    vinsertps(vinsertpsMask(0, lane), rhs, lhs, dest);
  }
}
    155 
void MacroAssemblerX86Shared::replaceLaneFloat64x2(unsigned lane,
                                                   FloatRegister lhs,
                                                   FloatRegister rhs,
                                                   FloatRegister dest) {
  // dest = lhs with float64 `lane` (0 or 1) replaced by the scalar double
  // `rhs`.
  MOZ_ASSERT(lhs.isSimd128() && rhs.isDouble());

  if (lane == 0) {
    if (rhs.asSimd128() == lhs) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
      moveSimd128Float(lhs, dest);
    } else {
      // move low qword of value into low qword of output
      vmovsd(rhs, lhs, dest);
    }
  } else {
    // move low qword of value into high qword of output
    vshufpd(0, rhs, lhs, dest);
  }
}
    176 
void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           FloatRegister temp,
                                           const uint8_t lanes[16]) {
  // Per-byte select: output byte i comes from rhs when the selector byte's
  // high bit is set, else from lhs.  The lane mask is materialized into
  // `temp` and used as the pblendvb control.
  // NOTE(review): vpblendvb keys off each control byte's top bit, so callers
  // presumably pass 0x00/0xFF lane values — confirm at call sites.
  asMasm().loadConstantSimd128Int(
      SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
  vpblendvb(temp, rhs, lhs, output);
}
    185 
    186 void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
    187                                           FloatRegister output,
    188                                           const uint16_t lanes[8]) {
    189  uint32_t mask = 0;
    190  for (unsigned i = 0; i < 8; i++) {
    191    if (lanes[i]) {
    192      mask |= (1 << i);
    193    }
    194  }
    195  vpblendw(mask, rhs, lhs, output);
    196 }
    197 
void MacroAssemblerX86Shared::laneSelectSimd128(FloatRegister mask,
                                                FloatRegister lhs,
                                                FloatRegister rhs,
                                                FloatRegister output) {
  // Select lanes from lhs/rhs into output according to `mask` using a
  // byte-wise variable blend (pblendvb keys off each mask byte's high bit).
  vpblendvb(mask, lhs, rhs, output);
}
    204 
void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
                                             FloatRegister rhs,
                                             FloatRegister output,
                                             const uint8_t lanes[16]) {
  // General two-vector byte shuffle: result byte i is lhs[lanes[i]] when
  // lanes[i] < 16, otherwise rhs[lanes[i] - 16].
  ScratchSimd128Scope scratch(asMasm());

  // Use pshufb instructions to gather the lanes from each source vector.
  // A negative index creates a zero lane, so the two vectors can be combined.

  // Set scratch = lanes from rhs.
  int8_t idx[16];
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
  }
  rhs = moveSimd128IntIfNotAVX(rhs, scratch);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), rhs, scratch);

  // Set output = lanes from lhs.
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] < 16 ? lanes[i] : -1;
  }
  lhs = moveSimd128IntIfNotAVX(lhs, output);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), lhs, output);

  // Combine.
  vpor(scratch, output, output);
}
    232 
// Reinterpret an FPREG operand as a Simd128-typed FloatRegister.
static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
  return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
}
    236 
void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  // Lanewise 8-bit compare producing a per-lane all-ones/all-zeroes mask.
  // SSE only provides signed pcmpeqb/pcmpgtb, so the other conditions are
  // synthesized:
  //  - LessThan / GreaterThanOrEqual: swap operands (loading rhs into output,
  //    rescuing lhs to scratch if it aliases output) and use pcmpgtb;
  //    GreaterThanOrEqual then complements the result.
  //  - NotEqual / LessThanOrEqual: complement of Equal / GreaterThan.
  //  - Unsigned orderings (Above etc.): unsigned min/max followed by an
  //    equality compare, complemented where needed.
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtb(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqb(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      // lhs < rhs is computed as rhs > lhs with operands exchanged.
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtb(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqb(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      // lhs >= rhs == !(rhs > lhs): compute the swapped gt, then complement.
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtb(Operand(lhs), output, output);
    }
      // NOTE(review): the scratch scope closes before the complement —
      // presumably because bitwiseNotSimd128 may need the scratch; confirm.
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtb(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      // Unsigned >: !(min(lhs, rhs) == lhs), or !(max(lhs, rhs) == rhs) when
      // rhs does not alias output.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      // Unsigned <=: min(lhs, rhs) == lhs (equivalently max(lhs, rhs) == rhs).
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      break;
    case Assembler::Below:
      // Unsigned <: !(max(lhs, rhs) == lhs).
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpminub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      // Unsigned >=: max(lhs, rhs) == lhs.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpminub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    327 
    328 void MacroAssemblerX86Shared::compareInt8x16(Assembler::Condition cond,
    329                                             FloatRegister lhs,
    330                                             const SimdConstant& rhs,
    331                                             FloatRegister dest) {
    332  bool complement = false;
    333  switch (cond) {
    334    case Assembler::Condition::NotEqual:
    335      complement = true;
    336      [[fallthrough]];
    337    case Assembler::Condition::Equal:
    338      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqb,
    339                    &MacroAssembler::vpcmpeqbSimd128);
    340      break;
    341    case Assembler::Condition::LessThanOrEqual:
    342      complement = true;
    343      [[fallthrough]];
    344    case Assembler::Condition::GreaterThan:
    345      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtb,
    346                    &MacroAssembler::vpcmpgtbSimd128);
    347      break;
    348    default:
    349      MOZ_CRASH("unexpected condition op");
    350  }
    351  if (complement) {
    352    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
    353  }
    354 }
    355 
void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  // Lanewise 16-bit compare producing a per-lane all-ones/all-zeroes mask.
  // Same synthesis strategy as the 8-bit variant: signed lt/ge via operand
  // swap, ne/le via complement, unsigned orderings via min/max + eq.
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtw(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqw(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      // lhs < rhs is computed as rhs > lhs with operands exchanged.
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtw(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqw(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      // lhs >= rhs == !(rhs > lhs): swapped gt, then complement below.
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtw(Operand(lhs), output, output);
    }
      // NOTE(review): scratch scope intentionally closed before the
      // complement — presumably bitwiseNotSimd128 may need it; confirm.
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtw(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      // Unsigned >: complement of the unsigned <= computation below.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      // Unsigned <=: min(lhs, rhs) == lhs (equivalently max(lhs, rhs) == rhs).
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      break;
    case Assembler::Below:
      // Unsigned <: !(max(lhs, rhs) == lhs).
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      // Unsigned >=: max(lhs, rhs) == lhs.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    446 
    447 void MacroAssemblerX86Shared::compareInt16x8(Assembler::Condition cond,
    448                                             FloatRegister lhs,
    449                                             const SimdConstant& rhs,
    450                                             FloatRegister dest) {
    451  bool complement = false;
    452  switch (cond) {
    453    case Assembler::Condition::NotEqual:
    454      complement = true;
    455      [[fallthrough]];
    456    case Assembler::Condition::Equal:
    457      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqw,
    458                    &MacroAssembler::vpcmpeqwSimd128);
    459      break;
    460    case Assembler::Condition::LessThanOrEqual:
    461      complement = true;
    462      [[fallthrough]];
    463    case Assembler::Condition::GreaterThan:
    464      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtw,
    465                    &MacroAssembler::vpcmpgtwSimd128);
    466      break;
    467    default:
    468      MOZ_CRASH("unexpected condition op");
    469  }
    470  if (complement) {
    471    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
    472  }
    473 }
    474 
void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  // Lanewise 32-bit compare producing a per-lane all-ones/all-zeroes mask.
  // Same synthesis strategy as the 8/16-bit variants: signed lt/ge via
  // operand swap, ne/le via complement, unsigned orderings via min/max + eq.
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtd(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      // lhs < rhs is computed as rhs > lhs with operands exchanged.
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtd(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqd(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      ScratchSimd128Scope scratch(asMasm());
      // lhs >= rhs == !(rhs > lhs): swapped gt, then complement below.
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtd(Operand(lhs), output, output);
    }
      // NOTE(review): scratch scope intentionally closed before the
      // complement — presumably bitwiseNotSimd128 may need it; confirm.
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtd(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      // Unsigned >: complement of the unsigned <= computation below.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      // Unsigned <=: min(lhs, rhs) == lhs (equivalently max(lhs, rhs) == rhs).
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      break;
    case Assembler::Below:
      // Unsigned <: !(max(lhs, rhs) == lhs).
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpminud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      // Unsigned >=: max(lhs, rhs) == lhs.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpminud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    565 
    566 void MacroAssemblerX86Shared::compareInt32x4(Assembler::Condition cond,
    567                                             FloatRegister lhs,
    568                                             const SimdConstant& rhs,
    569                                             FloatRegister dest) {
    570  bool complement = false;
    571  switch (cond) {
    572    case Assembler::Condition::NotEqual:
    573      complement = true;
    574      [[fallthrough]];
    575    case Assembler::Condition::Equal:
    576      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqd,
    577                    &MacroAssembler::vpcmpeqdSimd128);
    578      break;
    579    case Assembler::Condition::LessThanOrEqual:
    580      complement = true;
    581      [[fallthrough]];
    582    case Assembler::Condition::GreaterThan:
    583      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtd,
    584                    &MacroAssembler::vpcmpgtdSimd128);
    585      break;
    586    default:
    587      MOZ_CRASH("unexpected condition op");
    588  }
    589  if (complement) {
    590    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
    591  }
    592 }
    593 
    594 void MacroAssemblerX86Shared::compareForEqualityInt64x2(
    595    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    596    FloatRegister output) {
    597  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
    598  switch (cond) {
    599    case Assembler::Condition::Equal:
    600      vpcmpeqq(rhs, lhs, output);
    601      break;
    602    case Assembler::Condition::NotEqual:
    603      vpcmpeqq(rhs, lhs, output);
    604      asMasm().bitwiseXorSimd128(output, allOnes, output);
    605      break;
    606    default:
    607      MOZ_CRASH("unexpected condition op");
    608  }
    609 }
    610 
void MacroAssemblerX86Shared::compareForOrderingInt64x2(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister temp1, FloatRegister temp2, FloatRegister output) {
  // Signed 64-bit lane ordering without SSE4.2's pcmpgtq, emulated with
  // 32-bit compares/subtracts; temp1/temp2 are clobbered.
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  // The pseudo code is for (e.g. > comparison):
  //  __m128i pcmpgtq_sse2 (__m128i a, __m128i b) {
  //    __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
  //    r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
  //    return _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1));
  //  }
  // Credits to https://stackoverflow.com/a/65175746
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      // Direct translation of the pseudo code with a = lhs, b = rhs.
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpsubq(Operand(lhs), temp1, temp1);
      vpcmpeqd(rhs, temp2, temp2);
      vandpd(temp2, temp1, temp1);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpcmpgtd(rhs, lhs, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      break;
    case Assembler::Condition::LessThan:
      // Same algorithm with the operand roles exchanged (a = rhs, b = lhs).
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpcmpgtd(Operand(lhs), temp1, temp1);
      vpcmpeqd(Operand(rhs), temp2, temp2);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpsubq(rhs, lhs, output);
      vandpd(temp2, output, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      // lhs >= rhs == !(lhs < rhs): LessThan sequence plus a final invert.
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpcmpgtd(Operand(lhs), temp1, temp1);
      vpcmpeqd(Operand(rhs), temp2, temp2);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpsubq(rhs, lhs, output);
      vandpd(temp2, output, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs == !(lhs > rhs): GreaterThan sequence plus a final invert.
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpsubq(Operand(lhs), temp1, temp1);
      vpcmpeqd(rhs, temp2, temp2);
      vandpd(temp2, temp1, temp1);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpcmpgtd(rhs, lhs, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    673 
    674 void MacroAssemblerX86Shared::compareForOrderingInt64x2AVX(
    675    FloatRegister lhs, FloatRegister rhs, Assembler::Condition cond,
    676    FloatRegister output) {
    677  MOZ_ASSERT(HasSSE42());
    678  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
    679  switch (cond) {
    680    case Assembler::Condition::GreaterThan:
    681      vpcmpgtq(Operand(rhs), lhs, output);
    682      break;
    683    case Assembler::Condition::LessThan:
    684      vpcmpgtq(Operand(lhs), rhs, output);
    685      break;
    686    case Assembler::Condition::GreaterThanOrEqual:
    687      vpcmpgtq(Operand(lhs), rhs, output);
    688      asMasm().bitwiseXorSimd128(output, allOnes, output);
    689      break;
    690    case Assembler::Condition::LessThanOrEqual:
    691      vpcmpgtq(Operand(rhs), lhs, output);
    692      asMasm().bitwiseXorSimd128(output, allOnes, output);
    693      break;
    694    default:
    695      MOZ_CRASH("unexpected condition op");
    696  }
    697 }
    698 
void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  // Lanewise float32 compare producing per-lane all-ones/all-zeroes masks.
  // Only eq/lt/le/ne are emitted here; gt/ge must have been rewritten as
  // reversed lt/le by the caller (see the crash cases below).
  // TODO Can do better here with three-address compares

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!HasAVX() && !lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovaps(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovaps(lhs, output);
    lhs = output;
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmpleps(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqps(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have to
      // copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    739 
void MacroAssemblerX86Shared::compareFloat32x4(Assembler::Condition cond,
                                               FloatRegister lhs,
                                               const SimdConstant& rhs,
                                               FloatRegister dest) {
  // Constant-rhs float32 lane compare.  Only eq/lt/le/ne are accepted;
  // gt/ge are expected to be reversed by the caller before reaching here.
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqps,
                    &MacroAssembler::vcmpeqpsSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltps,
                    &MacroAssembler::vcmpltpsSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpleps,
                    &MacroAssembler::vcmplepsSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqps,
                    &MacroAssembler::vcmpneqpsSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    765 
// Compare f64x2 lanes of lhs and rhs per `cond`, leaving per-lane
// all-ones/all-zeros masks in output.
void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  // TODO Can do better here with three-address compares

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!HasAVX() && !lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      // rhs lives in output, so save it in scratch before the lhs move
      // below clobbers it.
      vmovapd(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovapd(lhs, output);
    lhs = output;
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmplepd(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have to
      // copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}
    806 
    807 void MacroAssemblerX86Shared::compareFloat64x2(Assembler::Condition cond,
    808                                               FloatRegister lhs,
    809                                               const SimdConstant& rhs,
    810                                               FloatRegister dest) {
    811  switch (cond) {
    812    case Assembler::Condition::Equal:
    813      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqpd,
    814                    &MacroAssembler::vcmpeqpdSimd128);
    815      break;
    816    case Assembler::Condition::LessThan:
    817      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltpd,
    818                    &MacroAssembler::vcmpltpdSimd128);
    819      break;
    820    case Assembler::Condition::LessThanOrEqual:
    821      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmplepd,
    822                    &MacroAssembler::vcmplepdSimd128);
    823      break;
    824    case Assembler::Condition::NotEqual:
    825      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqpd,
    826                    &MacroAssembler::vcmpneqpdSimd128);
    827      break;
    828    default:
    829      MOZ_CRASH("unexpected condition op");
    830  }
    831 }
    832 
    833 // Semantics of wasm max and min.
    834 //
    835 //  * -0 < 0
    836 //  * If one input is NaN then that NaN is the output
    837 //  * If both inputs are NaN then the output is selected nondeterministically
    838 //  * Any returned NaN is always made quiet
    839 //  * The MVP spec 2.2.3 says "No distinction is made between signalling and
    840 //    quiet NaNs", suggesting SNaN inputs are allowed and should not fault
    841 //
    842 // Semantics of maxps/minps/maxpd/minpd:
    843 //
    844 //  * If the values are both +/-0 the rhs is returned
    845 //  * If the rhs is SNaN then the rhs is returned
    846 //  * If either value is NaN then the rhs is returned
    847 //  * An SNaN operand does not appear to give rise to an exception, at least
    848 //    not in the JS shell on Linux, though the Intel spec lists Invalid
    849 //    as one of the possible exceptions
    850 
    851 // Various unaddressed considerations:
    852 //
    853 // It's pretty insane for this to take an Operand rhs - it really needs to be
    854 // a register, given the number of times we access it.
    855 //
    856 // Constant load can be folded into the ANDPS.  Do we care?  It won't save us
    857 // any registers, since output/temp1/temp2/scratch are all live at the same time
    858 // after the first instruction of the slow path.
    859 //
    860 // Can we use blend for the NaN extraction/insertion?  We'd need xmm0 for the
    861 // mask, which is no fun.  But it would be lhs UNORD lhs -> mask, blend;
    862 // rhs UNORD rhs -> mask; blend.  Better than the mess we have below.  But
    863 // we'd still need to setup the QNaN bits, unless we can blend those too
    864 // with the lhs UNORD rhs mask?
    865 //
    866 // If we could determine that both input lanes are NaN then the result of the
    867 // fast path should be fine modulo the QNaN bits, but it's not obvious this is
    868 // much of an advantage.
    869 
// Non-AVX implementation of wasm f32x4.min/max (see the semantics block
// comment above).  Computes min/max(lhs, rhs) into `output` using temp1,
// temp2 and the scratch register; the slow path below the branch is taken
// only when some lane is NaN.
void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs,
                                              Operand rhs, FloatRegister temp1,
                                              FloatRegister temp2,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  // The quiet bit of a single-precision NaN, per lane.
  SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));

  /* clang-format off */ /* leave my comments alone */
  lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output);
  if (isMin) {
    vmovaps(lhs, output);                    // compute
    vminps(rhs, output, output);             //   min lhs, rhs
    vmovaps(rhs, temp1);                     // compute
    vminps(Operand(lhs), temp1, temp1);      //   min rhs, lhs
    vorps(temp1, output, output);            // fix min(-0, 0) with OR
  } else {
    vmovaps(lhs, output);                    // compute
    vmaxps(rhs, output, output);             //   max lhs, rhs
    vmovaps(rhs, temp1);                     // compute
    vmaxps(Operand(lhs), temp1, temp1);      //   max rhs, lhs
    vandps(temp1, output, output);           // fix max(-0, 0) with AND
  }
  vmovaps(lhs, temp1);                       // compute
  vcmpunordps(rhs, temp1, temp1);            //   lhs UNORD rhs
  vptest(temp1, temp1);                      // check if any unordered
  j(Assembler::Equal, &l);                   //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.

  vmovaps(temp1, temp2);                     // clear NaN lanes of result
  vpandn(output, temp2, temp2);              //   result now in temp2
  asMasm().vpandSimd128(quietBits, temp1, temp1);   // setup QNaN bits in NaN lanes
  vorps(temp1, temp2, temp2);                //   and OR into result
  vmovaps(lhs, temp1);                       // find NaN lanes
  vcmpunordps(Operand(temp1), temp1, temp1); //   in lhs
  vmovaps(temp1, output);                    //     (and save them for later)
  vandps(lhs, temp1, temp1);                 //       and extract the NaNs
  vorps(temp1, temp2, temp2);                //         and add to the result
  vmovaps(rhs, temp1);                       // find NaN lanes
  vcmpunordps(Operand(temp1), temp1, temp1); //   in rhs
  vpandn(temp1, output, output);             //     except if they were in lhs
  vandps(rhs, output, output);               //       and extract the NaNs
  vorps(temp2, output, output);              //         and add to the result

  bind(&l);
  /* clang-format on */
}
    921 
// AVX implementation of wasm f32x4.min/max (see the semantics block comment
// above).  Three-address forms avoid the copies of the non-AVX variant, and
// blends replace the and/andn mask dance on the NaN slow path.
void MacroAssemblerX86Shared::minMaxFloat32x4AVX(bool isMin, FloatRegister lhs,
                                                 FloatRegister rhs,
                                                 FloatRegister temp1,
                                                 FloatRegister temp2,
                                                 FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  // The quiet bit of a single-precision NaN, per lane.
  SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000)));

  /* clang-format off */ /* leave my comments alone */
  FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output);
  // Allow rhs to be assigned to scratch when rhs == lhs and == output --
  // don't make a special case since the semantics require setup QNaN bits.
  FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output);
  if (isMin) {
    vminps(Operand(rhs), lhs, temp2);             // min lhs, rhs
    vminps(Operand(lhs), rhs, temp1);             // min rhs, lhs
    vorps(temp1, temp2, output);                  // fix min(-0, 0) with OR
  } else {
    vmaxps(Operand(rhs), lhs, temp2);             // max lhs, rhs
    vmaxps(Operand(lhs), rhs, temp1);             // max rhs, lhs
    vandps(temp1, temp2, output);                 // fix max(-0, 0) with AND
  }
  vcmpunordps(Operand(rhsCopy), lhsCopy, temp1);  // lhs UNORD rhs
  vptest(temp1, temp1);                           // check if any unordered
  j(Assembler::Equal, &l);                        //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.
  vcmpunordps(Operand(lhsCopy), lhsCopy, temp2);  // find NaN lanes in lhs
  vblendvps(temp2, lhsCopy, rhsCopy, temp2);      //   add other lanes from rhs
  asMasm().vporSimd128(quietBits, temp2, temp2);  // setup QNaN bits in NaN lanes
  vblendvps(temp1, temp2, output, output);        // replace NaN lanes from temp2

  bind(&l);
  /* clang-format on */
}
    961 
// Exactly as minMaxFloat32x4 above, but for two f64 lanes: non-AVX
// implementation of wasm f64x2.min/max.
void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs,
                                              Operand rhs, FloatRegister temp1,
                                              FloatRegister temp2,
                                              FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  // The quiet bit of a double-precision NaN, per lane.
  SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));

  /* clang-format off */ /* leave my comments alone */
  lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output);
  if (isMin) {
    vmovapd(lhs, output);                    // compute
    vminpd(rhs, output, output);             //   min lhs, rhs
    vmovapd(rhs, temp1);                     // compute
    vminpd(Operand(lhs), temp1, temp1);      //   min rhs, lhs
    vorpd(temp1, output, output);            // fix min(-0, 0) with OR
  } else {
    vmovapd(lhs, output);                    // compute
    vmaxpd(rhs, output, output);             //   max lhs, rhs
    vmovapd(rhs, temp1);                     // compute
    vmaxpd(Operand(lhs), temp1, temp1);      //   max rhs, lhs
    vandpd(temp1, output, output);           // fix max(-0, 0) with AND
  }
  vmovapd(lhs, temp1);                       // compute
  vcmpunordpd(rhs, temp1, temp1);                   //   lhs UNORD rhs
  vptest(temp1, temp1);                      // check if any unordered
  j(Assembler::Equal, &l);                   //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.

  vmovapd(temp1, temp2);                     // clear NaN lanes of result
  vpandn(output, temp2, temp2);              //   result now in temp2
  asMasm().vpandSimd128(quietBits, temp1, temp1);   // setup QNaN bits in NaN lanes
  vorpd(temp1, temp2, temp2);                //   and OR into result
  vmovapd(lhs, temp1);                       // find NaN lanes
  vcmpunordpd(Operand(temp1), temp1, temp1);        //   in lhs
  vmovapd(temp1, output);                    //     (and save them for later)
  vandpd(lhs, temp1, temp1);                 //       and extract the NaNs
  vorpd(temp1, temp2, temp2);                //         and add to the result
  vmovapd(rhs, temp1);                       // find NaN lanes
  vcmpunordpd(Operand(temp1), temp1, temp1);        //   in rhs
  vpandn(temp1, output, output);             //     except if they were in lhs
  vandpd(rhs, output, output);               //       and extract the NaNs
  vorpd(temp2, output, output);              //         and add to the result

  bind(&l);
  /* clang-format on */
}
   1014 
// AVX implementation of wasm f64x2.min/max; the double-precision counterpart
// of minMaxFloat32x4AVX above.
void MacroAssemblerX86Shared::minMaxFloat64x2AVX(bool isMin, FloatRegister lhs,
                                                 FloatRegister rhs,
                                                 FloatRegister temp1,
                                                 FloatRegister temp2,
                                                 FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());
  Label l;
  // The quiet bit of a double-precision NaN, per lane.
  SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull)));

  /* clang-format off */ /* leave my comments alone */
  FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output);
  // Allow rhs to be assigned to scratch when rhs == lhs and == output --
  // don't make a special case since the semantics require setup QNaN bits.
  FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output);
  if (isMin) {
    vminpd(Operand(rhs), lhs, temp2);             // min lhs, rhs
    vminpd(Operand(lhs), rhs, temp1);             // min rhs, lhs
    vorpd(temp1, temp2, output);                  // fix min(-0, 0) with OR
  } else {
    vmaxpd(Operand(rhs), lhs, temp2);             // max lhs, rhs
    vmaxpd(Operand(lhs), rhs, temp1);             // max rhs, lhs
    vandpd(temp1, temp2, output);                 // fix max(-0, 0) with AND
  }
  vcmpunordpd(Operand(rhsCopy), lhsCopy, temp1);  // lhs UNORD rhs
  vptest(temp1, temp1);                           // check if any unordered
  j(Assembler::Equal, &l);                        //   and exit if not

  // Slow path.
  // output has result for non-NaN lanes, garbage in NaN lanes.
  // temp1 has lhs UNORD rhs.
  // temp2 is dead.
  vcmpunordpd(Operand(lhsCopy), lhsCopy, temp2);  // find NaN lanes in lhs
  vblendvpd(temp2, lhsCopy, rhsCopy, temp2);      //   add other lanes from rhs
  asMasm().vporSimd128(quietBits, temp2, temp2);  // setup QNaN bits in NaN lanes
  vblendvpd(temp1, temp2, output, output);        // replace NaN lanes from temp2

  bind(&l);
  /* clang-format on */
}
   1054 
   1055 void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, FloatRegister rhs,
   1056                                           FloatRegister temp1,
   1057                                           FloatRegister temp2,
   1058                                           FloatRegister output) {
   1059  if (HasAVX()) {
   1060    minMaxFloat32x4AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
   1061    return;
   1062  }
   1063  minMaxFloat32x4(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output);
   1064 }
   1065 
   1066 void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, FloatRegister rhs,
   1067                                           FloatRegister temp1,
   1068                                           FloatRegister temp2,
   1069                                           FloatRegister output) {
   1070  if (HasAVX()) {
   1071    minMaxFloat32x4AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
   1072    return;
   1073  }
   1074  minMaxFloat32x4(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output);
   1075 }
   1076 
   1077 void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, FloatRegister rhs,
   1078                                           FloatRegister temp1,
   1079                                           FloatRegister temp2,
   1080                                           FloatRegister output) {
   1081  if (HasAVX()) {
   1082    minMaxFloat64x2AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output);
   1083    return;
   1084  }
   1085  minMaxFloat64x2(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output);
   1086 }
   1087 
   1088 void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, FloatRegister rhs,
   1089                                           FloatRegister temp1,
   1090                                           FloatRegister temp2,
   1091                                           FloatRegister output) {
   1092  if (HasAVX()) {
   1093    minMaxFloat64x2AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
   1094    return;
   1095  }
   1096  minMaxFloat64x2(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output);
   1097 }
   1098 
// Common code for i8x16 shifts by a variable scalar count.  x86 has no
// byte-lane shifts, so each half of the input is widened to 16-bit lanes
// (via *extend), shifted with a word shift (*shift), masked back down to
// byte range, and re-packed.
// NOTE(review): the low half is read from `dest`, so callers apparently
// arrange for the input to already be in `dest` -- confirm at call sites.
void MacroAssemblerX86Shared::packedShiftByScalarInt8x16(
    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest,
    void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister,
                                           FloatRegister),
    void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) {
  ScratchSimd128Scope scratch(asMasm());
  // The word shifts take their count from an xmm register.
  vmovd(count, scratch);

  // High bytes: move them to the low half of xtmp, widen, shift.
  vpalignr(Operand(in), xtmp, xtmp, 8);
  (this->*extend)(Operand(xtmp), xtmp);
  (this->*shift)(scratch, xtmp, xtmp);

  // Low bytes: widen and shift in place.
  (this->*extend)(Operand(dest), dest);
  (this->*shift)(scratch, dest, dest);

  // Mask off garbage to avoid saturation during packing
  asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)),
                                  scratch);
  vpand(Operand(scratch), xtmp, xtmp);
  vpand(Operand(scratch), dest, dest);

  // Re-pack the two widened halves into 16 byte lanes.
  vpackuswb(Operand(xtmp), dest, dest);
}
   1124 
   1125 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
   1126    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
   1127  packedShiftByScalarInt8x16(in, count, xtmp, dest,
   1128                             &MacroAssemblerX86Shared::vpsllw,
   1129                             &MacroAssemblerX86Shared::vpmovzxbw);
   1130 }
   1131 
   1132 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
   1133    Imm32 count, FloatRegister src, FloatRegister dest) {
   1134  MOZ_ASSERT(count.value <= 7);
   1135  if (MOZ_UNLIKELY(count.value == 0)) {
   1136    moveSimd128Int(src, dest);
   1137    return;
   1138  }
   1139  src = asMasm().moveSimd128IntIfNotAVX(src, dest);
   1140  // Use the doubling trick for low shift counts, otherwise mask off the bits
   1141  // that are shifted out of the low byte of each word and use word shifts.  The
   1142  // optimal cutoff remains to be explored.
   1143  if (count.value <= 3) {
   1144    vpaddb(Operand(src), src, dest);
   1145    for (int32_t shift = count.value - 1; shift > 0; --shift) {
   1146      vpaddb(Operand(dest), dest, dest);
   1147    }
   1148  } else {
   1149    asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value),
   1150                               dest);
   1151    vpsllw(count, dest, dest);
   1152  }
   1153 }
   1154 
   1155 void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
   1156    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
   1157  packedShiftByScalarInt8x16(in, count, xtmp, dest,
   1158                             &MacroAssemblerX86Shared::vpsraw,
   1159                             &MacroAssemblerX86Shared::vpmovsxbw);
   1160 }
   1161 
// i8x16.shr_s by a constant count in [0, 7].
void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  MOZ_ASSERT(count.value <= 7);
  ScratchSimd128Scope scratch(asMasm());

  // Unpack the bytes of src into the high byte of each 16-bit lane.  The low
  // byte of each lane is whatever happened to be in scratch/dest, but that
  // garbage is shifted away below because the shift count is biased by 8.
  vpunpckhbw(src, scratch, scratch);
  vpunpcklbw(src, dest, dest);
  // Shifting by count+8 performs the byte shift and sign-extends the result
  // over the garbage low byte in one step.
  vpsraw(Imm32(count.value + 8), scratch, scratch);
  vpsraw(Imm32(count.value + 8), dest, dest);
  // Each word now fits in [-128, 127], so the signed-saturating pack is
  // value-preserving.
  vpacksswb(Operand(scratch), dest, dest);
}
   1173 
   1174 void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
   1175    FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) {
   1176  packedShiftByScalarInt8x16(in, count, xtmp, dest,
   1177                             &MacroAssemblerX86Shared::vpsrlw,
   1178                             &MacroAssemblerX86Shared::vpmovzxbw);
   1179 }
   1180 
   1181 void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
   1182    Imm32 count, FloatRegister src, FloatRegister dest) {
   1183  MOZ_ASSERT(count.value <= 7);
   1184  src = asMasm().moveSimd128IntIfNotAVX(src, dest);
   1185  asMasm().bitwiseAndSimd128(
   1186      src, SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest);
   1187  vpsrlw(count, dest, dest);
   1188 }
   1189 
   1190 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8(
   1191    FloatRegister in, Register count, FloatRegister dest) {
   1192  ScratchSimd128Scope scratch(asMasm());
   1193  vmovd(count, scratch);
   1194  vpsllw(scratch, in, dest);
   1195 }
   1196 
   1197 void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8(
   1198    FloatRegister in, Register count, FloatRegister dest) {
   1199  ScratchSimd128Scope scratch(asMasm());
   1200  vmovd(count, scratch);
   1201  vpsraw(scratch, in, dest);
   1202 }
   1203 
   1204 void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8(
   1205    FloatRegister in, Register count, FloatRegister dest) {
   1206  ScratchSimd128Scope scratch(asMasm());
   1207  vmovd(count, scratch);
   1208  vpsrlw(scratch, in, dest);
   1209 }
   1210 
   1211 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4(
   1212    FloatRegister in, Register count, FloatRegister dest) {
   1213  ScratchSimd128Scope scratch(asMasm());
   1214  vmovd(count, scratch);
   1215  vpslld(scratch, in, dest);
   1216 }
   1217 
   1218 void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4(
   1219    FloatRegister in, Register count, FloatRegister dest) {
   1220  ScratchSimd128Scope scratch(asMasm());
   1221  vmovd(count, scratch);
   1222  vpsrad(scratch, in, dest);
   1223 }
   1224 
   1225 void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4(
   1226    FloatRegister in, Register count, FloatRegister dest) {
   1227  ScratchSimd128Scope scratch(asMasm());
   1228  vmovd(count, scratch);
   1229  vpsrld(scratch, in, dest);
   1230 }
   1231 
   1232 void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2(
   1233    FloatRegister in, Register count, FloatRegister dest) {
   1234  ScratchSimd128Scope scratch(asMasm());
   1235  vmovd(count, scratch);
   1236  vpsllq(scratch, in, dest);
   1237 }
   1238 
// i64x2.shr_s by a variable count.  Pre-AVX-512 x86 has no packed 64-bit
// arithmetic right shift, so emulate it via x >> n == ~(~x >> n) for
// negative x: XOR with the per-lane sign mask around a logical shift.
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  vmovd(count, temp);
  // scratch = each lane's sign bit replicated across the lane (0 or ~0).
  asMasm().signReplicationInt64x2(in, scratch);
  in = asMasm().moveSimd128FloatIfNotAVX(in, dest);
  // Invert if negative, shift all, invert back if negative.
  vpxor(Operand(scratch), in, dest);
  vpsrlq(temp, dest, dest);
  vpxor(Operand(scratch), dest, dest);
}
   1250 
   1251 void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
   1252    FloatRegister in, Register count, FloatRegister dest) {
   1253  ScratchSimd128Scope scratch(asMasm());
   1254  vmovd(count, scratch);
   1255  vpsrlq(scratch, in, dest);
   1256 }
   1257 
// i64x2.shr_s by a constant count.  As in the variable-count version above,
// there is no packed 64-bit arithmetic right shift before AVX-512, so XOR
// with the per-lane sign mask around a logical shift.
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
    Imm32 count, FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  // scratch = each lane's sign bit replicated across the lane (0 or ~0).
  asMasm().signReplicationInt64x2(src, scratch);
  // Invert if negative, shift all, invert back if negative.
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
  vpxor(Operand(scratch), src, dest);
  vpsrlq(Imm32(count.value & 63), dest, dest);
  vpxor(Operand(scratch), dest, dest);
}
   1268 
// Bitwise select: output = (mask & onTrue) | (~mask & onFalse), per bit.
void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
                                            FloatRegister onTrue,
                                            FloatRegister onFalse,
                                            FloatRegister temp,
                                            FloatRegister output) {
  // Normally the codegen will attempt to enforce these register assignments so
  // that the moves are avoided.

  onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output);
  if (MOZ_UNLIKELY(mask == onTrue)) {
    // With mask == onTrue, (m & m) | (~m & onFalse) simplifies to
    // m | onFalse.
    vpor(Operand(onFalse), onTrue, output);
    return;
  }

  mask = asMasm().moveSimd128IntIfNotAVX(mask, temp);

  vpand(Operand(mask), onTrue, output);   // output = mask & onTrue
  vpandn(Operand(onFalse), mask, temp);   // temp = ~mask & onFalse
  vpor(Operand(temp), output, output);    // combine the halves
}
   1289 
   1290 // Code sequences for int32x4<->float32x4 culled from v8; commentary added.
   1291 
// Convert unsigned i32 lanes to f32.  cvtdq2ps is a signed conversion, so
// split each lane: the low 16 bits convert exactly; the remaining high part
// is halved into the signed range, converted, doubled back, and added to the
// low part (only that final add may round).
void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  src = asMasm().moveSimd128IntIfNotAVX(src, dest);
  vpxor(Operand(scratch), scratch, scratch);  // extract low bits
  vpblendw(0x55, src, scratch, scratch);      //   into scratch
  vpsubd(Operand(scratch), src, dest);        //     and high bits into dest
  vcvtdq2ps(scratch, scratch);                // convert low bits
  vpsrld(Imm32(1), dest, dest);               // get high into unsigned range
  vcvtdq2ps(dest, dest);                      //   convert
  vaddps(Operand(dest), dest, dest);          //     and back into signed
  vaddps(Operand(scratch), dest, dest);       // combine high+low: may round
}
   1305 
// Saturating truncation of f32x4 to signed i32x4.
void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
                                                         FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to.  We want to saturate too-large positive
  // values to 7FFFFFFFh and too-large negative values to 80000000h.  NaN and -0
  // become 0.

  // Convert NaN to 0 by masking away values that compare unordered to itself.
  // (A lane equals itself exactly when it is not NaN.)
  if (HasAVX()) {
    vcmpeqps(Operand(src), src, scratch);
    vpand(Operand(scratch), src, dest);
  } else {
    vmovaps(src, scratch);
    vcmpeqps(Operand(scratch), scratch, scratch);
    moveSimd128Float(src, dest);
    vpand(Operand(scratch), dest, dest);
  }

  // Make lanes in scratch == 0xFFFFFFFFh, if dest overflows during cvttps2dq,
  // otherwise 0.
  static const SimdConstant minOverflowedInt =
      SimdConstant::SplatX4(2147483648.f);
  if (HasAVX()) {
    asMasm().vcmpgepsSimd128(minOverflowedInt, dest, scratch);
  } else {
    asMasm().loadConstantSimd128Float(minOverflowedInt, scratch);
    vcmpleps(Operand(dest), scratch, scratch);
  }

  // Convert.  This will make the output 80000000h if the input is out of range.
  vcvttps2dq(dest, dest);

  // Convert overflow lanes to 0x7FFFFFFF: XORing 80000000h with the all-ones
  // overflow mask yields 7FFFFFFFh; non-overflow lanes are XORed with 0.
  vpxor(Operand(scratch), dest, dest);
}
   1343 
// Saturating truncation of f32x4 to unsigned i32x4, by biasing the
// high-value range into the signed range, converting twice, and correcting.
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
    FloatRegister src, FloatRegister temp, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);

  // The cvttps2dq instruction is the workhorse but does not handle NaN or out
  // of range values as we need it to.  We want to saturate too-large positive
  // values to FFFFFFFFh and negative values to zero.  NaN and -0 become 0.

  // Convert NaN and negative values to zeroes in dest.
  vxorps(Operand(scratch), scratch, scratch);
  vmaxps(Operand(scratch), src, dest);

  // Place the largest positive signed integer in all lanes in scratch.
  // We use it to bias the conversion to handle edge cases.
  asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(2147483647.f),
                                    scratch);

  // temp = dest - 7FFFFFFFh (as floating), this brings integers in the unsigned
  // range but above the signed range into the signed range; 0 => -7FFFFFFFh.
  vmovaps(dest, temp);
  vsubps(Operand(scratch), temp, temp);

  // scratch = mask of biased values that are greater than 7FFFFFFFh.
  vcmpleps(Operand(temp), scratch, scratch);

  // Convert the biased values to integer.  Positive values above 7FFFFFFFh will
  // have been converted to 80000000h, all others become the expected integer.
  vcvttps2dq(temp, temp);

  // As lanes of scratch are ~0 where the result overflows, this computes
  // 7FFFFFFF in lanes of temp that are 80000000h, and leaves other lanes
  // untouched as the biased integer.
  vpxor(Operand(scratch), temp, temp);

  // Convert negative biased lanes in temp to zero.  After this, temp will be
  // zero where the result should be zero or is less than 80000000h, 7FFFFFFF
  // where the result overflows, and will have the converted biased result in
  // other lanes (for input values >= 80000000h).
  vpxor(Operand(scratch), scratch, scratch);
  vpmaxsd(Operand(scratch), temp, temp);

  // Convert. Overflow lanes above 7FFFFFFFh will be 80000000h, other lanes will
  // be what they should be.
  vcvttps2dq(dest, dest);

  // Add temp to the result.  Overflow lanes with 80000000h becomes FFFFFFFFh,
  // biased high-value unsigned lanes become unbiased, everything else is left
  // unchanged.
  vpaddd(Operand(temp), dest, dest);
}
   1395 
// Relaxed (non-saturating for out-of-range/NaN) truncation of f32x4 to
// unsigned i32x4.  0x4f000000 is the bit pattern of 2147483648.0f, the
// smallest float not representable as a positive signed i32.
void MacroAssemblerX86Shared::unsignedTruncFloat32x4ToInt32x4Relaxed(
    FloatRegister src, FloatRegister dest) {
  ScratchSimd128Scope scratch(asMasm());
  src = asMasm().moveSimd128FloatIfNotAVX(src, dest);

  // Place lanes below 80000000h into dest, otherwise into scratch.
  // Keep dest or scratch 0 as default.
  asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch);
  vcmpltps(Operand(src), scratch, scratch);
  vpand(Operand(src), scratch, scratch);
  vpxor(Operand(scratch), src, dest);

  // Convert lanes below 80000000h into unsigned int without issues.
  vcvttps2dq(dest, dest);
  // Knowing IEEE-754 number representation: convert lanes above 7FFFFFFFh,
  // multiply by 2 (to add 1 in exponent) and shift to the left by 8 bits.
  vaddps(Operand(scratch), scratch, scratch);
  vpslld(Imm32(8), scratch, scratch);

  // Combine the results.
  vpaddd(Operand(scratch), dest, dest);
}
   1418 
// Convert the two low unsigned i32 lanes of src to f64x2.  SSE2 has no
// unsigned int -> double conversion, so use the 2^52 bias trick: pairing
// each u32 with high dword 0x43300000 forms the double 2^52 + u32 (the u32
// sits in the low mantissa bits); subtracting 2^52 (4503599627370496.0)
// then leaves the exact integer value in each double lane.
void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat64x2(
   FloatRegister src, FloatRegister dest) {
 src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
 // Interleave the low two u32 lanes of src with the 0x43300000 constant.
 asMasm().vunpcklpsSimd128(SimdConstant::SplatX4(0x43300000), src, dest);
 asMasm().vsubpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest, dest);
}
   1425 
// Saturating truncation of two f64 lanes to signed i32; vcvttpd2dq zeroes
// the high two output lanes.  vcvttpd2dq already yields 80000000h
// (INT32_MIN) for NaN and out-of-range inputs, which is correct saturation
// on the negative side, so only NaN and positive overflow need fixups.
void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src,
                                                        FloatRegister temp,
                                                        FloatRegister dest) {
 // temp = all-ones in lanes where src is not NaN (x == x), zero for NaN.
 FloatRegister srcForTemp = asMasm().moveSimd128FloatIfNotAVX(src, temp);
 vcmpeqpd(Operand(srcForTemp), srcForTemp, temp);

 src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
 // temp = 2147483647.0 (INT32_MAX) in non-NaN lanes, +0.0 in NaN lanes.
 asMasm().vandpdSimd128(SimdConstant::SplatX2(2147483647.0), temp, temp);
 // Clamp from above at INT32_MAX; NaN lanes end up as 0 via minpd's NaN
 // handling.  Then truncate toward zero.
 vminpd(Operand(temp), src, dest);
 vcvttpd2dq(dest, dest);
}
   1437 
// Saturating truncation of two f64 lanes to unsigned i32; the high two
// output lanes are zeroed by the final shuffle with temp == 0.
void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
   FloatRegister src, FloatRegister temp, FloatRegister dest) {
 src = asMasm().moveSimd128FloatIfNotAVX(src, dest);

 // Clamp from below at +0.0; NaN lanes also become 0 via maxpd's NaN
 // handling.
 vxorpd(temp, temp, temp);
 vmaxpd(Operand(temp), src, dest);

 // Clamp from above at 4294967295.0 (UINT32_MAX), truncate toward zero,
 // then add 2^52 (4503599627370496.0) so each lane's integer value appears
 // in the low 32 bits of its double representation.
 asMasm().vminpdSimd128(SimdConstant::SplatX2(4294967295.0), dest, dest);
 vroundpd(SSERoundingMode::Trunc, Operand(dest), dest);
 asMasm().vaddpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest, dest);

 // temp == 0
 // Gather the low dword of each double into output lanes 0-1 and zeros
 // from temp into lanes 2-3.
 vshufps(0x88, temp, dest, dest);
}
   1452 
// Relaxed truncation of two f64 lanes to unsigned i32.
void MacroAssemblerX86Shared::unsignedTruncFloat64x2ToInt32x4Relaxed(
   FloatRegister src, FloatRegister dest) {
 ScratchSimd128Scope scratch(asMasm());

 // The same technique as unsignedTruncSatFloat64x2ToInt32x4, but without NaN
 // and out-of-bounds checks: truncate toward zero, then add 2^52
 // (4503599627370496.0) so each lane's integer value lands in the low 32
 // bits of its double representation.
 vroundpd(SSERoundingMode::Trunc, Operand(src), dest);
 asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0),
                                   scratch);
 vaddpd(Operand(scratch), dest, dest);
 // The scratch has zeros in f32x4 lanes with index 0 and 2. The in-memory
 // representation of the splatted double constant contains zero in its
 // low bits.
 vshufps(0x88, scratch, dest, dest);
}
   1468 
   1469 void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src,
   1470                                            FloatRegister temp,
   1471                                            FloatRegister output) {
   1472  ScratchSimd128Scope scratch(asMasm());
   1473  asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch);
   1474  FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp);
   1475  vpand(scratch, srcForTemp, temp);
   1476  vpandn(src, scratch, scratch);
   1477  int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
   1478  asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output);
   1479  vpsrlw(Imm32(4), scratch, scratch);
   1480  vpshufb(temp, output, output);
   1481  asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), temp);
   1482  vpshufb(scratch, temp, temp);
   1483  vpaddb(Operand(temp), output, output);
   1484 }