tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

UnrollLoops.cpp (78818B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "jit/UnrollLoops.h"
      8 
      9 #include "mozilla/Maybe.h"
     10 
     11 #include <stdio.h>
     12 
     13 #include "jit/DominatorTree.h"
     14 #include "jit/IonAnalysis.h"
     15 #include "jit/MIRGraph.h"
     16 
     17 namespace js {
     18 namespace jit {
     19 
     20 // [SMDOC] Loop unroller implementation summary
     21 //
     22 // This is a simple loop unroller, intended (initially at least) to handle only
     23 // wasm loops, with the aims of amortizing the per-iteration interrupt check
     24 // cost, and of lifting an initial iteration outside the loop so as to
     25 // facilitate subsequent optimizations.  Unrolling and peeling can be selected
     26 // independently, so the available choices are: peeling only, unrolling only,
     27 // or both peeling and unrolling.
     28 //
     29 // The flow of control (for a single function) is roughly:
     30 //
     31 // (0) The top level function is UnrollLoops.
     32 //
     33 // (1) UnrollLoops calls FindInitialCandidates to identify "initial candidate"
     34 //     loops in the function.  These are innermost loops.  Non-innermost loops
     35 //     cannot be unrolled/peeled.
     36 //
     37 // (2) FindInitialCandidates counts blocks and value defs in each initial
     38 //     candidate, creating an initial score which increases with loop size.
     39 //
     40 // (3) Back in UnrollLoops, initial candidates whose initial score does not
     41 //     exceed InitialCandidateThreshold proceed to the next stage in the
     42 //     process: they are passed to function AnalyzeLoop.
     43 //
     44 // (4) AnalyzeLoop, as called from UnrollLoops, checks many structural
     45 //     invariants on each loop, since the core unroll machinery depends on
     46 //     those invariants.  It also makes a final determination for each loop, as
     47 //     to whether the loop should be peeled, unrolled, peeled and unrolled, or
     48 //     be left unchanged.  In the latter case it returns one of a few reasons
     49 //     why the loop is to be left unchanged.
     50 //
     51 // (5) AnalyzeLoop also identifies, for a loop, its basic blocks, the set of
     52 //     blocks it jumps to when exiting, and the values defined in the loop
     53 //     which are used afterwards.
     54 //
     55 // (6) Back in UnrollLoops, we now know which loops are suitable for
     56 //     processing, what processing is to happen to them, and what their blocks
     57 //     are.  This is used to construct and initialize a `struct UnrollState`
     58 //     for each loop candidate that made it this far.
     59 //
     60 // (7) Before a loop can be unrolled, single-argument phi nodes need to be
     61 //     added to its exit target blocks, to "catch" values defined in the loop
     62 //     that are used afterwards.  This is a limited conversion into what is
     63 //     called "Loop-Closed SSA form" in the GCC/Clang literature.  UnrollLoops
     64 //     calls AddClosingPhisForLoop to get this done.  This is done for all
     65 //     loops before proceeding further.
     66 //
     67 // (8) Finally, we get to the core of the unrolling: the loops are handed to
     68 //     UnrollAndOrPeelLoop in turn.  This has extensive comments; a summary
     69 //     of the actions is:
     70 //     - blocks, phis and normal instructions are cloned so as to create
     71 //       the extra loop body copies.
     72 //     - the new blocks are added to the loop's UnrollState::blockTable.
     73 //     - during cloning, value uses are remapped to the correct iteration
     74 //       using an MDefinitionRemapper.
     75 //     - also during cloning, the new values are parked in a ValueTable for
     76 //       later reference.
     77 //     - control flow edges and predecessor vectors in the new and original
     78 //       block sets are fixed up, using RemapEdgeSource and
     79 //       RemapEdgeDestination.
     80 //     - header block phi nodes are fixed up in both the body copies
     81 //       and the original.
     82 //     - new exiting blocks are created, so as to maintain the invariant
     83 //       that there are no critical edges.
     84 //     - for exit target blocks, both their predecessor arrays and phi nodes
     85 //       (as installed by AddClosingPhisForLoop) are augmented to handle
     86 //       the new exit edges.
     87 //     - Wasm interrupt checks in all but the last body copy are nop'd out.
     88 //     - If peeling is required, the back edge of the unrolled loop is changed
     89 //       so it points at the second body copy, not the first (the original).
     90 //     - Finally, the new blocks are installed in the MIRGraph.
     91 //
     92 // (9) Back in UnrollLoops, once all loops have been processed,
     93 //     some function-level fixups are done:
     94 //     - the blocks are renumbered
     95 //     - the dominator tree is rebuilt
     96 //
     97 // UnrollLoops now returns to OptimizeMIR, handing back an indication of how
     98 // many loops were processed.  OptimizeMIR will tidy up the resulting function
     99 // by re-running GVN, empty block removal, and possibly other transformations.
    100 
    101 // The core logic in UnrollAndOrPeelLoop is heavily commented but still
    102 // nearly incomprehensible.  For debugging and modification it is helpful to
    103 // use the Dump{Block,Value}Table* routines provided.
    104 
    105 // The logic below functions without reference to block IDs or value IDs.
    106 
    107 // Much of the logic uses simple scans over vectors (classes BlockVector,
    108 // Matrix, SimpleSet, MDefinitionRemapper) rather than more complex data
    109 // structures.  The loops we process are small, which keeps the costs of these
    110 // scans low.  The InitialCandidateThreshold value used limits most processed
    111 // loops to about 6 blocks and 30-ish value definitions.
    112 
    113 // The idiom `blockTable.rowContains(0, block)` in the logic below means
    114 // "is `block` in the original loop?"
    115 
    116 // =====================================================================
    117 //
    118 // Tunables
    119 
    120 // As a crude first pass for filtering out unsuitable candidates, top level
    121 // function UnrollLoops calculates a size-related value for each candidate,
    122 // with lower values indicating more desirability for unrolling/peeling.
    123 // Candidates with a score above this value will not be unrolled.  Reducing the
    124 // value therefore reduces the aggressiveness of the unroller.
    125 const float InitialCandidateThreshold = 100.0;
    126 
    127 // On further analysis of candidates that pass the above test, we have more
    128 // detailed limitations on size.  The specified action will not be taken for a
    129 // loop if the associated metric exceeds the constants here.  Hence increasing
    130 // these values increases the aggressiveness of the unroller, but only to point
    131 // where the `InitialCandidateThreshold` would have cut it off.
    132 
    133 // Loops with more than this number of basic blocks or values (phis and normal
    134 // definitions) will be excluded from peeling.  The actual numbers are pretty
    135 // much pulled out of a hat.
    136 const size_t MaxBlocksForPeel = 12;
    137 const size_t MaxValuesForPeel = 150;
    138 
    139 // Loops with more than this number of blocks or values will be excluded from
    140 // unrolling.  Unrolling adds multiple copies of the loop body, whereas peeling
    141 // adds only one, so it seems prudent to make unrolling less aggressive than
    142 // peeling.
    143 const size_t MaxBlocksForUnroll = 10;
    144 const size_t MaxValuesForUnroll = 100;
    145 
    146 // =====================================================================
    147 //
    148 // BlockVector, a vector of MBasicBlock*s.
    149 
    150 using BlockVector = mozilla::Vector<MBasicBlock*, 32, SystemAllocPolicy>;
    151 
    152 static bool BlockVectorContains(const BlockVector& vec,
    153                                const MBasicBlock* block) {
    154  for (const MBasicBlock* b : vec) {
    155    if (b == block) {
    156      return true;
    157    }
    158  }
    159  return false;
    160 }
    161 
    162 // =====================================================================
    163 //
    164 // Matrix, a 2-D array of pointers
    165 
    166 template <typename T, int N, typename AP>
    167 class Matrix {
    168  static_assert(std::is_pointer<T>::value);
    169  mozilla::Vector<T, N, AP> vec_;
    170  size_t size1_ = 0;
    171  size_t size2_ = 0;
    172  inline size_t index(size_t ix1, size_t ix2) const {
    173    MOZ_ASSERT(ix1 < size1_ && ix2 < size2_);
    174    return ix1 * size2_ + ix2;
    175  }
    176 
    177 public:
    178  [[nodiscard]] bool init(size_t size1, size_t size2) {
    179    // Not already init'd.
    180    MOZ_ASSERT(size1_ == 0 && size2_ == 0 && vec_.empty());
    181    // This would be pointless.
    182    MOZ_ASSERT(size1 > 0 && size2 > 0);
    183    // So we can safely multiply them.  InitialCandidateThreshold ensures we'll
    184    // never have to process a loop anywhere near this big.
    185    MOZ_RELEASE_ASSERT(size1 <= 1000 && size2 <= 1000);
    186    size1_ = size1;
    187    size2_ = size2;
    188    if (!vec_.resize(size1 * size2)) {
    189      return false;
    190    }
    191    // Resizing also zero-initialises all the `vec_` entries.
    192    return true;
    193  }
    194 
    195  inline size_t size1() const {
    196    MOZ_ASSERT(size1_ * size2_ == vec_.length());
    197    return size1_;
    198  }
    199  inline size_t size2() const {
    200    MOZ_ASSERT(size1_ * size2_ == vec_.length());
    201    return size2_;
    202  }
    203  inline T get(size_t ix1, size_t ix2) const { return vec_[index(ix1, ix2)]; }
    204  inline void set(size_t ix1, size_t ix2, T value) {
    205    vec_[index(ix1, ix2)] = value;
    206  }
    207 
    208  // Does the matrix slice `[ix1, ..]` contain `value`?
    209  inline bool rowContains(size_t ix1, const T value) const {
    210    return findInRow(ix1, value).isSome();
    211  }
    212 
    213  // Find the column number (ix2) such that entry `[ix1, ix2] == value`.
    214  mozilla::Maybe<size_t> findInRow(size_t ix1, const T value) const {
    215    for (size_t ix2 = 0; ix2 < size2_; ix2++) {
    216      if (get(ix1, ix2) == value) {
    217        return mozilla::Some(ix2);
    218      }
    219    }
    220    return mozilla::Nothing();
    221  }
    222 };
    223 
    224 using BlockTable = Matrix<MBasicBlock*, 32, SystemAllocPolicy>;
    225 using ValueTable = Matrix<MDefinition*, 128, SystemAllocPolicy>;
    226 
    227 // =====================================================================
    228 //
    229 // Debug printing
    230 
    231 #ifdef JS_JITSPEW
    232 
    233 // For debugging, dump specific rows (loop body copies) of a block table.
    234 static void DumpBlockTableRows(const BlockTable& table, int32_t firstCix,
    235                               int32_t lastCix, const char* tag) {
    236  Fprinter& printer(JitSpewPrinter());
    237  printer.printf("<<<< %s\n", tag);
    238  for (int32_t cix = firstCix; cix <= lastCix; cix++) {
    239    if (cix == 0) {
    240      printer.printf("  -------- Original --------\n");
    241    } else {
    242      printer.printf("  -------- Copy %u --------\n", cix);
    243    }
    244    for (uint32_t bix = 0; bix < table.size2(); bix++) {
    245      DumpMIRBlock(printer, table.get(uint32_t(cix), bix),
    246                   /*showDetails=*/true);
    247    }
    248  }
    249  printer.printf(">>>>\n");
    250 }
    251 
    252 // Dump an entire block table.
    253 static void DumpBlockTable(const BlockTable& table, const char* tag) {
    254  DumpBlockTableRows(table, 0, int32_t(table.size1()) - 1, tag);
    255 }
    256 
    257 // Dump just the original loop body in a block table.
    258 static void DumpBlockTableRowZero(const BlockTable& table, const char* tag) {
    259  DumpBlockTableRows(table, 0, 0, tag);
    260 }
    261 
    262 // Dump a value table.
    263 static void DumpValueTable(const ValueTable& table, const char* tag) {
    264  Fprinter& printer(JitSpewPrinter());
    265  printer.printf("<<<< %s\n", tag);
    266  for (uint32_t cix = 0; cix < table.size1(); cix++) {
    267    if (cix == 0) {
    268      printer.printf("  -------- Original --------\n");
    269    } else {
    270      printer.printf("  -------- Copy %u --------\n", cix);
    271    }
    272    for (uint32_t vix = 0; vix < table.size2(); vix++) {
    273      printer.printf("    ");
    274      DumpMIRDefinition(printer, table.get(cix, vix),
    275                        /*showDetails=*/true);
    276      printer.printf("\n");
    277    }
    278  }
    279  printer.printf(">>>>\n");
    280 }
    281 
    282 #endif  // JS_JITSPEW
    283 
    284 // =====================================================================
    285 //
    286 // SimpleSet, a set of pointers
    287 
    288 template <typename T, int N, typename AP>
    289 class SimpleSet {
    290  static_assert(std::is_pointer<T>::value);
    291  mozilla::Vector<T, N, AP> vec_;
    292 
    293 public:
    294  [[nodiscard]] bool copy(const SimpleSet<T, N, AP>& other) {
    295    vec_.clear();
    296    for (T elem : other.vec_) {
    297      if (!vec_.append(elem)) {
    298        return false;
    299      }
    300    }
    301    return true;
    302  }
    303  bool contains(T t) const {
    304    for (auto* existing : vec_) {
    305      if (existing == t) {
    306        return true;
    307      }
    308    }
    309    return false;
    310  }
    311  [[nodiscard]] bool add(T t) {
    312    MOZ_ASSERT(t);
    313    return contains(t) ? true : vec_.append(t);
    314  }
    315  bool empty() const { return vec_.empty(); }
    316  size_t size() const { return vec_.length(); }
    317  T get(size_t ix) const { return vec_[ix]; }
    318 };
    319 
    320 using BlockSet = SimpleSet<MBasicBlock*, 8, SystemAllocPolicy>;
    321 using ValueSet = SimpleSet<MDefinition*, 64, SystemAllocPolicy>;
    322 
    323 // =====================================================================
    324 //
    325 // MDefinitionRemapper, a very simple MDefinition*-remapping class.
    326 
    327 class MDefinitionRemapper {
    328  using Pair = std::pair<MDefinition*, MDefinition*>;
    329  mozilla::Vector<Pair, 32, SystemAllocPolicy> pairs;
    330 
    331 public:
    332  MDefinitionRemapper() {}
    333  // Register `original` as a key in the mapper, and map it to itself.
    334  [[nodiscard]] bool enregister(MDefinition* original) {
    335    MOZ_ASSERT(original);
    336    for (auto& p : pairs) {
    337      (void)p;
    338      MOZ_ASSERT(p.first != original);  // not mapped at all
    339      MOZ_ASSERT(p.second == p.first);  // no `update` calls happened yet
    340    }
    341    return pairs.append(std::pair(original, original));
    342  }
    343  // Look up the binding for `original` in the mapper.
    344  MDefinition* lookup(const MDefinition* original) const {
    345    MOZ_ASSERT(original);
    346    MDefinition* res = nullptr;
    347    for (auto& p : pairs) {
    348      if (p.first == original) {
    349        res = p.second;
    350        break;
    351      }
    352    }
    353    return res;
    354  }
    355  // Update the mapper so as to map `original` to `replacement`.  `original`
    356  // must have previously been registered with ::enregister.
    357  void update(const MDefinition* original, MDefinition* replacement) {
    358    MOZ_ASSERT(original && replacement);
    359    MOZ_ASSERT(original != replacement);
    360    for (auto& p : pairs) {
    361      if (p.first == original) {
    362        p.second = replacement;
    363        return;
    364      }
    365    }
    366    MOZ_CRASH();  // asked to update a non-existent key
    367  }
    368 };
    369 
    370 // =====================================================================
    371 //
    372 // MakeReplacementInstruction / MakeReplacementPhi
    373 
    374 // Make a cloned version of `ins`, but with its arguments remapped by `mapper`.
    375 static MInstruction* MakeReplacementInstruction(
    376    TempAllocator& alloc, const MDefinitionRemapper& mapper,
    377    const MInstruction* ins) {
    378  MDefinitionVector inputs(alloc);
    379  for (size_t i = 0; i < ins->numOperands(); i++) {
    380    MDefinition* old = ins->getOperand(i);
    381    MDefinition* replacement = mapper.lookup(old);
    382    if (!replacement) {
    383      // The mapper didn't map it, which means that `old` is a value that's not
    384      // defined in the loop body.  So leave it unchanged.
    385      replacement = old;
    386    }
    387    if (!inputs.append(replacement)) {
    388      return nullptr;
    389    }
    390  }
    391  return ins->clone(alloc, inputs);
    392 }
    393 
    394 // Make a cloned version of `phi`, but with its arguments remapped by `mapper`.
    395 static MPhi* MakeReplacementPhi(TempAllocator& alloc,
    396                                const MDefinitionRemapper& mapper,
    397                                const MPhi* phi) {
    398  MDefinitionVector inputs(alloc);
    399  for (size_t i = 0; i < phi->numOperands(); i++) {
    400    MDefinition* old = phi->getOperand(i);
    401    MDefinition* replacement = mapper.lookup(old);
    402    if (!replacement) {
    403      replacement = old;
    404    }
    405    if (!inputs.append(replacement)) {
    406      return nullptr;
    407    }
    408  }
    409  return phi->clone(alloc, inputs);
    410 }
    411 
    412 // =====================================================================
    413 //
    414 // UnrollState
    415 
    416 enum class UnrollMode { Peel, Unroll, PeelAndUnroll };
    417 
    418 struct UnrollState {
    419  // What should happen to the specified loop.
    420  const UnrollMode mode;
    421 
    422  // The central data structure: BlockTable, a 2-D array of block pointers.
    423  //
    424  // To create copies of the original loop body, we need to copy the original
    425  // blocks and fix up the inter-block edges and phis appropriately.  This
    426  // edge-fixup is done without reference to the block IDs; instead it is done
    427  // purely on the basis of the MBasicBlock* values themselves.  Hence it does
    428  // not make any assumption about the ordering of the block ID values.
    429  //
    430  // The block copies are managed using a 2-D array of block pointers, via
    431  // which we can uniformly access the original and new blocks.  The array is
    432  // indexed by `(copy index, block index in copy)`.  Both indices are
    433  // zero-based.  The sequencing of blocks implied by "block index in copy" is
    434  // the sequence produced by the RPO iterator as it enumerates the blocks in
    435  // the loop.
    436  //
    437  // Per the above, note that "block index" (`bix` in the code) is a zero-based
    438  // index into the RPO sequence of blocks in the loop and is not to be
    439  // confused with the pre-existing concept of "block ID".
    440  //
    441  // Note also that the required unroll factor (including that for the peeled
    442  // copy, if any) is equal to the first dimension of `blockTable`, and the
    443  // number of blocks in the original loop is implied by the second dimension
    444  // of `blockTable`.  Hence neither value needs to be stored explicitly.
    445  BlockTable blockTable;
    446 
    447  // The set of blocks arrived at (directly) by exiting branches in the
    448  // original loop.  FIXME: try to make const
    449  /*const*/ BlockSet exitTargetBlocks;
    450 
    451  // The set of values (MDefinition*s) defined in the original loop and used
    452  // afterwards.
    453  /*const*/ ValueSet exitingValues;
    454 
    455  explicit UnrollState(UnrollMode mode) : mode(mode) {}
    456 
    457  bool doPeeling() const {
    458    return mode == UnrollMode::Peel || mode == UnrollMode::PeelAndUnroll;
    459  }
    460  bool doUnrolling() const {
    461    return mode == UnrollMode::Unroll || mode == UnrollMode::PeelAndUnroll;
    462  }
    463 };
    464 
    465 // =====================================================================
    466 //
    467 // AnalyzeLoop and AnalysisResult
    468 
// Summarises the result of a detailed analysis of a candidate loop, producing
// either a recommendation about what to do with it, or a reason why it should
// be left unchanged.
enum class AnalysisResult {
  // OOM during analysis.
  OOM,

  // The chosen transformation for the loop ..
  Peel,
  Unroll,
  PeelAndUnroll,

  // .. or .. reasons why the loop can't be unrolled:
  //
  // The unrolling/peeling machinery below relies on the fact that MIR loops
  // are heavily constrained in their structure.  AnalyzeLoop checks the
  // relevant invariants carefully, with this value indicating non-compliance.
  // Top level function UnrollLoops will MOZ_CRASH if this happens.
  BadInvariants,
  // Cannot be handled by the unroller: if the loop defines value(s) that are
  // used after the loop, then we can only handle the case where it has one
  // exiting branch.
  TooComplex,
  // The loop contains MIR nodes that are uncloneable, that is, for which
  // `canClone()` is false.
  Uncloneable,
  // The loop has too many blocks or values (per the Max{Blocks,Values}For*
  // tunables above).
  TooLarge,
  // The loop is otherwise unsuitable:
  // * contains a call or table switch
  // * is an infinite loop
  Unsuitable
};
    501 
    502 #ifdef JS_JITSPEW
    503 static const char* Name_of_AnalysisResult(AnalysisResult res) {
    504  switch (res) {
    505    case AnalysisResult::OOM:
    506      return "OOM";
    507    case AnalysisResult::Peel:
    508      return "Peel";
    509    case AnalysisResult::Unroll:
    510      return "Unroll";
    511    case AnalysisResult::PeelAndUnroll:
    512      return "PeelAndUnroll";
    513    case AnalysisResult::BadInvariants:
    514      return "BadInvariants";
    515    case AnalysisResult::TooComplex:
    516      return "TooComplex";
    517    case AnalysisResult::Uncloneable:
    518      return "Uncloneable";
    519    case AnalysisResult::TooLarge:
    520      return "TooLarge";
    521    case AnalysisResult::Unsuitable:
    522      return "Unsuitable";
    523    default:
    524      MOZ_CRASH();
    525  }
    526 }
    527 #endif
    528 
    529 // Examine the original loop in `originalBlocks` for unrolling suitability,
    530 // and, if acceptable, collect auxiliary information:
    531 //
    532 // * the set of blocks that are direct exit targets for the loop
    533 // * the set of values defined in the loop AND used afterwards
    534 
    535 static AnalysisResult AnalyzeLoop(const BlockVector& originalBlocks,
    536                                  BlockSet* exitTargetBlocks,
    537                                  ValueSet* exitingValues) {
    538  MOZ_ASSERT(exitTargetBlocks->empty());
    539  MOZ_ASSERT(exitingValues->empty());
    540 
    541  // ==== BEGIN check invariants on the loop structure ====
    542 
    543  // The unroller/peeler below depends on these invariants.
    544  // In summary (where nBlocks == originalBlocks.length()):
    545  //
    546  // * At least 2 blocks in original loop (1 for header phis + possible insns,
    547  //   >= 0 for "ordinary" blocks, 1 for the backedge jump)
    548  //
    549  // * originalBlocks[0] is a loop header
    550  // * originalBlocks[nBlocks-1] == originalBlocks[0]->backedge
    551  //
    552  // * all blocks in original loop have at least one insn
    553  // * all blocks in original loop have a control insn as their last insn
    554  // * all blocks in original loop have non-control insns for all other insns
    555  // * all blocks in original loop have loop depth > 0
    556  //
    557  // * none of originalBlocks[0 .. nBlocks-2] (inclusive) jumps to header
    558  //
    559  // * originalBlocks[nBlocks-1] jumps unconditionally to header
    560  //
    561  // * original header node has 2 preds
    562  // * original header node pred[0] is from outside the loop, and
    563  //   pred[1] is from inside the loop.
    564  //
    565  // * original header node phis all have 2 args
    566  // * original header node phis have arg[0] from outside the loop
    567  //   (because pred[0] is the loop-entry edge)
    568  // * original header node phis have their arg[1] as
    569  //   -- (almost always) defined in the loop -- loop carried variable
    570  //   -- (very occasionally) defined outside the loop -- in which case
    571  //      this is just a vanilla phi, nothing to do with the loop.
    572  //   Hence there is nothing to check for phi arg[1]s, but leaving this
    573  //   here for clarity.
    574  //
    575  // Aside: how can a loop head phi not depend on a value defined within the
    576  // loop?  Consider the strange-but-valid case
    577  //
    578  //    x = 5 ; loop { .. x = 7; .. }
    579  //
    580  // The loop head phi will merge the initial value (5) and the loop-defined
    581  // value in the usual way.  Imagine now the loop is LICMd; the loop defined
    582  // value `x = 7;` is observed to be invariant and so is lifted out of the
    583  // loop.  Hence the loop head phi will have both args defined outside the
    584  // loop.
    585 
    586  // * At least 2 blocks in original loop (1 for header phis + possible insns,
    587  //   >= 0 for "ordinary" blocks, 1 for the backedge jump)
    588  const size_t numBlocksInOriginal = originalBlocks.length();
    589  if (numBlocksInOriginal < 2) {
    590    return AnalysisResult::BadInvariants;
    591  }
    592 
    593  // * originalBlocks[0] is a loop header
    594  // * originalBlocks[nBlocks-1] == originalBlocks[0]->backedge
    595  {
    596    MBasicBlock* header = originalBlocks[0];
    597    if (!header->isLoopHeader() ||
    598        header->backedge() != originalBlocks[numBlocksInOriginal - 1]) {
    599      return AnalysisResult::BadInvariants;
    600    }
    601  }
    602 
    603  // * all blocks in original loop have at least one insn
    604  // * all blocks in original loop have a control insn as their last insn
    605  // * all blocks in original loop have non-control insns for all other insns
    606  // * all blocks in original loop have loop depth > 0
    607  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    608    MBasicBlock* block = originalBlocks[bix];
    609    // This is a bit lame, but ..
    610    size_t numInsns = 0;
    611    for (MInstructionIterator insnIter(block->begin());
    612         insnIter != block->end(); insnIter++) {
    613      numInsns++;
    614    }
    615    if (numInsns < 1) {
    616      return AnalysisResult::BadInvariants;
    617    }
    618    bool ok = true;
    619    size_t curInsn = 0;
    620    for (MInstructionIterator insnIter(block->begin());
    621         insnIter != block->end(); insnIter++) {
    622      MInstruction* insn = *insnIter;
    623      if (curInsn == numInsns - 1) {
    624        ok = ok && insn->isControlInstruction();
    625      } else {
    626        ok = ok && !insn->isControlInstruction();
    627      }
    628      curInsn++;
    629    }
    630    MOZ_ASSERT(curInsn == numInsns);  // internal logic
    631    ok = ok && block->loopDepth() > 0;
    632    if (!ok) {
    633      return AnalysisResult::BadInvariants;
    634    }
    635  }
    636 
    637  // * none of originalBlocks[0 .. nBlocks-2] (inclusive) jumps to header
    638  {
    639    MBasicBlock* header = originalBlocks[0];
    640    for (uint32_t bix = 0; bix < numBlocksInOriginal - 1; bix++) {
    641      MBasicBlock* block = originalBlocks[bix];
    642      if (!block->hasLastIns()) {
    643        return AnalysisResult::BadInvariants;
    644      }
    645      MControlInstruction* lastIns = block->lastIns();
    646      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
    647        if (lastIns->getSuccessor(i) == header) {
    648          return AnalysisResult::BadInvariants;
    649        }
    650      }
    651    }
    652  }
    653 
    654  // originalBlocks[nBlocks-1] jumps unconditionally to header
    655  {
    656    MBasicBlock* header = originalBlocks[0];
    657    MBasicBlock* backedge = originalBlocks[numBlocksInOriginal - 1];
    658    if (!backedge->hasLastIns()) {
    659      return AnalysisResult::BadInvariants;
    660    }
    661    MControlInstruction* lastIns = backedge->lastIns();
    662    if (!lastIns->isGoto() || lastIns->toGoto()->target() != header) {
    663      return AnalysisResult::BadInvariants;
    664    }
    665  }
    666 
    667  // * original header node has 2 preds
    668  // * original header node pred[0] is from outside the loop.
    669  //   pred[1] is from inside the loop.
    670  {
    671    MBasicBlock* header = originalBlocks[0];
    672    if (header->numPredecessors() != 2 ||
    673        BlockVectorContains(originalBlocks, header->getPredecessor(0)) ||
    674        !BlockVectorContains(originalBlocks, header->getPredecessor(1))) {
    675      return AnalysisResult::BadInvariants;
    676    }
    677  }
    678 
    679  // * original header node phis all have 2 args
    680  // * original header node phis have arg[0] from outside the loop
    681  //   (because pred[0] is the loop-entry edge)
    682  // * original header node phis have their arg[1] as
    683  //   -- (almost always) defined in the loop -- loop carried variable
    684  //   -- (very occasionally) defined outside the loop -- in which case
    685  //      this is just a vanilla phi, nothing to do with the loop.
    686  //   Hence there is nothing to check for phi arg[1]s, but leaving this
    687  //   here for clarity.
    688  {
    689    MBasicBlock* header = originalBlocks[0];
    690    for (MPhiIterator phiIter(header->phisBegin());
    691         phiIter != header->phisEnd(); phiIter++) {
    692      MPhi* phi = *phiIter;
    693      if (phi->numOperands() != 2 ||
    694          BlockVectorContains(originalBlocks, phi->getOperand(0)->block())) {
    695        return AnalysisResult::BadInvariants;
    696      }
    697    }
    698  }
    699 
    700  // ==== END check invariants on the loop structure ====
    701 
    702  // Check that all the insns are cloneable.
    703  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    704    MBasicBlock* block = originalBlocks[bix];
    705    for (MInstructionIterator insIter(block->begin()); insIter != block->end();
    706         insIter++) {
    707      MInstruction* ins = *insIter;
    708      if (ins->isWasmCallCatchable() || ins->isWasmCallUncatchable() ||
    709          ins->isWasmReturnCall() || ins->isTableSwitch()
    710          /* TODO: other node kinds unsuitable for unrolling? */) {
    711        return AnalysisResult::Unsuitable;
    712      }
    713      if (!ins->canClone()) {
    714        // `ins->opName()` will show the name of the insn kind, if you want to
    715        // see it
    716        return AnalysisResult::Uncloneable;
    717      }
    718    }
    719  }
    720 
    721  // More analysis: make up a set of blocks that are not in the loop, but which
    722  // are jumped to from within the loop.  We will need this later.
    723  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    724    MBasicBlock* block = originalBlocks[bix];
    725    MControlInstruction* lastIns = block->lastIns();  // asserted safe above
    726    for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
    727      MBasicBlock* succ = lastIns->getSuccessor(i);
    728      // See if `succ` is any of the blocks in the (original) loop.  If not,
    729      // add it to `exitTargetBlocks`.
    730      if (!BlockVectorContains(originalBlocks, succ)) {
    731        if (!exitTargetBlocks->add(succ)) {
    732          return AnalysisResult::OOM;
    733        }
    734      }
    735    }
    736  }
    737 
    738  // If the loop has zero exits, we consider it sufficiently mutant to ignore
    739  // (it's an infinite loop?)
    740  if (exitTargetBlocks->empty()) {
    741    return AnalysisResult::Unsuitable;
    742  }
    743 
    744  // Even more analysis: make up a set of MDefinition*s that are defined in the
    745  // original loop and which are used after the loop.  We will later need to
    746  // add phi nodes that generate the correct values for these, since, after
    747  // unrolling, these values will have multiple definition points (one per
    748  // unrolled iteration).  While we're at it, count the number of values
    749  // defined in the original loop.
    750  size_t numValuesInOriginal = 0;
    751  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    752    MBasicBlock* block = originalBlocks[bix];
    753    // Check all phi nodes ..
    754    for (MPhiIterator phiIter(block->phisBegin()); phiIter != block->phisEnd();
    755         phiIter++) {
    756      numValuesInOriginal++;
    757      MPhi* phi = *phiIter;
    758      bool usedAfterLoop = false;
    759      // Is the value of `phi` used after the loop?
    760      for (MUseIterator useIter(phi->usesBegin()); useIter != phi->usesEnd();
    761           useIter++) {
    762        MUse* use = *useIter;
    763        if (!BlockVectorContains(originalBlocks, use->consumer()->block())) {
    764          usedAfterLoop = true;
    765          break;
    766        }
    767      }
    768      if (usedAfterLoop && !exitingValues->add(phi)) {
    769        return AnalysisResult::OOM;
    770      }
    771    }
    772    // .. and all normal nodes ..
    773    for (MInstructionIterator insnIter(block->begin());
    774         insnIter != block->end(); insnIter++) {
    775      numValuesInOriginal++;
    776      MInstruction* insn = *insnIter;
    777      bool usedAfterLoop = false;
    778      // Is the value of `insn` used after the loop?
    779      for (MUseIterator useIter(insn->usesBegin()); useIter != insn->usesEnd();
    780           useIter++) {
    781        MUse* use = *useIter;
    782        if (!BlockVectorContains(originalBlocks, use->consumer()->block())) {
    783          usedAfterLoop = true;
    784          break;
    785        }
    786      }
    787      if (usedAfterLoop && !exitingValues->add(insn)) {
    788        return AnalysisResult::OOM;
    789      }
    790    }
    791  }
    792 
    793  // Due to limitations in AddClosingPhisForLoop (see comments there),
    794  // currently we can only unroll loops where, either:
    795  //
    796  // * no value defined in the loop is used afterwards, in which case we handle
    797  //   any number of exits from the loop.
    798  //
    799  // * one or more values defined in the loop is used afterwards, in which case
    800  //   there must be one exit point from the loop.
    801  //
    802  // IOW we can't handle multiple exits *and* values defined in the loop.
    803  if (exitTargetBlocks->size() >= 2 && exitingValues->size() > 0) {
    804    return AnalysisResult::TooComplex;
    805  }
    806 
    807  // There is an invariant that a loop exit target block does not contain any
    808  // phis.  This is really just the no-critical-edges invariant.
    809  //
    810  // By definition, a loop exit block has at least one successor in the loop
    811  // and at least one successor outside the loop.  If the loop exit target has
    812  // phis, that would imply it has multiple predecessors.  But then the edge
    813  // would be critical, because it connects a block with multiple successors to
    814  // a block with multiple predecessors.
    815  //
    816  // So enforce that; it simplifies the addition of loop-closing phis that will
    817  // happen shortly.
    818  for (size_t i = 0; i < exitTargetBlocks->size(); i++) {
    819    MBasicBlock* exitTargetBlock = exitTargetBlocks->get(i);
    820    if (!exitTargetBlock->phisEmpty()) {
    821      return AnalysisResult::BadInvariants;
    822    }
    823    // Also we need this, since we'll be adding single-argument phis to this
    824    // block during loop-closing.
    825    if (exitTargetBlock->numPredecessors() != 1) {
    826      return AnalysisResult::BadInvariants;
    827    }
    828  }
    829 
    830  // At this point, we've worked through all invariant- and structural- reasons
    831  // why we can't/shouldn't unroll/peel this loop.  Now we need to decide which
    832  // (or both) of those actions should happen.
    833  bool peel = true;
    834  bool unroll = true;
    835  if (numBlocksInOriginal > MaxBlocksForPeel ||
    836      numValuesInOriginal > MaxValuesForPeel) {
    837    peel = false;
    838  }
    839  if (numBlocksInOriginal > MaxBlocksForUnroll ||
    840      numValuesInOriginal > MaxValuesForUnroll) {
    841    unroll = false;
    842  }
    843 
    844  if (peel) {
    845    return unroll ? AnalysisResult::PeelAndUnroll : AnalysisResult::Peel;
    846  } else {
    847    return unroll ? AnalysisResult::Unroll : AnalysisResult::TooLarge;
    848  }
    849 }
    850 
    851 // =====================================================================
    852 //
    853 // AddClosingPhisForLoop
    854 
    855 // Add "loop-closing phis" for values defined in a loop and used afterwards.
    856 // In GCC parlance, the resulting form is called LCSSA (Loop-Closed SSA).  This
    857 // is a very limited intervention in that it can only handle a single exit in a
    858 // loop, thereby restricting the places where the phis should go to the
    859 // one-and-only exit block, and avoiding the need to compute iterated dominance
    860 // frontiers.
    861 
     862 [[nodiscard]]
         // Arguments:
         //   `alloc` - allocator used to create the new loop-closing phi nodes
         //   `state` - per-loop unrolling state; supplies the set of values that
         //             exit the loop, the (single) exit target block, and the
         //             block table, whose row 0 holds the original loop blocks
         // Returns false on OOM, true otherwise.
     863 static bool AddClosingPhisForLoop(TempAllocator& alloc,
     864                                  const UnrollState& state) {
     865  // If no values exit the loop, we don't care how many exit target blocks
     866  // there are, and need to make no changes.
     867  if (state.exitingValues.empty()) {
     868    return true;
     869  }
     870 
     871  // Ensured by AnalyzeLoop
     872  MOZ_ASSERT(state.exitTargetBlocks.size() == 1);
     873  MBasicBlock* targetBlock = state.exitTargetBlocks.get(0);
     874  // Ensured by AnalyzeLoop.  Required because we're adding single-arg phis
     875  // to `targetBlock`.
     876  MOZ_ASSERT(targetBlock->numPredecessors() == 1);
     877 
     878  for (size_t i = 0; i < state.exitingValues.size(); i++) {
     879    MDefinition* exitingValue = state.exitingValues.get(i);
     880    // In `targetBlock`, add a single-argument phi node that "catches"
     881    // `exitingValue`, and change all uses of `exitingValue` to be the phi
     882    // node instead.
     883    MPhi* phi = MPhi::New(alloc, exitingValue->type());
     884    if (!phi) {
     885      return false;
     886    }
     887    targetBlock->addPhi(phi);
     888    // Change all uses of `exitingValue` outside of the loop into uses of
     889    // `phi`.
            // Note that the iterator is advanced *before* `replaceProducer`
            // mutates `exitingValue`'s use list, so removing `use` from that
            // list does not invalidate the traversal.
     890    for (MUseIterator useIter(exitingValue->usesBegin()),
     891         useIterEnd(exitingValue->usesEnd());
     892         useIter != useIterEnd;
     893         /**/) {
     894      MUse* use = *useIter++;
              // Row 0 of the block table holds the original loop blocks; a use
              // whose consumer is inside the loop keeps `exitingValue` as its
              // producer.
     895      if (state.blockTable.rowContains(0, use->consumer()->block())) {
     896        continue;
     897      }
     898      MOZ_ASSERT(use->consumer());
     899      use->replaceProducer(phi);
     900    }
     901    // Finally, make `phi` be the one-and-only user of `exitingValue`.
            // This must happen *after* the use-rewriting loop above; otherwise
            // the phi's own new use of `exitingValue` (whose consumer block,
            // `targetBlock`, is outside the loop) would itself be redirected,
            // producing a self-referential phi.
     902    if (!phi->addInputFallible(exitingValue)) {
     903      return false;
     904    }
     905  }
     906 
     907  // Invariant that should hold at this point: for each exiting value, all uses
     908  // of it outside the loop should only be phi nodes.
        // (Debug-build-only check; `use` is DebugOnly, so this loop compiles to
        // nothing in release builds.)
     909  for (size_t i = 0; i < state.exitingValues.size(); i++) {
     910    MDefinition* exitingValue = state.exitingValues.get(i);
     911    for (MUseIterator useIter(exitingValue->usesBegin());
     912         useIter != exitingValue->usesEnd(); useIter++) {
     913      mozilla::DebugOnly<MUse*> use = *useIter;
     914      MOZ_ASSERT_IF(!state.blockTable.rowContains(0, use->consumer()->block()),
     915                    use->consumer()->isDefinition() &&
     916                        use->consumer()->toDefinition()->isPhi());
     917    }
     918  }
     919 
     920  return true;
     921 }
    922 
    923 // =====================================================================
    924 //
    925 // UnrollAndOrPeelLoop
    926 
    927 [[nodiscard]]
    928 static bool UnrollAndOrPeelLoop(MIRGraph& graph, UnrollState& state) {
    929  // Prerequisites (assumed):
    930  // * AnalyzeLoop has approved this loop for peeling and/or unrolling
    931  // * AddClosingPhisForLoop has been called for it
    932  //
    933  // We expect `state` to be ready to go, with the caveat that only the first
    934  // row of entries in `state.blockTable` have been filled in (these are the
    935  // original loop blocks).  All other entries are null and will be filled in
    936  // by this routine.
    937  //
    938  // Almost all the logic here is to do with unrolling.  That's because peeling
    939  // (done far below) can be done by a trivial modification of the loop
    940  // resulting from unrolling: make the backedge jump to the second copy of the
    941  // loop body rather than the first.  That leaves the first copy (that is, the
    942  // original loop body) in a "will only be executed once" state, as we desire.
    943 
    944  // We need this so often it's worth pulling out specially.
    945  const MBasicBlock* originalHeader = state.blockTable.get(0, 0);
    946  MOZ_ASSERT(originalHeader->isLoopHeader());
    947 
    948 #ifdef JS_JITSPEW
    949  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
    950    Fprinter& printer(JitSpewPrinter());
    951    printer.printf(
    952        "<<<< ORIGINAL FUNCTION (after LCSSA-ification of chosen loops)\n");
    953    DumpMIRGraph(printer, graph, /*showDetails=*/true);
    954    printer.printf(">>>>\n");
    955  }
    956 #endif
    957 
    958  // Decide on the total unroll factor.  "Copy #0" is the original loop, so the
    959  // number of new copies is `unrollFactor - 1`.  For example, if the original
    960  // loop was `loop { B; }` then with `unrollFactor == 3`, the unrolled loop
    961  // will be `loop { B; B; B; }`.  Note that the value acquired here includes
    962  // the copy needed for peeling, if that has been requested.
    963  const uint32_t unrollFactor = state.blockTable.size1();
    964  // The logic below will fail if this doesn't hold.
    965  MOZ_ASSERT(unrollFactor >= 2);
    966 
    967  // The number of blocks in the original loop.
    968  const uint32_t numBlocksInOriginal = state.blockTable.size2();
    969 
    970  // The number of values (phis and normal values) in the original loop.
    971  uint32_t numValuesInOriginal = 0;
    972  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    973    MBasicBlock* block = state.blockTable.get(0, bix);
    974    // This is very feeble, but I can't think of anything better.
    975    for (MPhiIterator phiIter(block->phisBegin()); phiIter != block->phisEnd();
    976         phiIter++) {
    977      numValuesInOriginal++;
    978    }
    979    for (MInstructionIterator insnIter(block->begin());
    980         insnIter != block->end(); insnIter++) {
    981      numValuesInOriginal++;
    982    }
    983  }
    984  // `numValuesInOriginal` is const after this point.
    985 
    986 #ifdef JS_JITSPEW
    987  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
    988    DumpBlockTableRowZero(state.blockTable, "ORIGINAL LOOP");
    989    Fprinter& printer(JitSpewPrinter());
    990    printer.printf("<<<< EXIT TARGET BLOCKS: ");
    991    for (size_t i = 0; i < state.exitTargetBlocks.size(); i++) {
    992      MBasicBlock* targetBlock = state.exitTargetBlocks.get(i);
    993      DumpMIRBlockID(printer, targetBlock, /*showDetails=*/true);
    994      printer.printf(" ");
    995    }
    996    printer.printf(">>>>\n");
    997    printer.printf("<<<< EXITING VALUES: ");
    998    for (size_t i = 0; i < state.exitingValues.size(); i++) {
    999      MDefinition* exitingValue = state.exitingValues.get(i);
   1000      DumpMIRDefinitionID(printer, exitingValue, /*showDetails=*/true);
   1001      printer.printf(" ");
   1002    }
   1003    printer.printf(">>>>\n");
   1004  }
   1005 #endif
   1006 
   1007  // At this point, `state.blockTable` row zero contains the original blocks,
   1008  // and all other rows are nulled out.
   1009 
   1010  // Set up the value table.  This is a structure similar to
   1011  // `state.blockTable`: a 2-D array of values.  This is local to this routine
   1012  // rather than being part of `state` because it is only needed here, and can
   1013  // take quite a lot of space.
   1014  //
   1015  // As with the block table, the main (actually, only) purpose of the value
   1016  // table is to make it possible to find equivalent definitions in different
   1017  // iterations.  Also as above, "value index" (`vix` in the code) is a
   1018  // zero-based index into the sequence of values (including phis) in the loop
   1019  // as generated by the RPO traversal of the loop's blocks.
   1020 
   1021  ValueTable valueTable;
   1022  if (!valueTable.init(unrollFactor, numValuesInOriginal)) {
   1023    return false;
   1024  }
   1025 
   1026  // Copy the original values into row zero of `valueTable`, parking them
   1027  // end-to-end, without regard to block boundaries.  All other rows remain
   1028  // nulled out for now.
   1029  {
   1030    uint32_t vix = 0;
   1031    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1032      MBasicBlock* block = state.blockTable.get(0, bix);
   1033      for (MPhiIterator phiIter(block->phisBegin());
   1034           phiIter != block->phisEnd(); phiIter++) {
   1035        valueTable.set(0, vix, *phiIter);
   1036        vix++;
   1037      }
   1038      for (MInstructionIterator insnIter(block->begin());
   1039           insnIter != block->end(); insnIter++) {
   1040        valueTable.set(0, vix, *insnIter);
   1041        vix++;
   1042      }
   1043    }
   1044    MOZ_ASSERT(vix == numValuesInOriginal);
   1045  }
   1046 
   1047  // Set up the value remapper.  This maps MDefinitions* in the original body
   1048  // to their "current" definition in new bodies.  The only allowed keys are
   1049  // MDefinition*s from the original body.  To enforce this, the keys have to
   1050  // be "registered" before use.
   1051  MDefinitionRemapper mapper;
   1052  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1053    MBasicBlock* originalBlock = state.blockTable.get(0, bix);
   1054    // Create initial entries in `mapper` for both the normal insns and phi
   1055    // nodes in `originalBlock`.
   1056    for (MPhiIterator phiIter(originalBlock->phisBegin());
   1057         phiIter != originalBlock->phisEnd(); phiIter++) {
   1058      MPhi* originalPhi = *phiIter;
   1059      if (!mapper.enregister(originalPhi)) {
   1060        return false;
   1061      }
   1062    }
   1063    for (MInstructionIterator insnIter(originalBlock->begin());
   1064         insnIter != originalBlock->end(); insnIter++) {
   1065      MInstruction* originalInsn = *insnIter;
   1066      if (!mapper.enregister(originalInsn)) {
   1067        return false;
   1068      }
   1069    }
   1070  }
   1071 
   1072  // Now we begin with the very core of the unrolling activity.
   1073 
   1074  // Create new empty blocks for copy indices 1 .. (unrollFactor-1)
   1075  const CompileInfo& info = originalHeader->info();
   1076  for (uint32_t cix = 1; cix < unrollFactor; cix++) {
   1077    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1078      MBasicBlock* empty = MBasicBlock::New(graph, info, /*pred=*/nullptr,
   1079                                            MBasicBlock::Kind::NORMAL);
   1080      if (!empty) {
   1081        return false;
   1082      }
   1083      empty->setLoopDepth(state.blockTable.get(0, bix)->loopDepth());
    1084      // We don't bother to set the block's ID, because the logic below does
    1085      // not depend on it. In any case the blocks will get renumbered at the end of
   1086      // UnrollLoops, so it doesn't matter what we choose here.  Unrolling
   1087      // still works even if we don't set the ID to anything.
   1088      MOZ_ASSERT(!state.blockTable.get(cix, bix));
   1089      state.blockTable.set(cix, bix, empty);
   1090    }
   1091  }
   1092 
    1093  // Copy, verbatim, the blocks in the original loop into our new copies.
   1094  // For each copy of the loop ..
   1095  for (uint32_t cix = 1; cix < unrollFactor; cix++) {
   1096    // For each block in the original copy ..
   1097    uint32_t vix = 0;
   1098    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1099      MBasicBlock* originalBlock = state.blockTable.get(0, bix);
   1100      MBasicBlock* clonedBlock = state.blockTable.get(cix, bix);
   1101 
   1102      // Clone the phi nodes, but don't update the mapper until all original
   1103      // arguments have been looked up.  This is required because the phi nodes
   1104      // collectively implement a parallel assignment; there is no implied
   1105      // sequentiality.
   1106      mozilla::Vector<std::pair<const MPhi*, MPhi*>, 16, SystemAllocPolicy>
   1107          mapperUpdates;
   1108      for (MPhiIterator phiIter(originalBlock->phisBegin());
   1109           phiIter != originalBlock->phisEnd(); phiIter++) {
   1110        const MPhi* originalPhi = *phiIter;
   1111        MPhi* clonedPhi =
   1112            MakeReplacementPhi(graph.alloc(), mapper, originalPhi);
   1113        if (!clonedPhi) {
   1114          return false;
   1115        }
   1116        clonedBlock->addPhi(clonedPhi);
   1117        MOZ_ASSERT(!valueTable.get(cix, vix));
   1118        valueTable.set(cix, vix, clonedPhi);
   1119        vix++;
   1120        if (!mapperUpdates.append(std::pair(originalPhi, clonedPhi))) {
   1121          return false;
   1122        }
   1123      }
   1124      for (auto& p : mapperUpdates) {
   1125        // Update the value mapper.  The `originalPhi` values must be part of
   1126        // the original loop body and so must already have a key in `mapper`.
   1127        mapper.update(p.first, p.second);
   1128      }
   1129 
   1130      // Cloning the instructions is simpler, since we can incrementally update
   1131      // the mapper.
   1132      for (MInstructionIterator insnIter(originalBlock->begin());
   1133           insnIter != originalBlock->end(); insnIter++) {
   1134        const MInstruction* originalInsn = *insnIter;
   1135        MInstruction* clonedInsn =
   1136            MakeReplacementInstruction(graph.alloc(), mapper, originalInsn);
   1137        if (!clonedInsn) {
   1138          return false;
   1139        }
   1140        clonedBlock->insertAtEnd(clonedInsn);
   1141        MOZ_ASSERT(!valueTable.get(cix, vix));
   1142        valueTable.set(cix, vix, clonedInsn);
   1143        vix++;
   1144        // Update the value mapper.  `originalInsn` must be part of the
   1145        // original loop body and so must already have a key in `mapper`.
   1146        mapper.update(originalInsn, clonedInsn);
   1147      }
   1148 
   1149      // Clone the block's predecessor array
   1150      MOZ_ASSERT(clonedBlock->numPredecessors() == 0);
   1151      for (uint32_t i = 0; i < originalBlock->numPredecessors(); i++) {
   1152        MBasicBlock* pred = originalBlock->getPredecessor(i);
   1153        if (!clonedBlock->appendPredecessor(pred)) {
   1154          return false;
   1155        }
   1156      }
   1157    }
   1158 
   1159    MOZ_ASSERT(vix == numValuesInOriginal);
   1160 
   1161    // Check we didn't mess up the valueTable ordering relative to the original
   1162    // (best effort assertion).
   1163    for (vix = 0; vix < numValuesInOriginal; vix++) {
   1164      MOZ_ASSERT(valueTable.get(cix, vix)->op() ==
   1165                 valueTable.get(0, vix)->op());
   1166    }
   1167 
   1168    // For cloned instructions that have a (load) dependency field, and that
   1169    // field points to an instruction in the original body, update the field so
   1170    // as to point to the equivalent instruction in the cloned body.
   1171    for (vix = 0; vix < numValuesInOriginal; vix++) {
   1172      MDefinition* clonedInsn = valueTable.get(cix, vix);
   1173      MDefinition* originalDep = clonedInsn->dependency();
   1174      if (originalDep) {
   1175        mozilla::Maybe<size_t> originalInsnIndex =
   1176            valueTable.findInRow(0, originalDep);
   1177        if (originalInsnIndex.isSome()) {
   1178          // The dependency is to an insn in the original body, so we need to
   1179          // remap it to this body.
   1180          MDefinition* clonedDep =
   1181              valueTable.get(cix, originalInsnIndex.value());
   1182          clonedInsn->setDependency(clonedDep);
   1183        }
   1184      }
   1185    }
   1186  }
   1187 
   1188 #ifdef JS_JITSPEW
   1189  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
   1190    DumpValueTable(valueTable, "VALUE TABLE (final)");
   1191  }
   1192 #endif
   1193 
   1194  // Fix up (remap) inter-block edges in both the original and copies, using
   1195  // uniform indexing as above (original = index 0, copies = indices >= 1):
   1196 
   1197  // For a block with copy-index N, inspect each edge.  Determine the role the
   1198  // edge-destination (target) plays in the original copy:
   1199  //
   1200  // * backedge
   1201  //     -> remap to header node of copy (N + 1) % unrollFactor
   1202  //        (so, generates a backedge in the last copy)
   1203  // * internal jump within the original body
   1204  //     -> remap to corresponding block in copy N
   1205  // * jump to after the original body
   1206  //     -> unchanged (it is an exiting edge)
   1207  // * jump to before the original body
   1208  //     -> this can't happen
   1209  //        (but if it did, it would be just another exit block)
   1210  auto RemapEdgeDestination =
   1211      [=, &state](uint32_t cix, MBasicBlock* oldDestination) -> MBasicBlock* {
   1212    // `oldDestination` is the target of a control-flow insn in the original
   1213    // loop.  Figure out what its replacement should be, for copy index `cix`
   1214    // (with `cix == 0` meaning the original copy).
   1215 
   1216    // If it is a back edge, remap to the header node of the next copy, with
   1217    // wraparound for the last copy.
   1218    if (oldDestination == state.blockTable.get(0, 0)) {
   1219      MOZ_ASSERT(oldDestination == originalHeader);
   1220      return state.blockTable.get((cix + 1) % unrollFactor, 0);
   1221    }
   1222 
   1223    // If it is to some other node within the original body, remap to the
   1224    // corresponding node in this copy.
   1225    for (uint32_t i = 1; i < numBlocksInOriginal; i++) {
   1226      if (oldDestination == state.blockTable.get(0, i)) {
   1227        return state.blockTable.get(cix, i);
   1228      }
   1229    }
   1230 
   1231    // The target of the edge is not in the original loop.  Either it's to
   1232    // after the loop (an exiting branch) or it is to before the loop, although
   1233    // that would be bizarre and probably illegal.  Either way, leave it
   1234    // unchanged.
   1235    return oldDestination;
   1236  };
   1237 
   1238  // Similarly, for an edge that claims to come from `oldSource` in the
   1239  // original loop, determine where it should come from if we imagine that it
   1240  // applies to copy index `cix`.
   1241  auto RemapEdgeSource = [=, &state](uint32_t cix,
   1242                                     MBasicBlock* oldSource) -> MBasicBlock* {
   1243    if (oldSource == state.blockTable.get(0, numBlocksInOriginal - 1)) {
   1244      // Source is backedge node in the original loop.  Remap to the backedge
   1245      // node in the previous iteration.
   1246      MOZ_ASSERT(oldSource == originalHeader->backedge());
   1247      return state.blockTable.get(cix == 0 ? (unrollFactor - 1) : (cix - 1),
   1248                                  numBlocksInOriginal - 1);
   1249    }
   1250    for (uint32_t i = 0; i < numBlocksInOriginal - 1; i++) {
   1251      if (oldSource == state.blockTable.get(0, i)) {
   1252        // Source is a non-backedge node in the original loop.  Remap it to
   1253        // the equivalent node in this copy.
   1254        return state.blockTable.get(cix, i);
   1255      }
   1256    }
   1257    // Source is not in the original loop.  Leave unchanged.
   1258    return oldSource;
   1259  };
   1260 
   1261  // Work through all control-flow instructions in all copies and remap their
   1262  // edges.
   1263  // For each copy of the loop, including the original ..
   1264  for (uint32_t cix = 0; cix < unrollFactor; cix++) {
   1265    // For each block in the copy ..
   1266    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1267      MBasicBlock* b = state.blockTable.get(cix, bix);
   1268      MControlInstruction* lastI = b->lastIns();
   1269      if (lastI->isGoto()) {
   1270        MGoto* insn = lastI->toGoto();
   1271        insn->setTarget(RemapEdgeDestination(cix, insn->target()));
   1272      } else if (lastI->isTest()) {
   1273        MTest* insn = lastI->toTest();
   1274        insn->setIfTrue(RemapEdgeDestination(cix, insn->ifTrue()));
   1275        insn->setIfFalse(RemapEdgeDestination(cix, insn->ifFalse()));
   1276      } else {
   1277        // The other existing control instructions are MTableSwitch,
   1278        // MUnreachable, MWasmTrap, MWasmReturn, and MWasmReturnVoid.
   1279        // MTableSwitch is ruled out above.  The others end a block without a
   1280        // successor, which means that they can't be loop blocks
   1281        // (MarkLoopBlocks marks blocks that are predecessors of the back
   1282        // edge.)  So we don't expect to see them here.
   1283        MOZ_CRASH();
   1284      }
   1285    }
   1286  }
   1287 
    1288  // Also we need to remap within each block's predecessors vector.
   1289  // We have to do this backwards to stop RemapEdgeSource from asserting.
   1290  // For each copy of the loop, including the original ..
   1291  for (int32_t cix = unrollFactor - 1; cix >= 0; cix--) {
   1292    // For each block in the copy ..
   1293    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1294      MBasicBlock* block = state.blockTable.get(cix, bix);
   1295      for (uint32_t i = 0; i < block->numPredecessors(); i++) {
   1296        MBasicBlock* oldPred = block->getPredecessor(i);
   1297        MBasicBlock* newPred = RemapEdgeSource(cix, oldPred);
   1298        block->setPredecessor(i, newPred);
   1299      }
   1300    }
   1301  }
   1302 
   1303  // Clean up the header node copies (but not for the original copy) since they
   1304  // all still claim to have two predecessors, the first of which is the entry
   1305  // edge from outside the loop.  This is no longer true, so delete that
   1306  // predecessor.  Remove the corresponding Phi args too.  This leaves
   1307  // one-argument phis, meh.
   1308  for (uint32_t cix = 1; cix < unrollFactor; cix++) {
   1309    MBasicBlock* block = state.blockTable.get(cix, 0);
   1310    MOZ_ASSERT(block->numPredecessors() == 2);  // from AnalyzeLoop
   1311    block->erasePredecessor(0);
   1312    MOZ_ASSERT(block->numPredecessors() == 1);  // duh
   1313    for (MPhiIterator phiIter(block->phisBegin()); phiIter != block->phisEnd();
   1314         phiIter++) {
   1315      MPhi* phi = *phiIter;
   1316      MOZ_ASSERT(phi->numOperands() == 2);  // from AnalyzeLoop
   1317      phi->removeOperand(0);
   1318      MOZ_ASSERT(phi->numOperands() == 1);
   1319    }
   1320  }
   1321 
   1322  // Fix up the phi nodes in the original header; the previous step did not do
   1323  // that.  Specifically, the second arg in each phi -- which are values that
   1324  // flow in from the backedge block -- need to be the ones from the last body
   1325  // copy; but currently they are from the original copy.  At this point, the
   1326  // `mapper` should be able to tell us the correct value.
   1327  for (MPhiIterator phiIter(originalHeader->phisBegin());
   1328       phiIter != originalHeader->phisEnd(); phiIter++) {
   1329    MPhi* phi = *phiIter;
   1330    MOZ_ASSERT(phi->numOperands() == 2);  // from AnalyzeLoop
   1331    // phi arg[0] is defined before the loop.  Ensured by AnalyzeLoop.
   1332    MOZ_ASSERT(!state.blockTable.rowContains(0, phi->getOperand(0)->block()));
   1333    // phi arg[1] is almost always defined in the loop (hence it is a
   1334    // loop-carried value) but is very occasionally defined before the loop
   1335    // (so it is unrelated to the loop).  Proceed with maximum paranoia.
   1336    MDefinition* operand1 = phi->getOperand(1);
   1337    MDefinition* replacement = mapper.lookup(operand1);
   1338    // If we didn't find a replacement (unlikely but possible) then
   1339    // `operand1` must be defined before the loop.
   1340    MOZ_ASSERT((replacement == nullptr) ==
   1341               !state.blockTable.rowContains(0, operand1->block()));
   1342    if (replacement) {
   1343      phi->replaceOperand(1, replacement);
   1344    }
   1345  }
   1346 
   1347  // Now we need to fix up the exit target blocks, by adding sources of exiting
   1348  // edges to their `predecessors_` vectors and updating their phi nodes
   1349  // accordingly.  Recall that those phi nodes were previously installed in
   1350  // these blocks by AddClosingPhisForLoop.
   1351  //
   1352  // Currently the exit target blocks mention only predecessors in the original
   1353  // loop, but they also need to mention equivalent predecessors in each of the
   1354  // loop copies.
   1355  for (uint32_t cix = 0; cix < unrollFactor; cix++) {
   1356    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1357      MBasicBlock* block = state.blockTable.get(cix, bix);
   1358      MControlInstruction* lastIns = block->lastIns();
   1359      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
   1360        MBasicBlock* succ = lastIns->getSuccessor(i);
   1361        if (!state.exitTargetBlocks.contains(succ)) {
   1362          // This isn't an exiting edge -- not interesting.
   1363          continue;
   1364        }
   1365        // If `cix` is zero (the original body), we expect `succ` to already
   1366        // list `block` as a predecessor.  If `cix` is nonzero (a body copy)
   1367        // then we expect `succ` *not* to list it, and we must add it.
   1368        mozilla::DebugOnly<bool> found = false;
   1369        for (uint32_t j = 0; j < succ->numPredecessors(); j++) {
   1370          if (block == succ->getPredecessor(j)) {
   1371            found = true;
   1372            break;
   1373          }
   1374        }
   1375        MOZ_ASSERT((cix == 0) == found);
   1376        if (cix > 0) {
   1377          if (!succ->appendPredecessor(block)) {
   1378            return false;
   1379          }
   1380          for (MPhiIterator phiIter(succ->phisBegin());
   1381               phiIter != succ->phisEnd(); phiIter++) {
   1382            MPhi* phi = *phiIter;
   1383            // "+ 1" because we just added a predecessor
   1384            MOZ_ASSERT(phi->numOperands() + 1 == succ->numPredecessors());
   1385            MDefinition* exitingValue = phi->getOperand(0);
   1386            MOZ_ASSERT(state.exitingValues.contains(exitingValue));
   1387            // `exitingValue` is defined in the original loop body.  We need to
   1388            // find the equivalent value in copy `cix` so we can add it as an
   1389            // argument to the phi.  This bit is the one-and-only reason for
   1390            // `valueTable`s existence.
   1391            mozilla::Maybe<size_t> colIndex =
   1392                valueTable.findInRow(0, exitingValue);
   1393            MOZ_ASSERT(colIndex.isSome());  // We must find it!
   1394            MDefinition* equivValue = valueTable.get(cix, colIndex.value());
   1395            if (!phi->addInputFallible(equivValue)) {
   1396              return false;
   1397            }
   1398          }
   1399        }
   1400      }
   1401    }
   1402  }
   1403 
   1404  // At this point the control flow is correct.  But we may now have critical
   1405  // edges where none existed before.  Consider a loop {Block1, Block2} with an
   1406  // exit target Block9:
   1407  //
   1408  //   Block1
   1409  //     stuff1
   1410  //     Goto Block2
   1411  //   Block2
   1412  //     stuff2
   1413  //     Test true->Block1(== continue) false->Block9(== break)
   1414  //   ...
   1415  //   Block9
   1416  //     this is a loop-exit target
   1417  //     preds = Block2
   1418  //
   1419  // After factor-of-2 unrolling:
   1420  //
   1421  //   Block1
   1422  //     stuff1
   1423  //     Goto Block2
   1424  //   Block2
   1425  //     stuff2
   1426  //     Test true->Block3(== continue) false->Block9(== break)
   1427  //   Block3
   1428  //     stuff3
   1429  //     Goto Block4
   1430  //   Block4
   1431  //     stuff4
   1432  //     Test true->Block1(== continue) false->Block9(== break)
   1433  //   ...
   1434  //   Block9
   1435  //     preds = Block2, Block4
   1436  //
   1437  // Now the exiting edges (Block2 to Block9 and Block4 to Block9) are critical.
   1438  //
   1439  // The obvious fix is to insert a new block on each exiting edge.
   1440 
   1441  mozilla::Vector<MBasicBlock*, 16, SystemAllocPolicy> splitterBlocks;
   1442  for (uint32_t cix = 0; cix < unrollFactor; cix++) {
   1443    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1444      MBasicBlock* block = state.blockTable.get(cix, bix);
   1445      // Inspect all successors of `block`.  If any are exiting edges, create a
   1446      // new block and route through that instead.
   1447      MControlInstruction* lastIns = block->lastIns();
   1448      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
   1449        MBasicBlock* succ = lastIns->getSuccessor(i);
   1450        if (!state.exitTargetBlocks.contains(succ)) {
   1451          // This isn't an exiting edge -- not interesting.
   1452          continue;
   1453        }
   1454        // Invent a new block.
   1455        MBasicBlock* splitter =
   1456            MBasicBlock::New(graph, info, block, MBasicBlock::Kind::SPLIT_EDGE);
   1457        if (!splitter || !splitterBlocks.append(splitter)) {
   1458          return false;
   1459        }
   1460        splitter->setLoopDepth(succ->loopDepth());
   1461        // As with empty-block creation earlier in this routine, we don't
   1462        // bother to set the block's ID since none of the logic depends on it,
   1463        // and it will eventually be renumbered anyway by UnrollLoops.
   1464        MGoto* jump = MGoto::New(graph.alloc(), succ);
   1465        if (!jump) {
   1466          return false;
   1467        }
   1468        splitter->insertAtEnd(jump);
   1469        // For the target, fix up the `predecessor_` entry.
   1470        mozilla::DebugOnly<bool> found = false;
   1471        for (uint32_t j = 0; j < succ->numPredecessors(); j++) {
   1472          if (succ->getPredecessor(j) == block) {
   1473            succ->setPredecessor(j, splitter);
   1474            found = true;
   1475            break;
   1476          }
   1477        }
   1478        // Finally, reroute the jump to `succ` via `splitter`.
   1479        lastIns->replaceSuccessor(i, splitter);
   1480        MOZ_ASSERT(found);
   1481      }
   1482    }
   1483  }
   1484 
   1485  // Now the control flow is correct *and* we don't have any critical edges.
   1486  // Fix up the `successorWithPhis_` and `positionInPhiSuccessor_` fields in
   1487  // both the unrolled loop and the splitter blocks, thusly:
   1488  //
   1489  // For all blocks `B` in the unrolled loop, and for all splitter blocks, fix
   1490  // up the `B.successorWithPhis_` and `B.positionInPhiSuccessor_` fields.
   1491  // Each block may have at maximum one successor that has phi nodes, in which
   1492  // case `B.successorWithPhis_` should point at that block.  And
   1493  // `B.positionInPhiSuccessor_` should indicate the index that successor's
   1494  // `predecessors_` vector, of B.
   1495  {
   1496    // Make up a list of blocks to fix up, then process them.
   1497    mozilla::Vector<MBasicBlock*, 32 + 16, SystemAllocPolicy> workList;
   1498    for (uint32_t cix = 0; cix < unrollFactor; cix++) {
   1499      for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1500        MBasicBlock* block = state.blockTable.get(cix, bix);
   1501        if (!workList.append(block)) {
   1502          return false;
   1503        }
   1504      }
   1505    }
   1506    for (MBasicBlock* block : splitterBlocks) {
   1507      if (!workList.append(block)) {
   1508        return false;
   1509      }
   1510    }
   1511    // Now process them.
   1512    for (MBasicBlock* block : workList) {
   1513      // Inspect `block`s successors.
   1514      MBasicBlock* succWithPhis = nullptr;
   1515      MControlInstruction* lastIns = block->lastIns();
   1516      // Determine `succWithPhis`, if one exists.
   1517      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
   1518        MBasicBlock* succ = lastIns->getSuccessor(i);
   1519        if (succ->phisEmpty()) {
   1520          continue;
   1521        }
   1522        MOZ_ASSERT(succ);
   1523        // `succ` is a successor to `block` that has phis.  So we can't already
   1524        // have seen a (different) successor.
   1525        if (!succWithPhis) {
   1526          succWithPhis = succ;
   1527        } else if (succWithPhis == succ) {
   1528          // ignore duplicate successors
   1529        } else {
   1530          // More than one successor with phis.  Structural invariants
   1531          // invalidated.  Give up.
   1532          MOZ_CRASH();
   1533        }
   1534      }
   1535      // Annotate `block`.
   1536      if (!succWithPhis) {
   1537        block->setSuccessorWithPhis(nullptr, 0);
   1538      } else {
   1539        MOZ_ASSERT(!succWithPhis->phisEmpty());
   1540        // Figure out which of `succWithPhis`s predecessors `block` is.
   1541        // Should we assert `succWithPhis->predecessors_` is duplicate-free?
   1542        uint32_t predIx = UINT32_MAX;  // invalid
   1543        for (uint32_t i = 0; i < succWithPhis->numPredecessors(); i++) {
   1544          if (succWithPhis->getPredecessor(i) == block) {
   1545            predIx = i;
   1546            break;
   1547          }
   1548        }
   1549        MOZ_ASSERT(predIx != UINT32_MAX);
   1550        block->setSuccessorWithPhis(succWithPhis, predIx);
   1551      }
   1552    }
   1553  }
   1554 
   1555  // Find and remove the MWasmInterruptCheck in all but the last iteration.
   1556  // We don't assume that there is an interrupt check, since
   1557  // wasm::FunctionCompiler::fillArray, at least, generates a loop with no
   1558  // check.
   1559  for (uint32_t cix = 0; cix < unrollFactor - 1; cix++) {
   1560    for (uint32_t vix = 0; vix < numValuesInOriginal; vix++) {
   1561      MDefinition* ins = valueTable.get(cix, vix);
   1562      if (!ins->isWasmInterruptCheck()) {
   1563        continue;
   1564      }
   1565      MWasmInterruptCheck* ic = ins->toWasmInterruptCheck();
   1566      ic->block()->discard(ic);
   1567      valueTable.set(cix, vix, nullptr);
   1568    }
   1569  }
   1570 
   1571  // We're all done with unrolling.  If peeling was also requested, make a
   1572  // minor modification to the unrolled loop: change the backedge so that,
   1573  // instead of jumping to the first body copy, make it jump to the second body
   1574  // copy.  This leaves the first body copy in a
   1575  // will-only-ever-be-executed-once state, hence achieving peeling.
   1576  if (state.doPeeling()) {
   1577    MBasicBlock* backedge =
   1578        state.blockTable.get(unrollFactor - 1, numBlocksInOriginal - 1);
   1579    MBasicBlock* header0 = state.blockTable.get(0, 0);
   1580    MBasicBlock* header1 = state.blockTable.get(1, 0);
   1581    MOZ_ASSERT(header0 == originalHeader);
   1582 
   1583    // Fix up the back edge, to make it jump to the second copy of the loop
   1584    // body
   1585    MOZ_ASSERT(backedge->hasLastIns());  // should always hold (AnalyzeLoop)
   1586    MOZ_ASSERT(backedge->lastIns()->isGoto());  // AnalyzeLoop ensures
   1587    MGoto* backedgeG = backedge->lastIns()->toGoto();
   1588    MOZ_ASSERT(backedgeG->target() == header0);  // AnalyzeLoop ensures
   1589    backedgeG->setTarget(header1);
   1590    header0->clearLoopHeader();
   1591    header1->setLoopHeader();
   1592    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1593      MBasicBlock* originalBlock = state.blockTable.get(0, bix);
   1594      MOZ_ASSERT(originalBlock->loopDepth() > 0);
   1595      originalBlock->setLoopDepth(originalBlock->loopDepth() - 1);
   1596    }
   1597 
   1598    // Fix up the preds of the original header
   1599    MOZ_ASSERT(header0->numPredecessors() == 2);
   1600    MOZ_ASSERT(header0->getPredecessor(1) == backedge);
   1601    header0->erasePredecessor(1);
   1602 
   1603    // Fix up the preds of the new header
   1604    MOZ_ASSERT(header1->numPredecessors() == 1);
   1605    if (!header1->appendPredecessor(backedge)) {
   1606      return false;
   1607    }
   1608 
   1609    // Fix up phis of both headers, by moving the second operand of each phi of
   1610    // `header0` to be the second operand of the corresponding phi in
   1611    // `header1`.
   1612    MPhiIterator phiIter0(header0->phisBegin());
   1613    MPhiIterator phiIter1(header1->phisBegin());
   1614    while (true) {
   1615      bool finished0 = phiIter0 == header0->phisEnd();
   1616      mozilla::DebugOnly<bool> finished1 = phiIter1 == header1->phisEnd();
   1617      // The two blocks must have the same number of phis.
   1618      MOZ_ASSERT(finished0 == finished1);
   1619      if (finished0) {
   1620        break;
   1621      }
   1622      MPhi* phi0 = *phiIter0;
   1623      MPhi* phi1 = *phiIter1;
   1624      MOZ_ASSERT(phi0->numOperands() == 2);
   1625      MOZ_ASSERT(phi1->numOperands() == 1);
   1626      MDefinition* phi0arg1 = phi0->getOperand(1);
   1627      phi0->removeOperand(1);
   1628      if (!phi1->addInputFallible(phi0arg1)) {
   1629        return false;
   1630      }
   1631      phiIter0++;
   1632      phiIter1++;
   1633    }
   1634 
   1635    // Fix the succ-w-phis entry for `backEdge`.  In very rare circumstances,
   1636    // the loop may have no header phis, so this must be conditional.
   1637    if (backedge->successorWithPhis()) {
   1638      MOZ_ASSERT(backedge->successorWithPhis() == header0);
   1639      MOZ_ASSERT(backedge->positionInPhiSuccessor() == 1);
   1640      backedge->setSuccessorWithPhis(header1, 1);
   1641    }
   1642  }
   1643 
   1644 #ifdef JS_JITSPEW
   1645  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
   1646    DumpBlockTable(state.blockTable, "LOOP BLOCKS (final)");
   1647    Fprinter& printer(JitSpewPrinter());
   1648    printer.printf("<<<< SPLITTER BLOCKS\n");
   1649    for (MBasicBlock* block : splitterBlocks) {
   1650      DumpMIRBlock(printer, block, /*showDetails=*/true);
   1651    }
   1652    printer.printf(">>>>\n");
   1653  }
   1654 #endif
   1655 
   1656  // Add the new blocks to the graph.
   1657  {
   1658    // We want to add them to the end of the existing loop.
   1659    MBasicBlock* cursor = state.blockTable.get(0, numBlocksInOriginal - 1);
   1660    for (uint32_t cix = 1; cix < unrollFactor; cix++) {
   1661      for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
   1662        MBasicBlock* addMe = state.blockTable.get(cix, bix);
   1663        graph.insertBlockAfter(cursor, addMe);
   1664        cursor = addMe;
   1665      }
   1666    }
   1667    for (MBasicBlock* addMe : splitterBlocks) {
   1668      graph.insertBlockAfter(cursor, addMe);
   1669      cursor = addMe;
   1670    }
   1671  }
   1672 
   1673  // UnrollLoops will renumber the blocks and redo the dominator tree once all
   1674  // loops have been unrolled.
   1675 
   1676 #ifdef JS_JITSPEW
   1677  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
   1678    Fprinter& printer(JitSpewPrinter());
   1679    printer.printf("<<<< FUNCTION AFTER UNROLLING\n");
   1680    DumpMIRGraph(printer, graph, /*showDetails=*/true);
   1681    printer.printf(">>>>\n");
   1682  }
   1683 #endif
   1684 
   1685  return true;
   1686 }
   1687 
   1688 // =====================================================================
   1689 //
   1690 // FindInitialCandidates, which finds "initial candidates" for
   1691 // unrolling/peeling.  See SMDOC at the top of this file.
   1692 
   1693 struct InitialCandidate {
   1694  MBasicBlock* header;
   1695  uint32_t numBlocks;
   1696  float score;
   1697  bool valid;
   1698  InitialCandidate(MBasicBlock* header, uint32_t numBlocks)
   1699      : header(header), numBlocks(numBlocks), score(0.0), valid(true) {}
   1700 };
   1701 
   1702 using InitialCandidateVector =
   1703    mozilla::Vector<InitialCandidate, 64, SystemAllocPolicy>;
   1704 
// Scan `graph` for innermost loops and append a scored InitialCandidate for
// each plausible (non-empty, non-OSR) one to `initialCandidates`, which must
// be empty on entry.  On return the candidates are sorted by ascending
// `score`, i.e. most-desirable-to-unroll first.  Returns false on OOM.
[[nodiscard]]
bool FindInitialCandidates(MIRGraph& graph,
                           InitialCandidateVector& initialCandidates) {
  // Find initial candidate loops, and place them in `initialCandidates`.
  MOZ_ASSERT(initialCandidates.empty());

  // First, using a single scan over the blocks, find blocks that are the
  // headers of innermost loops.  This uses the same logic to find innermost
  // loops as BacktrackingAllocator::init.
  //
  // The general idea is: we traverse the entire graph in RPO order, but ignore
  // all blocks except loop headers and loop backedge blocks.  At a header we
  // make a note of the backedge block it points at.  Because the loops are
  // properly nested, this means that we will ignore backedges for
  // non-innermost loops, and, so, when we do come to a block that matches
  // `backedge`, we know we've identified an innermost loop.
  //
  // At the same time, release-assert that the blocks are numbered
  // sequentially.  We need this for the loop-non-overlapping check below.
  mozilla::Vector<MBasicBlock*, 128, SystemAllocPolicy> initialHeaders;
  {
    MBasicBlock* backedge = nullptr;
    uint32_t expectedNextId = 0;
    for (auto rpoIter(graph.rpoBegin()), rpoIterEnd(graph.rpoEnd());
         rpoIter != rpoIterEnd; ++rpoIter) {
      MBasicBlock* block = *rpoIter;
      MOZ_RELEASE_ASSERT(block->id() == expectedNextId);
      expectedNextId++;

      if (block->isLoopHeader()) {
        // Note the backedge of the most recently seen header.  A later
        // header overwrites this, which is what skips non-innermost loops.
        backedge = block->backedge();
      }
      if (block == backedge) {
        // We reached the noted backedge, so its header is an innermost loop.
        if (!initialHeaders.append(block->loopHeaderOfBackedge())) {
          return false;
        }
      }
    }
  }

  // Now use MarkLoopBlocks to check each potential loop in more detail, and,
  // for each non-empty, non-OSR one, create an entry in initialCandidates for
  // it.
  for (MBasicBlock* header : initialHeaders) {
    MOZ_ASSERT(header->isLoopHeader());

    bool hasOsrEntry;
    size_t numBlocks = MarkLoopBlocks(graph, header, &hasOsrEntry);
    if (numBlocks == 0) {
      // Bogus; skip.  (MarkLoopBlocks reported an empty loop.)
      continue;
    }
    // Unmark immediately; the OSR-entry check below may still reject this
    // loop, but the marks are cleared in either case.
    UnmarkLoopBlocks(graph, header);
    if (hasOsrEntry) {
      // Unhandled; skip.
      continue;
    }

    // `numBlocks` is a size_t, but InitialCandidate stores a uint32_t.
    MOZ_RELEASE_ASSERT(numBlocks <= UINT32_MAX);
    if (!initialCandidates.append(InitialCandidate(header, numBlocks))) {
      return false;
    }
  }

  // Sort the loops by header ID.
  std::sort(initialCandidates.begin(), initialCandidates.end(),
            [](const InitialCandidate& cand1, const InitialCandidate& cand2) {
              return cand1.header->id() < cand2.header->id();
            });

  // Now we can conveniently assert non-overlappingness.  Note that this
  // depends on the property that each loop is a sequential sequence of block
  // IDs, as we release-asserted above.  Presence of an overlap might indicate
  // that a non-innermost loop made it this far.
  for (size_t i = 1; i < initialCandidates.length(); i++) {
    MOZ_RELEASE_ASSERT(initialCandidates[i - 1].header->id() +
                           initialCandidates[i - 1].numBlocks <=
                       initialCandidates[i].header->id());
  }

  // Heuristics: calculate a "score" for each loop, indicating our desire to
  // unroll it.  **Lower** values indicate a higher desirability for unrolling.
  //
  // A loop gets a lower score if:
  // * it is small
  // * it has few blocks
  // * it is deeply nested (currently ignored)

  for (InitialCandidate& cand : initialCandidates) {
    uint32_t nBlocks = cand.numBlocks;
    uint32_t nInsns = 0;
    uint32_t nPhis = 0;

    // Visit each block in the loop, counting instructions and phis.  The
    // loop's blocks run contiguously in RPO from the header to the backedge.
    mozilla::DebugOnly<uint32_t> numBlocksVisited = 0;
    const MBasicBlock* backedge = cand.header->backedge();

    for (auto i(graph.rpoBegin(cand.header)); /**/; ++i) {
      MOZ_ASSERT(i != graph.rpoEnd(),
                 "UnrollLoops: loop blocks overrun graph end");
      MBasicBlock* block = *i;
      numBlocksVisited++;
      for (MInstructionIterator iter(block->begin()); iter != block->end();
           iter++) {
        nInsns++;
      }
      for (MPhiIterator iter(block->phisBegin()); iter != block->phisEnd();
           iter++) {
        nPhis++;
      }

      if (block == backedge) {
        break;
      }
    }

    MOZ_ASSERT(numBlocksVisited == nBlocks);

    // Weighting: blocks cost 10x, instructions 2x, phis 1x.
    cand.score =
        10.0 * float(nBlocks) + 2.0 * float(nInsns) + 1.0 * float(nPhis);
  }

  // Sort the loops by `score`, so that the most desirable candidates come
  // first.
  std::sort(initialCandidates.begin(), initialCandidates.end(),
            [](const InitialCandidate& cand1, const InitialCandidate& cand2) {
              return cand1.score < cand2.score;
            });

  return true;
}
   1835 
   1836 // =====================================================================
   1837 //
   1838 // UnrollLoops, the top level of the unroller.
   1839 
   1840 bool UnrollLoops(const MIRGenerator* mir, MIRGraph& graph, bool* changed) {
   1841  *changed = false;
   1842 
   1843 #ifdef JS_JITSPEW
   1844  if (JitSpewEnabled(JitSpew_Unroll)) {
   1845    JitSpew(JitSpew_Unroll, "BEGIN UnrollLoops");
   1846  }
   1847 #endif
   1848 
   1849  // Make up a vector of initial candidates, which are innermost loops only,
   1850  // and not too big.
   1851 
   1852  InitialCandidateVector initialCandidates;
   1853  if (!FindInitialCandidates(graph, initialCandidates)) {
   1854    return false;
   1855  }
   1856 
   1857 #ifdef JS_JITSPEW
   1858  if (JitSpewEnabled(JitSpew_Unroll)) {
   1859    for (const InitialCandidate& cand : initialCandidates) {
   1860      MOZ_ASSERT(cand.valid);
   1861      JitSpew(JitSpew_Unroll, "   initial cand:  score=%-5.1f  blocks=%u-%u",
   1862              cand.score, cand.header->id(),
   1863              cand.header->id() + cand.numBlocks - 1);
   1864    }
   1865  }
   1866 #endif
   1867 
   1868  // Perform a more expensive analysis of the candidates, leading to a further
   1869  // reduction in the set.  For those remaining, an UnrollState is created and
   1870  // initialized.  Also a decision is made whether to unroll only, peel only,
   1871  // or both.
   1872 
   1873  mozilla::Vector<UnrollState, 32, SystemAllocPolicy> unrollStates;
   1874 
   1875  for (const InitialCandidate& cand : initialCandidates) {
   1876    if (cand.score > InitialCandidateThreshold) {
   1877      // Give up.  We just sorted them by `score`, so the rest will be even
   1878      // more undesirable for unrolling.
   1879      break;
   1880    }
   1881 
   1882    // Park the loop's blocks in a vector, so we can hand it off to
   1883    // AnalyzeLoops for examination.  This is the place where the original
   1884    // block sequence, as seen by the entire rest of the machinery, is fixed.
   1885    BlockVector originalBlocks;
   1886 
   1887    const MBasicBlock* backedge = cand.header->backedge();
   1888 
   1889    for (auto blockIter(graph.rpoBegin(cand.header)); /**/; ++blockIter) {
   1890      MOZ_ASSERT(blockIter != graph.rpoEnd());
   1891      MBasicBlock* block = *blockIter;
   1892      if (!originalBlocks.append(block)) {
   1893        return false;
   1894      }
   1895      if (block == backedge) {
   1896        break;
   1897      }
   1898    }
   1899 
   1900    // Analyze the candidate in detail.  If unrolling and/or peeling is
   1901    // approved, this also collects up for us, the loop's target block set and
   1902    // exiting value set.
   1903    BlockSet exitTargetBlocks;
   1904    ValueSet exitingValues;
   1905 
   1906    AnalysisResult res =
   1907        AnalyzeLoop(originalBlocks, &exitTargetBlocks, &exitingValues);
   1908 
   1909 #ifdef JS_JITSPEW
   1910    if (JitSpewEnabled(JitSpew_Unroll)) {
   1911      MOZ_ASSERT(cand.valid);
   1912      JitSpew(JitSpew_Unroll, "  %13s:  score=%-5.1f  blocks=%u-%u",
   1913              Name_of_AnalysisResult(res), cand.score, cand.header->id(),
   1914              cand.header->id() + cand.numBlocks - 1);
   1915    }
   1916 #endif
   1917 
   1918    switch (res) {
   1919      case AnalysisResult::OOM:
   1920        return false;
   1921      case AnalysisResult::Peel:
   1922      case AnalysisResult::Unroll:
   1923      case AnalysisResult::PeelAndUnroll:
   1924        break;
   1925      case AnalysisResult::BadInvariants:
   1926        MOZ_CRASH("js::jit::UnrollLoops: unexpected incoming MIR");
   1927      case AnalysisResult::TooComplex:
   1928      case AnalysisResult::Uncloneable:
   1929      case AnalysisResult::TooLarge:
   1930      case AnalysisResult::Unsuitable:
   1931        // Ignore this loop
   1932        continue;
   1933      default:
   1934        MOZ_CRASH();
   1935    }
   1936 
   1937    // The candidate has been accepted for unrolling and/or peeling.  Make an
   1938    // initial UnrollState for it and add it to our collection thereof.
   1939 
   1940    UnrollMode mode;
   1941    if (res == AnalysisResult::Peel) {
   1942      mode = UnrollMode::Peel;
   1943    } else if (res == AnalysisResult::Unroll) {
   1944      mode = UnrollMode::Unroll;
   1945    } else if (res == AnalysisResult::PeelAndUnroll) {
   1946      mode = UnrollMode::PeelAndUnroll;
   1947    } else {
   1948      MOZ_CRASH();
   1949    }
   1950 
   1951    if (!unrollStates.emplaceBack(mode)) {
   1952      return false;
   1953    }
   1954    UnrollState* state = &unrollStates.back();
   1955    if (!state->exitTargetBlocks.copy(exitTargetBlocks) ||
   1956        !state->exitingValues.copy(exitingValues)) {
   1957      return false;
   1958    }
   1959 
   1960    // Ignoring peeling, how many times should we unroll a loop?
   1961    // For example, if the original loop was `loop { B; }` then with
   1962    // `basicUnrollingFactor = 3`, the unrolled loop will be `loop { B; B; B;
   1963    // }`.  If peeling is also requested then we will unroll one more time than
   1964    // this, giving overall result `B; loop { B; B; B; }`.
   1965    uint32_t basicUnrollingFactor = JS::Prefs::wasm_unroll_factor();
   1966    if (basicUnrollingFactor < 2) {
   1967      // It needs to be at least 2, else we're not unrolling at all.
   1968      basicUnrollingFactor = 2;
   1969    } else if (basicUnrollingFactor > 8) {
   1970      // Unrolling gives rapidly diminishing marginal gains.  Even beyond 4
   1971      // there's little benefit to be had.
   1972      basicUnrollingFactor = 8;
   1973    }
   1974 
   1975    // Decide on the total number of duplicates of the loop body we'll need
   1976    // (including the original).
   1977    uint32_t unrollingFactor;
   1978    if (res == AnalysisResult::Peel) {
   1979      // 1 for the peeled iteration,
   1980      // 1 for the loop (since we are not unrolling it)
   1981      unrollingFactor = 1 + 1;
   1982    } else if (res == AnalysisResult::Unroll) {
   1983      // No peeled iteration, we unroll only
   1984      unrollingFactor = 0 + basicUnrollingFactor;
   1985    } else if (res == AnalysisResult::PeelAndUnroll) {
   1986      // 1 peeled iteration, and we unroll the loop
   1987      unrollingFactor = 1 + basicUnrollingFactor;
   1988    } else {
   1989      MOZ_CRASH();
   1990    }
   1991 
   1992    // Set up UnrollState::blockTable and copy in the initial loop blocks.
   1993    if (!state->blockTable.init(unrollingFactor, originalBlocks.length())) {
   1994      return false;
   1995    }
   1996    for (uint32_t bix = 0; bix < originalBlocks.length(); bix++) {
   1997      state->blockTable.set(0, bix, originalBlocks[bix]);
   1998    }
   1999  }
   2000 
   2001  // Now, finally, we have an initialized vector of UnrollStates ready to go.
   2002  // After this point, `originalBlocks` is no longer used.  Also, because of
   2003  // the checking done in AnalyzeLoop, we know the transformations can't fail
   2004  // (apart from OOMing).
   2005 
   2006  // Add "loop-closing phis" for values defined in the loop and used
   2007  // afterwards.  See comments on AddClosingPhisForLoop.
   2008  for (const UnrollState& state : unrollStates) {
   2009    if (!AddClosingPhisForLoop(graph.alloc(), state)) {
   2010      return false;
   2011    }
   2012  }
   2013 
   2014  // Actually do the unrolling and/or peeling.
   2015  uint32_t numLoopsPeeled = 0;
   2016  uint32_t numLoopsUnrolled = 0;
   2017  uint32_t numLoopsPeeledAndUnrolled = 0;
   2018  for (UnrollState& state : unrollStates) {
   2019    if (!UnrollAndOrPeelLoop(graph, state)) {
   2020      return false;
   2021    }
   2022    // Update stats.
   2023    switch (state.mode) {
   2024      case UnrollMode::Peel:
   2025        numLoopsPeeled++;
   2026        break;
   2027      case UnrollMode::Unroll:
   2028        numLoopsUnrolled++;
   2029        break;
   2030      case UnrollMode::PeelAndUnroll:
   2031        numLoopsPeeledAndUnrolled++;
   2032        break;
   2033      default:
   2034        MOZ_CRASH();
   2035    }
   2036  }
   2037 
   2038  // Do function-level fixups, if anything changed.
   2039  if (!unrollStates.empty()) {
   2040    RenumberBlocks(graph);
   2041    ClearDominatorTree(graph);
   2042    if (!BuildDominatorTree(mir, graph)) {
   2043      return false;
   2044    }
   2045  }
   2046 
   2047  uint32_t numLoopsChanged =
   2048      numLoopsPeeled + numLoopsUnrolled + numLoopsPeeledAndUnrolled;
   2049 
   2050 #ifdef JS_JITSPEW
   2051  if (JitSpewEnabled(JitSpew_Unroll)) {
   2052    if (numLoopsChanged == 0) {
   2053      JitSpew(JitSpew_Unroll, "END   UnrollLoops");
   2054    } else {
   2055      JitSpew(JitSpew_Unroll,
   2056              "END UnrollLoops, %u processed (P=%u, U=%u, P&U=%u)",
   2057              numLoopsChanged, numLoopsPeeled, numLoopsUnrolled,
   2058              numLoopsPeeledAndUnrolled);
   2059    }
   2060  }
   2061 #endif
   2062 
   2063  *changed = numLoopsChanged > 0;
   2064  return true;
   2065 }
   2066 
   2067 }  // namespace jit
   2068 }  // namespace js