UnrollLoops.cpp (78818B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "jit/UnrollLoops.h" 8 9 #include "mozilla/Maybe.h" 10 11 #include <stdio.h> 12 13 #include "jit/DominatorTree.h" 14 #include "jit/IonAnalysis.h" 15 #include "jit/MIRGraph.h" 16 17 namespace js { 18 namespace jit { 19 20 // [SMDOC] Loop unroller implementation summary 21 // 22 // This is a simple loop unroller, intended (initially at least) to handle only 23 // wasm loops, with the aims of amortizing the per-iteration interrupt check 24 // cost, and of lifting an initial iteration outside the loop so as to 25 // facilitate subsequent optimizations. Unrolling and peeling can be selected 26 // independently, so the available choices are: peeling only, unrolling only, 27 // or both peeling and unrolling. 28 // 29 // The flow of control (for a single function) is roughly: 30 // 31 // (0) The top level function is UnrollLoops. 32 // 33 // (1) UnrollLoops calls FindInitialCandidates to identify "initial candidate" 34 // loops in the function. These are innermost loops. Non-innermost loops 35 // cannot be unrolled/peeled. 36 // 37 // (2) FindInitialCandidates counts blocks and value defs in each initial 38 // candidate, creating an initial score which increases with loop size. 39 // 40 // (3) Back in UnrollLoops, initial candidates whose initial score does not 41 // exceed InitialCandidateThreshold proceed to the next stage in the 42 // process: they are passed to function AnalyzeLoop. 43 // 44 // (4) AnalyzeLoop, as called from UnrollLoops, checks many structural 45 // invariants on each loop, since the core unroll machinery depends on 46 // those invariants. 
It also makes a final determination for each loop, as 47 // to whether the loop should be peeled, unrolled, peeled and unrolled, or 48 // be left unchanged. In the latter case it returns one of a few reasons 49 // why the loop is to be left unchanged. 50 // 51 // (5) AnalyzeLoop also identifies, for a loop, its basic blocks, the set of 52 // blocks it jumps to when exiting, and the values defined in the loop 53 // which are used afterwards. 54 // 55 // (6) Back in UnrollLoops, we now know which loops are suitable for 56 // processing, what processing is to happen to them, and what their blocks 57 // are. This is used to construct and initialize a `struct UnrollState` 58 // for each loop candidate that made it this far. 59 // 60 // (7) Before a loop can be unrolled, single-argument phi nodes need to be 61 // added to its exit target blocks, to "catch" values defined in the loop 62 // that are used afterwards. This is a limited conversion into what is 63 // called "Loop-Closed SSA form" in the GCC/Clang literature. UnrollLoops 64 // calls AddClosingPhisForLoop to get this done. This is done for all 65 // loops before proceeding further. 66 // 67 // (8) Finally, we get to the core of the unrolling: the loops are handed to 68 // UnrollAndOrPeelLoop in turn. This has extensive comments; a summary 69 // of the actions is: 70 // - blocks, phis and normal instructions are cloned so as to create 71 // the extra loop body copies. 72 // - the new blocks are added to the loop's UnrollState::blockTable. 73 // - during cloning, value uses are remapped to the correct iteration 74 // using an MDefinitionRemapper. 75 // - also during cloning, the new values are parked in a ValueTable for 76 // later reference. 77 // - control flow edges and predecessor vectors in the new and original 78 // block sets are fixed up, using RemapEdgeSource and 79 // RemapEdgeDestination. 80 // - header block phi nodes are fixed up in both the body copies 81 // and the original. 
82 // - new exiting blocks are created, so as to maintain the invariant 83 // that there are no critical edges. 84 // - for exit target blocks, both their predecessor arrays and phi nodes 85 // (as installed by AddClosingPhisForLoop) are augmented to handle 86 // the new exit edges. 87 // - Wasm interrupt checks in all but the last body copy are nop'd out. 88 // - If peeling is required, the back edge of the unrolled loop is changed 89 // so it points at the second body copy, not the first (the original). 90 // - Finally, the new blocks are installed in the MIRGraph. 91 // 92 // (9) Back in UnrollLoops, once all loops have been processed, 93 // some function-level fixups are done: 94 // - the blocks are renumbered 95 // - the dominator tree is rebuilt 96 // 97 // UnrollLoops now returns to OptimizeMIR, handing back an indication of how 98 // many loops were processed. OptimizeMIR will tidy up the resulting function 99 // by re-running GVN, empty block removal, and possibly other transformations. 100 101 // The core logic in UnrollAndOrPeelLoop is heavily commented but still 102 // nearly incomprehensible. For debugging and modification it is helpful to 103 // use the Dump{Block,Value}Table* routines provided. 104 105 // The logic below functions without reference to block IDs or value IDs. 106 107 // Much of the logic uses simple scans over vectors (classes BlockVector, 108 // Matrix, SimpleSet, MDefinitionRemapper) rather than more complex data 109 // structures. The loops we process are small, which keeps the costs of these 110 // scans low. The InitialCandidateThreshold value used limits most processed 111 // loops to about 6 blocks and 30-ish value definitions. 112 113 // The idiom `blockTable.rowContains(0, block)` in the logic below means 114 // "is `block` in the original loop?" 
// =====================================================================
//
// Tunables

// As a crude first pass for filtering out unsuitable candidates, top level
// function UnrollLoops calculates a size-related value for each candidate,
// with lower values indicating more desirability for unrolling/peeling.
// Candidates with a score above this value will not be unrolled. Reducing the
// value therefore reduces the aggressiveness of the unroller.
//
// Note: `constexpr` (rather than plain `const`) guarantees compile-time
// initialization, and the `f` suffix avoids a double-to-float narrowing.
constexpr float InitialCandidateThreshold = 100.0f;

// On further analysis of candidates that pass the above test, we have more
// detailed limitations on size. The specified action will not be taken for a
// loop if the associated metric exceeds the constants here. Hence increasing
// these values increases the aggressiveness of the unroller, but only to the
// point where the `InitialCandidateThreshold` would have cut it off.

// Loops with more than this number of basic blocks or values (phis and normal
// definitions) will be excluded from peeling. The actual numbers are pretty
// much pulled out of a hat.
constexpr size_t MaxBlocksForPeel = 12;
constexpr size_t MaxValuesForPeel = 150;

// Loops with more than this number of blocks or values will be excluded from
// unrolling. Unrolling adds multiple copies of the loop body, whereas peeling
// adds only one, so it seems prudent to make unrolling less aggressive than
// peeling.
constexpr size_t MaxBlocksForUnroll = 10;
constexpr size_t MaxValuesForUnroll = 100;

// =====================================================================
//
// BlockVector, a vector of MBasicBlock*s.
149 150 using BlockVector = mozilla::Vector<MBasicBlock*, 32, SystemAllocPolicy>; 151 152 static bool BlockVectorContains(const BlockVector& vec, 153 const MBasicBlock* block) { 154 for (const MBasicBlock* b : vec) { 155 if (b == block) { 156 return true; 157 } 158 } 159 return false; 160 } 161 162 // ===================================================================== 163 // 164 // Matrix, a 2-D array of pointers 165 166 template <typename T, int N, typename AP> 167 class Matrix { 168 static_assert(std::is_pointer<T>::value); 169 mozilla::Vector<T, N, AP> vec_; 170 size_t size1_ = 0; 171 size_t size2_ = 0; 172 inline size_t index(size_t ix1, size_t ix2) const { 173 MOZ_ASSERT(ix1 < size1_ && ix2 < size2_); 174 return ix1 * size2_ + ix2; 175 } 176 177 public: 178 [[nodiscard]] bool init(size_t size1, size_t size2) { 179 // Not already init'd. 180 MOZ_ASSERT(size1_ == 0 && size2_ == 0 && vec_.empty()); 181 // This would be pointless. 182 MOZ_ASSERT(size1 > 0 && size2 > 0); 183 // So we can safely multiply them. InitialCandidateThreshold ensures we'll 184 // never have to process a loop anywhere near this big. 185 MOZ_RELEASE_ASSERT(size1 <= 1000 && size2 <= 1000); 186 size1_ = size1; 187 size2_ = size2; 188 if (!vec_.resize(size1 * size2)) { 189 return false; 190 } 191 // Resizing also zero-initialises all the `vec_` entries. 192 return true; 193 } 194 195 inline size_t size1() const { 196 MOZ_ASSERT(size1_ * size2_ == vec_.length()); 197 return size1_; 198 } 199 inline size_t size2() const { 200 MOZ_ASSERT(size1_ * size2_ == vec_.length()); 201 return size2_; 202 } 203 inline T get(size_t ix1, size_t ix2) const { return vec_[index(ix1, ix2)]; } 204 inline void set(size_t ix1, size_t ix2, T value) { 205 vec_[index(ix1, ix2)] = value; 206 } 207 208 // Does the matrix slice `[ix1, ..]` contain `value`? 
209 inline bool rowContains(size_t ix1, const T value) const { 210 return findInRow(ix1, value).isSome(); 211 } 212 213 // Find the column number (ix2) such that entry `[ix1, ix2] == value`. 214 mozilla::Maybe<size_t> findInRow(size_t ix1, const T value) const { 215 for (size_t ix2 = 0; ix2 < size2_; ix2++) { 216 if (get(ix1, ix2) == value) { 217 return mozilla::Some(ix2); 218 } 219 } 220 return mozilla::Nothing(); 221 } 222 }; 223 224 using BlockTable = Matrix<MBasicBlock*, 32, SystemAllocPolicy>; 225 using ValueTable = Matrix<MDefinition*, 128, SystemAllocPolicy>; 226 227 // ===================================================================== 228 // 229 // Debug printing 230 231 #ifdef JS_JITSPEW 232 233 // For debugging, dump specific rows (loop body copies) of a block table. 234 static void DumpBlockTableRows(const BlockTable& table, int32_t firstCix, 235 int32_t lastCix, const char* tag) { 236 Fprinter& printer(JitSpewPrinter()); 237 printer.printf("<<<< %s\n", tag); 238 for (int32_t cix = firstCix; cix <= lastCix; cix++) { 239 if (cix == 0) { 240 printer.printf(" -------- Original --------\n"); 241 } else { 242 printer.printf(" -------- Copy %u --------\n", cix); 243 } 244 for (uint32_t bix = 0; bix < table.size2(); bix++) { 245 DumpMIRBlock(printer, table.get(uint32_t(cix), bix), 246 /*showDetails=*/true); 247 } 248 } 249 printer.printf(">>>>\n"); 250 } 251 252 // Dump an entire block table. 253 static void DumpBlockTable(const BlockTable& table, const char* tag) { 254 DumpBlockTableRows(table, 0, int32_t(table.size1()) - 1, tag); 255 } 256 257 // Dump just the original loop body in a block table. 258 static void DumpBlockTableRowZero(const BlockTable& table, const char* tag) { 259 DumpBlockTableRows(table, 0, 0, tag); 260 } 261 262 // Dump a value table. 
263 static void DumpValueTable(const ValueTable& table, const char* tag) { 264 Fprinter& printer(JitSpewPrinter()); 265 printer.printf("<<<< %s\n", tag); 266 for (uint32_t cix = 0; cix < table.size1(); cix++) { 267 if (cix == 0) { 268 printer.printf(" -------- Original --------\n"); 269 } else { 270 printer.printf(" -------- Copy %u --------\n", cix); 271 } 272 for (uint32_t vix = 0; vix < table.size2(); vix++) { 273 printer.printf(" "); 274 DumpMIRDefinition(printer, table.get(cix, vix), 275 /*showDetails=*/true); 276 printer.printf("\n"); 277 } 278 } 279 printer.printf(">>>>\n"); 280 } 281 282 #endif // JS_JITSPEW 283 284 // ===================================================================== 285 // 286 // SimpleSet, a set of pointers 287 288 template <typename T, int N, typename AP> 289 class SimpleSet { 290 static_assert(std::is_pointer<T>::value); 291 mozilla::Vector<T, N, AP> vec_; 292 293 public: 294 [[nodiscard]] bool copy(const SimpleSet<T, N, AP>& other) { 295 vec_.clear(); 296 for (T elem : other.vec_) { 297 if (!vec_.append(elem)) { 298 return false; 299 } 300 } 301 return true; 302 } 303 bool contains(T t) const { 304 for (auto* existing : vec_) { 305 if (existing == t) { 306 return true; 307 } 308 } 309 return false; 310 } 311 [[nodiscard]] bool add(T t) { 312 MOZ_ASSERT(t); 313 return contains(t) ? true : vec_.append(t); 314 } 315 bool empty() const { return vec_.empty(); } 316 size_t size() const { return vec_.length(); } 317 T get(size_t ix) const { return vec_[ix]; } 318 }; 319 320 using BlockSet = SimpleSet<MBasicBlock*, 8, SystemAllocPolicy>; 321 using ValueSet = SimpleSet<MDefinition*, 64, SystemAllocPolicy>; 322 323 // ===================================================================== 324 // 325 // MDefinitionRemapper, a very simple MDefinition*-remapping class. 
326 327 class MDefinitionRemapper { 328 using Pair = std::pair<MDefinition*, MDefinition*>; 329 mozilla::Vector<Pair, 32, SystemAllocPolicy> pairs; 330 331 public: 332 MDefinitionRemapper() {} 333 // Register `original` as a key in the mapper, and map it to itself. 334 [[nodiscard]] bool enregister(MDefinition* original) { 335 MOZ_ASSERT(original); 336 for (auto& p : pairs) { 337 (void)p; 338 MOZ_ASSERT(p.first != original); // not mapped at all 339 MOZ_ASSERT(p.second == p.first); // no `update` calls happened yet 340 } 341 return pairs.append(std::pair(original, original)); 342 } 343 // Look up the binding for `original` in the mapper. 344 MDefinition* lookup(const MDefinition* original) const { 345 MOZ_ASSERT(original); 346 MDefinition* res = nullptr; 347 for (auto& p : pairs) { 348 if (p.first == original) { 349 res = p.second; 350 break; 351 } 352 } 353 return res; 354 } 355 // Update the mapper so as to map `original` to `replacement`. `original` 356 // must have previously been registered with ::enregister. 357 void update(const MDefinition* original, MDefinition* replacement) { 358 MOZ_ASSERT(original && replacement); 359 MOZ_ASSERT(original != replacement); 360 for (auto& p : pairs) { 361 if (p.first == original) { 362 p.second = replacement; 363 return; 364 } 365 } 366 MOZ_CRASH(); // asked to update a non-existent key 367 } 368 }; 369 370 // ===================================================================== 371 // 372 // MakeReplacementInstruction / MakeReplacementPhi 373 374 // Make a cloned version of `ins`, but with its arguments remapped by `mapper`. 
375 static MInstruction* MakeReplacementInstruction( 376 TempAllocator& alloc, const MDefinitionRemapper& mapper, 377 const MInstruction* ins) { 378 MDefinitionVector inputs(alloc); 379 for (size_t i = 0; i < ins->numOperands(); i++) { 380 MDefinition* old = ins->getOperand(i); 381 MDefinition* replacement = mapper.lookup(old); 382 if (!replacement) { 383 // The mapper didn't map it, which means that `old` is a value that's not 384 // defined in the loop body. So leave it unchanged. 385 replacement = old; 386 } 387 if (!inputs.append(replacement)) { 388 return nullptr; 389 } 390 } 391 return ins->clone(alloc, inputs); 392 } 393 394 // Make a cloned version of `phi`, but with its arguments remapped by `mapper`. 395 static MPhi* MakeReplacementPhi(TempAllocator& alloc, 396 const MDefinitionRemapper& mapper, 397 const MPhi* phi) { 398 MDefinitionVector inputs(alloc); 399 for (size_t i = 0; i < phi->numOperands(); i++) { 400 MDefinition* old = phi->getOperand(i); 401 MDefinition* replacement = mapper.lookup(old); 402 if (!replacement) { 403 replacement = old; 404 } 405 if (!inputs.append(replacement)) { 406 return nullptr; 407 } 408 } 409 return phi->clone(alloc, inputs); 410 } 411 412 // ===================================================================== 413 // 414 // UnrollState 415 416 enum class UnrollMode { Peel, Unroll, PeelAndUnroll }; 417 418 struct UnrollState { 419 // What should happen to the specified loop. 420 const UnrollMode mode; 421 422 // The central data structure: BlockTable, a 2-D array of block pointers. 423 // 424 // To create copies of the original loop body, we need to copy the original 425 // blocks and fix up the inter-block edges and phis appropriately. This 426 // edge-fixup is done without reference to the block IDs; instead it is done 427 // purely on the basis of the MBasicBlock* values themselves. Hence it does 428 // not make any assumption about the ordering of the block ID values. 
429 // 430 // The block copies are managed using a 2-D array of block pointers, via 431 // which we can uniformly access the original and new blocks. The array is 432 // indexed by `(copy index, block index in copy)`. Both indices are 433 // zero-based. The sequencing of blocks implied by "block index in copy" is 434 // the sequence produced by the RPO iterator as it enumerates the blocks in 435 // the loop. 436 // 437 // Per the above, note that "block index" (`bix` in the code) is a zero-based 438 // index into the RPO sequence of blocks in the loop and is not to be 439 // confused with the pre-existing concept of "block ID". 440 // 441 // Note also that the required unroll factor (including that for the peeled 442 // copy, if any) is equal to the first dimension of `blockTable`, and the 443 // number of blocks in the original loop is implied by the second dimension 444 // of `blockTable`. Hence neither value needs to be stored explicitly. 445 BlockTable blockTable; 446 447 // The set of blocks arrived at (directly) by exiting branches in the 448 // original loop. FIXME: try to make const 449 /*const*/ BlockSet exitTargetBlocks; 450 451 // The set of values (MDefinition*s) defined in the original loop and used 452 // afterwards. 453 /*const*/ ValueSet exitingValues; 454 455 explicit UnrollState(UnrollMode mode) : mode(mode) {} 456 457 bool doPeeling() const { 458 return mode == UnrollMode::Peel || mode == UnrollMode::PeelAndUnroll; 459 } 460 bool doUnrolling() const { 461 return mode == UnrollMode::Unroll || mode == UnrollMode::PeelAndUnroll; 462 } 463 }; 464 465 // ===================================================================== 466 // 467 // AnalyzeLoop and AnalysisResult 468 469 // Summarises the result of a detailed analysis of a candidate loop, producing 470 // either a recommendation about what to do with it, or a reason why it should 471 // be left unchanged. 
enum class AnalysisResult {
  // OOM during analysis
  OOM,

  // The chosen transformation for the loop ..
  Peel,
  Unroll,
  PeelAndUnroll,

  // .. or .. reasons why the loop can't be unrolled:
  //
  // The unrolling/peeling machinery below relies on the fact that MIR loops
  // are heavily constrained in their structure. AnalyzeLoop checks the
  // relevant invariants carefully, with this value indicating non-compliance.
  // Top level function UnrollLoops will MOZ_CRASH if this happens.
  BadInvariants,
  // Cannot be handled by the unroller: if the loop defines value(s) that are
  // used after the loop, then we can only handle the case where it has one
  // exiting branch.
  TooComplex,
  // The loop contains MIR nodes that are uncloneable
  Uncloneable,
  // The loop has too many blocks or values
  TooLarge,
  // The loop is otherwise unsuitable:
  // * contains a call or table switch
  // * is an infinite loop
  Unsuitable
};

#ifdef JS_JITSPEW
// Map an AnalysisResult to a human-readable name, for spew output.
static const char* Name_of_AnalysisResult(AnalysisResult res) {
  struct NameEntry {
    AnalysisResult result;
    const char* name;
  };
  static constexpr NameEntry kNames[] = {
      {AnalysisResult::OOM, "OOM"},
      {AnalysisResult::Peel, "Peel"},
      {AnalysisResult::Unroll, "Unroll"},
      {AnalysisResult::PeelAndUnroll, "PeelAndUnroll"},
      {AnalysisResult::BadInvariants, "BadInvariants"},
      {AnalysisResult::TooComplex, "TooComplex"},
      {AnalysisResult::Uncloneable, "Uncloneable"},
      {AnalysisResult::TooLarge, "TooLarge"},
      {AnalysisResult::Unsuitable, "Unsuitable"},
  };
  for (const NameEntry& entry : kNames) {
    if (entry.result == res) {
      return entry.name;
    }
  }
  // An unknown enumerator is a logic error (same behaviour as a `default:`
  // arm that crashes).
  MOZ_CRASH();
}
#endif

// Examine the original loop in `originalBlocks` for unrolling suitability,
// and, if acceptable, collect auxiliary information:
//
// * the set of blocks that are direct exit targets for the loop
// * the set of values defined in the loop AND used afterwards
static AnalysisResult AnalyzeLoop(const BlockVector& originalBlocks,
                                  BlockSet* exitTargetBlocks,
                                  ValueSet* exitingValues) {
  // Out-params: both sets must arrive empty; they are filled in here and are
  // only meaningful if the returned value is Peel/Unroll/PeelAndUnroll.
  MOZ_ASSERT(exitTargetBlocks->empty());
  MOZ_ASSERT(exitingValues->empty());

  // ==== BEGIN check invariants on the loop structure ====

  // The unroller/peeler below depends on these invariants.
  // In summary (where nBlocks == originalBlocks.length()):
  //
  // * At least 2 blocks in original loop (1 for header phis + possible insns,
  //   >= 0 for "ordinary" blocks, 1 for the backedge jump)
  //
  // * originalBlocks[0] is a loop header
  // * originalBlocks[nBlocks-1] == originalBlocks[0]->backedge
  //
  // * all blocks in original loop have at least one insn
  // * all blocks in original loop have a control insn as their last insn
  // * all blocks in original loop have non-control insns for all other insns
  // * all blocks in original loop have loop depth > 0
  //
  // * none of originalBlocks[0 .. nBlocks-2] (inclusive) jumps to header
  //
  // * originalBlocks[nBlocks-1] jumps unconditionally to header
  //
  // * original header node has 2 preds
  // * original header node pred[0] is from outside the loop, and
  //   pred[1] is from inside the loop.
  //
  // * original header node phis all have 2 args
  // * original header node phis have arg[0] from outside the loop
  //   (because pred[0] is the loop-entry edge)
  // * original header node phis have their arg[1] as
  //   -- (almost always) defined in the loop -- loop carried variable
  //   -- (very occasionally) defined outside the loop -- in which case
  //      this is just a vanilla phi, nothing to do with the loop.
  //   Hence there is nothing to check for phi arg[1]s, but leaving this
  //   here for clarity.
  //
  // Aside: how can a loop head phi not depend on a value defined within the
  // loop?  Consider the strange-but-valid case
  //
  //   x = 5 ; loop { .. x = 7; .. }
  //
  // The loop head phi will merge the initial value (5) and the loop-defined
  // value in the usual way.  Imagine now the loop is LICMd; the loop defined
  // value `x = 7;` is observed to be invariant and so is lifted out of the
  // loop.  Hence the loop head phi will have both args defined outside the
  // loop.

  // * At least 2 blocks in original loop (1 for header phis + possible insns,
  //   >= 0 for "ordinary" blocks, 1 for the backedge jump)
  const size_t numBlocksInOriginal = originalBlocks.length();
  if (numBlocksInOriginal < 2) {
    return AnalysisResult::BadInvariants;
  }

  // * originalBlocks[0] is a loop header
  // * originalBlocks[nBlocks-1] == originalBlocks[0]->backedge
  {
    MBasicBlock* header = originalBlocks[0];
    if (!header->isLoopHeader() ||
        header->backedge() != originalBlocks[numBlocksInOriginal - 1]) {
      return AnalysisResult::BadInvariants;
    }
  }

  // * all blocks in original loop have at least one insn
  // * all blocks in original loop have a control insn as their last insn
  // * all blocks in original loop have non-control insns for all other insns
  // * all blocks in original loop have loop depth > 0
  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    MBasicBlock* block = originalBlocks[bix];
    // This is a bit lame, but .. the iterator gives no length, so a first
    // pass counts the instructions, and a second pass checks that only the
    // final one is a control instruction.
    size_t numInsns = 0;
    for (MInstructionIterator insnIter(block->begin());
         insnIter != block->end(); insnIter++) {
      numInsns++;
    }
    if (numInsns < 1) {
      return AnalysisResult::BadInvariants;
    }
    bool ok = true;
    size_t curInsn = 0;
    for (MInstructionIterator insnIter(block->begin());
         insnIter != block->end(); insnIter++) {
      MInstruction* insn = *insnIter;
      if (curInsn == numInsns - 1) {
        ok = ok && insn->isControlInstruction();
      } else {
        ok = ok && !insn->isControlInstruction();
      }
      curInsn++;
    }
    MOZ_ASSERT(curInsn == numInsns);  // internal logic
    ok = ok && block->loopDepth() > 0;
    if (!ok) {
      return AnalysisResult::BadInvariants;
    }
  }

  // * none of originalBlocks[0 .. nBlocks-2] (inclusive) jumps to header
  {
    MBasicBlock* header = originalBlocks[0];
    for (uint32_t bix = 0; bix < numBlocksInOriginal - 1; bix++) {
      MBasicBlock* block = originalBlocks[bix];
      if (!block->hasLastIns()) {
        return AnalysisResult::BadInvariants;
      }
      MControlInstruction* lastIns = block->lastIns();
      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
        if (lastIns->getSuccessor(i) == header) {
          return AnalysisResult::BadInvariants;
        }
      }
    }
  }

  // originalBlocks[nBlocks-1] jumps unconditionally to header
  {
    MBasicBlock* header = originalBlocks[0];
    MBasicBlock* backedge = originalBlocks[numBlocksInOriginal - 1];
    if (!backedge->hasLastIns()) {
      return AnalysisResult::BadInvariants;
    }
    MControlInstruction* lastIns = backedge->lastIns();
    if (!lastIns->isGoto() || lastIns->toGoto()->target() != header) {
      return AnalysisResult::BadInvariants;
    }
  }

  // * original header node has 2 preds
  // * original header node pred[0] is from outside the loop.
  //   pred[1] is from inside the loop.
  {
    MBasicBlock* header = originalBlocks[0];
    if (header->numPredecessors() != 2 ||
        BlockVectorContains(originalBlocks, header->getPredecessor(0)) ||
        !BlockVectorContains(originalBlocks, header->getPredecessor(1))) {
      return AnalysisResult::BadInvariants;
    }
  }

  // * original header node phis all have 2 args
  // * original header node phis have arg[0] from outside the loop
  //   (because pred[0] is the loop-entry edge)
  // * original header node phis have their arg[1] as
  //   -- (almost always) defined in the loop -- loop carried variable
  //   -- (very occasionally) defined outside the loop -- in which case
  //      this is just a vanilla phi, nothing to do with the loop.
  //   Hence there is nothing to check for phi arg[1]s, but leaving this
  //   here for clarity.
  {
    MBasicBlock* header = originalBlocks[0];
    for (MPhiIterator phiIter(header->phisBegin());
         phiIter != header->phisEnd(); phiIter++) {
      MPhi* phi = *phiIter;
      if (phi->numOperands() != 2 ||
          BlockVectorContains(originalBlocks, phi->getOperand(0)->block())) {
        return AnalysisResult::BadInvariants;
      }
    }
  }

  // ==== END check invariants on the loop structure ====

  // Check that all the insns are cloneable.
  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    MBasicBlock* block = originalBlocks[bix];
    for (MInstructionIterator insIter(block->begin()); insIter != block->end();
         insIter++) {
      MInstruction* ins = *insIter;
      if (ins->isWasmCallCatchable() || ins->isWasmCallUncatchable() ||
          ins->isWasmReturnCall() || ins->isTableSwitch()
          /* TODO: other node kinds unsuitable for unrolling? */) {
        return AnalysisResult::Unsuitable;
      }
      if (!ins->canClone()) {
        // `ins->opName()` will show the name of the insn kind, if you want to
        // see it
        return AnalysisResult::Uncloneable;
      }
    }
  }

  // More analysis: make up a set of blocks that are not in the loop, but which
  // are jumped to from within the loop. We will need this later.
  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    MBasicBlock* block = originalBlocks[bix];
    MControlInstruction* lastIns = block->lastIns();  // asserted safe above
    for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
      MBasicBlock* succ = lastIns->getSuccessor(i);
      // See if `succ` is any of the blocks in the (original) loop. If not,
      // add it to `exitTargetBlocks`.
      if (!BlockVectorContains(originalBlocks, succ)) {
        if (!exitTargetBlocks->add(succ)) {
          return AnalysisResult::OOM;
        }
      }
    }
  }

  // If the loop has zero exits, we consider it sufficiently mutant to ignore
  // (it's an infinite loop?)
  if (exitTargetBlocks->empty()) {
    return AnalysisResult::Unsuitable;
  }

  // Even more analysis: make up a set of MDefinition*s that are defined in the
  // original loop and which are used after the loop. We will later need to
  // add phi nodes that generate the correct values for these, since, after
  // unrolling, these values will have multiple definition points (one per
  // unrolled iteration). While we're at it, count the number of values
  // defined in the original loop.
  //
  // Note: `numValuesInOriginal` counts both phis and normal instructions,
  // and is used only for the peel/unroll size decision at the end.
  size_t numValuesInOriginal = 0;
  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    MBasicBlock* block = originalBlocks[bix];
    // Check all phi nodes ..
    for (MPhiIterator phiIter(block->phisBegin()); phiIter != block->phisEnd();
         phiIter++) {
      numValuesInOriginal++;
      MPhi* phi = *phiIter;
      bool usedAfterLoop = false;
      // Is the value of `phi` used after the loop?
      for (MUseIterator useIter(phi->usesBegin()); useIter != phi->usesEnd();
           useIter++) {
        MUse* use = *useIter;
        if (!BlockVectorContains(originalBlocks, use->consumer()->block())) {
          usedAfterLoop = true;
          break;
        }
      }
      if (usedAfterLoop && !exitingValues->add(phi)) {
        return AnalysisResult::OOM;
      }
    }
    // .. and all normal nodes ..
    for (MInstructionIterator insnIter(block->begin());
         insnIter != block->end(); insnIter++) {
      numValuesInOriginal++;
      MInstruction* insn = *insnIter;
      bool usedAfterLoop = false;
      // Is the value of `insn` used after the loop?
      for (MUseIterator useIter(insn->usesBegin()); useIter != insn->usesEnd();
           useIter++) {
        MUse* use = *useIter;
        if (!BlockVectorContains(originalBlocks, use->consumer()->block())) {
          usedAfterLoop = true;
          break;
        }
      }
      if (usedAfterLoop && !exitingValues->add(insn)) {
        return AnalysisResult::OOM;
      }
    }
  }

  // Due to limitations in AddClosingPhisForLoop (see comments there),
  // currently we can only unroll loops where, either:
  //
  // * no value defined in the loop is used afterwards, in which case we handle
  //   any number of exits from the loop.
  //
  // * one or more values defined in the loop is used afterwards, in which case
  //   there must be one exit point from the loop.
  //
  // IOW we can't handle multiple exits *and* values defined in the loop.
  if (exitTargetBlocks->size() >= 2 && exitingValues->size() > 0) {
    return AnalysisResult::TooComplex;
  }

  // There is an invariant that a loop exit target block does not contain any
  // phis. This is really just the no-critical-edges invariant.
  //
  // By definition, a loop exit block has at least one successor in the loop
  // and at least one successor outside the loop. If the loop exit target has
  // phis, that would imply it has multiple predecessors. But then the edge
  // would be critical, because it connects a block with multiple successors to
  // a block with multiple predecessors.
  //
  // So enforce that; it simplifies the addition of loop-closing phis that will
  // happen shortly.
  for (size_t i = 0; i < exitTargetBlocks->size(); i++) {
    MBasicBlock* exitTargetBlock = exitTargetBlocks->get(i);
    if (!exitTargetBlock->phisEmpty()) {
      return AnalysisResult::BadInvariants;
    }
    // Also we need this, since we'll be adding single-argument phis to this
    // block during loop-closing.
    if (exitTargetBlock->numPredecessors() != 1) {
      return AnalysisResult::BadInvariants;
    }
  }

  // At this point, we've worked through all invariant- and structural- reasons
  // why we can't/shouldn't unroll/peel this loop. Now we need to decide which
  // (or both) of those actions should happen.
  bool peel = true;
  bool unroll = true;
  if (numBlocksInOriginal > MaxBlocksForPeel ||
      numValuesInOriginal > MaxValuesForPeel) {
    peel = false;
  }
  if (numBlocksInOriginal > MaxBlocksForUnroll ||
      numValuesInOriginal > MaxValuesForUnroll) {
    unroll = false;
  }

  if (peel) {
    return unroll ? AnalysisResult::PeelAndUnroll : AnalysisResult::Peel;
  } else {
    return unroll ? AnalysisResult::Unroll : AnalysisResult::TooLarge;
  }
}

// =====================================================================
//
// AddClosingPhisForLoop

// Add "loop-closing phis" for values defined in a loop and used afterwards.
// In GCC parlance, the resulting form is called LCSSA (Loop-Closed SSA). This
// is a very limited intervention in that it can only handle a single exit in a
// loop, thereby restricting the places where the phis should go to the
// one-and-only exit block, and avoiding the need to compute iterated dominance
// frontiers.

[[nodiscard]]
static bool AddClosingPhisForLoop(TempAllocator& alloc,
                                  const UnrollState& state) {
  // Returns false only on OOM (phi allocation or operand-append failure);
  // every other outcome is success.
  //
  // If no values exit the loop, we don't care how many exit target blocks
  // there are, and need to make no changes.
  if (state.exitingValues.empty()) {
    return true;
  }

  // Ensured by AnalyzeLoop
  MOZ_ASSERT(state.exitTargetBlocks.size() == 1);
  MBasicBlock* targetBlock = state.exitTargetBlocks.get(0);
  // Ensured by AnalyzeLoop. Required because we're adding single-arg phis
  // to `targetBlock`.
  MOZ_ASSERT(targetBlock->numPredecessors() == 1);

  for (size_t i = 0; i < state.exitingValues.size(); i++) {
    MDefinition* exitingValue = state.exitingValues.get(i);
    // In `targetBlock`, add a single-argument phi node that "catches"
    // `exitingValue`, and change all uses of `exitingValue` to be the phi
    // node instead.
    MPhi* phi = MPhi::New(alloc, exitingValue->type());
    if (!phi) {
      return false;
    }
    targetBlock->addPhi(phi);
    // Change all uses of `exitingValue` outside of the loop into uses of
    // `phi`. The iterator is advanced *before* `replaceProducer` rewrites
    // the current use, so iteration over the use list stays valid.
    for (MUseIterator useIter(exitingValue->usesBegin()),
         useIterEnd(exitingValue->usesEnd());
         useIter != useIterEnd;
         /**/) {
      MUse* use = *useIter++;
      // Uses inside the loop itself (blockTable row 0) are left alone.
      if (state.blockTable.rowContains(0, use->consumer()->block())) {
        continue;
      }
      // NOTE(review): `use->consumer()` was already dereferenced just above,
      // so this assert is belt-and-braces only.
      MOZ_ASSERT(use->consumer());
      use->replaceProducer(phi);
    }
    // Finally, make `phi` be the one-and-only user of `exitingValue`.
    if (!phi->addInputFallible(exitingValue)) {
      return false;
    }
  }

  // Invariant that should hold at this point: for each exiting value, all uses
  // of it outside the loop should only be phi nodes.
  for (size_t i = 0; i < state.exitingValues.size(); i++) {
    MDefinition* exitingValue = state.exitingValues.get(i);
    for (MUseIterator useIter(exitingValue->usesBegin());
         useIter != exitingValue->usesEnd(); useIter++) {
      mozilla::DebugOnly<MUse*> use = *useIter;
      MOZ_ASSERT_IF(!state.blockTable.rowContains(0, use->consumer()->block()),
                    use->consumer()->isDefinition() &&
                        use->consumer()->toDefinition()->isPhi());
    }
  }

  return true;
}

// =====================================================================
//
// UnrollAndOrPeelLoop

[[nodiscard]]
static bool UnrollAndOrPeelLoop(MIRGraph& graph, UnrollState& state) {
  // Prerequisites (assumed):
  // * AnalyzeLoop has approved this loop for peeling and/or unrolling
  // * AddClosingPhisForLoop has been called for it
  //
  // We expect `state` to be ready to go, with the caveat that only the first
  // row of entries in `state.blockTable` have been filled in (these are the
  // original loop blocks). All other entries are null and will be filled in
  // by this routine.
  //
  // Almost all the logic here is to do with unrolling. That's because peeling
  // (done far below) can be done by a trivial modification of the loop
  // resulting from unrolling: make the backedge jump to the second copy of the
  // loop body rather than the first. That leaves the first copy (that is, the
  // original loop body) in a "will only be executed once" state, as we desire.

  // We need this so often it's worth pulling out specially.
  const MBasicBlock* originalHeader = state.blockTable.get(0, 0);
  MOZ_ASSERT(originalHeader->isLoopHeader());

#ifdef JS_JITSPEW
  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
    Fprinter& printer(JitSpewPrinter());
    printer.printf(
        "<<<< ORIGINAL FUNCTION (after LCSSA-ification of chosen loops)\n");
    DumpMIRGraph(printer, graph, /*showDetails=*/true);
    printer.printf(">>>>\n");
  }
#endif

  // Decide on the total unroll factor. "Copy #0" is the original loop, so the
  // number of new copies is `unrollFactor - 1`. For example, if the original
  // loop was `loop { B; }` then with `unrollFactor == 3`, the unrolled loop
  // will be `loop { B; B; B; }`. Note that the value acquired here includes
  // the copy needed for peeling, if that has been requested.
  const uint32_t unrollFactor = state.blockTable.size1();
  // The logic below will fail if this doesn't hold.
  MOZ_ASSERT(unrollFactor >= 2);

  // The number of blocks in the original loop.
  const uint32_t numBlocksInOriginal = state.blockTable.size2();

  // The number of values (phis and normal values) in the original loop.
  // Phis are counted before normal instructions within each block; this
  // matches the ordering used when filling `valueTable` rows below.
  uint32_t numValuesInOriginal = 0;
  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    MBasicBlock* block = state.blockTable.get(0, bix);
    // This is very feeble, but I can't think of anything better.
    for (MPhiIterator phiIter(block->phisBegin()); phiIter != block->phisEnd();
         phiIter++) {
      numValuesInOriginal++;
    }
    for (MInstructionIterator insnIter(block->begin());
         insnIter != block->end(); insnIter++) {
      numValuesInOriginal++;
    }
  }
  // `numValuesInOriginal` is const after this point.

#ifdef JS_JITSPEW
  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
    DumpBlockTableRowZero(state.blockTable, "ORIGINAL LOOP");
    Fprinter& printer(JitSpewPrinter());
    printer.printf("<<<< EXIT TARGET BLOCKS: ");
    for (size_t i = 0; i < state.exitTargetBlocks.size(); i++) {
      MBasicBlock* targetBlock = state.exitTargetBlocks.get(i);
      DumpMIRBlockID(printer, targetBlock, /*showDetails=*/true);
      printer.printf(" ");
    }
    printer.printf(">>>>\n");
    printer.printf("<<<< EXITING VALUES: ");
    for (size_t i = 0; i < state.exitingValues.size(); i++) {
      MDefinition* exitingValue = state.exitingValues.get(i);
      DumpMIRDefinitionID(printer, exitingValue, /*showDetails=*/true);
      printer.printf(" ");
    }
    printer.printf(">>>>\n");
  }
#endif

  // At this point, `state.blockTable` row zero contains the original blocks,
  // and all other rows are nulled out.

  // Set up the value table. This is a structure similar to
  // `state.blockTable`: a 2-D array of values. This is local to this routine
  // rather than being part of `state` because it is only needed here, and can
  // take quite a lot of space.
  //
  // As with the block table, the main (actually, only) purpose of the value
  // table is to make it possible to find equivalent definitions in different
  // iterations. Also as above, "value index" (`vix` in the code) is a
  // zero-based index into the sequence of values (including phis) in the loop
  // as generated by the RPO traversal of the loop's blocks.

  ValueTable valueTable;
  if (!valueTable.init(unrollFactor, numValuesInOriginal)) {
    return false;
  }

  // Copy the original values into row zero of `valueTable`, parking them
  // end-to-end, without regard to block boundaries. All other rows remain
  // nulled out for now.
  {
    uint32_t vix = 0;
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* block = state.blockTable.get(0, bix);
      for (MPhiIterator phiIter(block->phisBegin());
           phiIter != block->phisEnd(); phiIter++) {
        valueTable.set(0, vix, *phiIter);
        vix++;
      }
      for (MInstructionIterator insnIter(block->begin());
           insnIter != block->end(); insnIter++) {
        valueTable.set(0, vix, *insnIter);
        vix++;
      }
    }
    MOZ_ASSERT(vix == numValuesInOriginal);
  }

  // Set up the value remapper. This maps MDefinitions* in the original body
  // to their "current" definition in new bodies. The only allowed keys are
  // MDefinition*s from the original body. To enforce this, the keys have to
  // be "registered" before use.
  MDefinitionRemapper mapper;
  for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
    MBasicBlock* originalBlock = state.blockTable.get(0, bix);
    // Create initial entries in `mapper` for both the normal insns and phi
    // nodes in `originalBlock`.
    for (MPhiIterator phiIter(originalBlock->phisBegin());
         phiIter != originalBlock->phisEnd(); phiIter++) {
      MPhi* originalPhi = *phiIter;
      if (!mapper.enregister(originalPhi)) {
        return false;
      }
    }
    for (MInstructionIterator insnIter(originalBlock->begin());
         insnIter != originalBlock->end(); insnIter++) {
      MInstruction* originalInsn = *insnIter;
      if (!mapper.enregister(originalInsn)) {
        return false;
      }
    }
  }

  // Now we begin with the very core of the unrolling activity.

  // Create new empty blocks for copy indices 1 .. (unrollFactor-1)
  const CompileInfo& info = originalHeader->info();
  for (uint32_t cix = 1; cix < unrollFactor; cix++) {
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* empty = MBasicBlock::New(graph, info, /*pred=*/nullptr,
                                            MBasicBlock::Kind::NORMAL);
      if (!empty) {
        return false;
      }
      empty->setLoopDepth(state.blockTable.get(0, bix)->loopDepth());
      // We don't bother to set the block's ID, because the logic below does
      // not depend on it. In any case the blocks will get renumbered at the
      // end of UnrollLoops, so it doesn't matter what we choose here.
      // Unrolling still works even if we don't set the ID to anything.
      MOZ_ASSERT(!state.blockTable.get(cix, bix));
      state.blockTable.set(cix, bix, empty);
    }
  }

  // Copy verbatim, blocks in the original loop, into our new copies.
  // For each copy of the loop ..
  for (uint32_t cix = 1; cix < unrollFactor; cix++) {
    // For each block in the original copy ..
    uint32_t vix = 0;
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* originalBlock = state.blockTable.get(0, bix);
      MBasicBlock* clonedBlock = state.blockTable.get(cix, bix);

      // Clone the phi nodes, but don't update the mapper until all original
      // arguments have been looked up. This is required because the phi nodes
      // collectively implement a parallel assignment; there is no implied
      // sequentiality.
      mozilla::Vector<std::pair<const MPhi*, MPhi*>, 16, SystemAllocPolicy>
          mapperUpdates;
      for (MPhiIterator phiIter(originalBlock->phisBegin());
           phiIter != originalBlock->phisEnd(); phiIter++) {
        const MPhi* originalPhi = *phiIter;
        MPhi* clonedPhi =
            MakeReplacementPhi(graph.alloc(), mapper, originalPhi);
        if (!clonedPhi) {
          return false;
        }
        clonedBlock->addPhi(clonedPhi);
        MOZ_ASSERT(!valueTable.get(cix, vix));
        valueTable.set(cix, vix, clonedPhi);
        vix++;
        if (!mapperUpdates.append(std::pair(originalPhi, clonedPhi))) {
          return false;
        }
      }
      for (auto& p : mapperUpdates) {
        // Update the value mapper. The `originalPhi` values must be part of
        // the original loop body and so must already have a key in `mapper`.
        mapper.update(p.first, p.second);
      }

      // Cloning the instructions is simpler, since we can incrementally update
      // the mapper.
      for (MInstructionIterator insnIter(originalBlock->begin());
           insnIter != originalBlock->end(); insnIter++) {
        const MInstruction* originalInsn = *insnIter;
        MInstruction* clonedInsn =
            MakeReplacementInstruction(graph.alloc(), mapper, originalInsn);
        if (!clonedInsn) {
          return false;
        }
        clonedBlock->insertAtEnd(clonedInsn);
        MOZ_ASSERT(!valueTable.get(cix, vix));
        valueTable.set(cix, vix, clonedInsn);
        vix++;
        // Update the value mapper. `originalInsn` must be part of the
        // original loop body and so must already have a key in `mapper`.
        mapper.update(originalInsn, clonedInsn);
      }

      // Clone the block's predecessor array. The entries still point at the
      // *original* predecessor blocks; they are remapped to the right copy
      // later on, by the RemapEdgeSource pass below.
      MOZ_ASSERT(clonedBlock->numPredecessors() == 0);
      for (uint32_t i = 0; i < originalBlock->numPredecessors(); i++) {
        MBasicBlock* pred = originalBlock->getPredecessor(i);
        if (!clonedBlock->appendPredecessor(pred)) {
          return false;
        }
      }
    }

    MOZ_ASSERT(vix == numValuesInOriginal);

    // Check we didn't mess up the valueTable ordering relative to the original
    // (best effort assertion).
    for (vix = 0; vix < numValuesInOriginal; vix++) {
      MOZ_ASSERT(valueTable.get(cix, vix)->op() ==
                 valueTable.get(0, vix)->op());
    }

    // For cloned instructions that have a (load) dependency field, and that
    // field points to an instruction in the original body, update the field so
    // as to point to the equivalent instruction in the cloned body.
    for (vix = 0; vix < numValuesInOriginal; vix++) {
      MDefinition* clonedInsn = valueTable.get(cix, vix);
      MDefinition* originalDep = clonedInsn->dependency();
      if (originalDep) {
        mozilla::Maybe<size_t> originalInsnIndex =
            valueTable.findInRow(0, originalDep);
        if (originalInsnIndex.isSome()) {
          // The dependency is to an insn in the original body, so we need to
          // remap it to this body.
          MDefinition* clonedDep =
              valueTable.get(cix, originalInsnIndex.value());
          clonedInsn->setDependency(clonedDep);
        }
      }
    }
  }

#ifdef JS_JITSPEW
  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
    DumpValueTable(valueTable, "VALUE TABLE (final)");
  }
#endif

  // Fix up (remap) inter-block edges in both the original and copies, using
  // uniform indexing as above (original = index 0, copies = indices >= 1):

  // For a block with copy-index N, inspect each edge. Determine the role the
  // edge-destination (target) plays in the original copy:
  //
  // * backedge
  //   -> remap to header node of copy (N + 1) % unrollFactor
  //      (so, generates a backedge in the last copy)
  // * internal jump within the original body
  //   -> remap to corresponding block in copy N
  // * jump to after the original body
  //   -> unchanged (it is an exiting edge)
  // * jump to before the original body
  //   -> this can't happen
  //      (but if it did, it would be just another exit block)
  auto RemapEdgeDestination =
      [=, &state](uint32_t cix, MBasicBlock* oldDestination) -> MBasicBlock* {
    // `oldDestination` is the target of a control-flow insn in the original
    // loop. Figure out what its replacement should be, for copy index `cix`
    // (with `cix == 0` meaning the original copy).

    // If it is a back edge, remap to the header node of the next copy, with
    // wraparound for the last copy.
    if (oldDestination == state.blockTable.get(0, 0)) {
      MOZ_ASSERT(oldDestination == originalHeader);
      return state.blockTable.get((cix + 1) % unrollFactor, 0);
    }

    // If it is to some other node within the original body, remap to the
    // corresponding node in this copy.
    for (uint32_t i = 1; i < numBlocksInOriginal; i++) {
      if (oldDestination == state.blockTable.get(0, i)) {
        return state.blockTable.get(cix, i);
      }
    }

    // The target of the edge is not in the original loop. Either it's to
    // after the loop (an exiting branch) or it is to before the loop, although
    // that would be bizarre and probably illegal. Either way, leave it
    // unchanged.
    return oldDestination;
  };

  // Similarly, for an edge that claims to come from `oldSource` in the
  // original loop, determine where it should come from if we imagine that it
  // applies to copy index `cix`.
  auto RemapEdgeSource = [=, &state](uint32_t cix,
                                     MBasicBlock* oldSource) -> MBasicBlock* {
    if (oldSource == state.blockTable.get(0, numBlocksInOriginal - 1)) {
      // Source is backedge node in the original loop. Remap to the backedge
      // node in the previous iteration (with wraparound for copy 0).
      MOZ_ASSERT(oldSource == originalHeader->backedge());
      return state.blockTable.get(cix == 0 ? (unrollFactor - 1) : (cix - 1),
                                  numBlocksInOriginal - 1);
    }
    for (uint32_t i = 0; i < numBlocksInOriginal - 1; i++) {
      if (oldSource == state.blockTable.get(0, i)) {
        // Source is a non-backedge node in the original loop. Remap it to
        // the equivalent node in this copy.
        return state.blockTable.get(cix, i);
      }
    }
    // Source is not in the original loop. Leave unchanged.
    return oldSource;
  };

  // Work through all control-flow instructions in all copies and remap their
  // edges.
  // For each copy of the loop, including the original ..
  for (uint32_t cix = 0; cix < unrollFactor; cix++) {
    // For each block in the copy ..
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* b = state.blockTable.get(cix, bix);
      MControlInstruction* lastI = b->lastIns();
      if (lastI->isGoto()) {
        MGoto* insn = lastI->toGoto();
        insn->setTarget(RemapEdgeDestination(cix, insn->target()));
      } else if (lastI->isTest()) {
        MTest* insn = lastI->toTest();
        insn->setIfTrue(RemapEdgeDestination(cix, insn->ifTrue()));
        insn->setIfFalse(RemapEdgeDestination(cix, insn->ifFalse()));
      } else {
        // The other existing control instructions are MTableSwitch,
        // MUnreachable, MWasmTrap, MWasmReturn, and MWasmReturnVoid.
        // MTableSwitch is ruled out above. The others end a block without a
        // successor, which means that they can't be loop blocks
        // (MarkLoopBlocks marks blocks that are predecessors of the back
        // edge.) So we don't expect to see them here.
        MOZ_CRASH();
      }
    }
  }

  // Also we need to remap within each block's predecessors vector.
  // We have to do this backwards to stop RemapEdgeSource from asserting.
  // For each copy of the loop, including the original ..
  for (int32_t cix = unrollFactor - 1; cix >= 0; cix--) {
    // For each block in the copy ..
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* block = state.blockTable.get(cix, bix);
      for (uint32_t i = 0; i < block->numPredecessors(); i++) {
        MBasicBlock* oldPred = block->getPredecessor(i);
        MBasicBlock* newPred = RemapEdgeSource(cix, oldPred);
        block->setPredecessor(i, newPred);
      }
    }
  }

  // Clean up the header node copies (but not for the original copy) since they
  // all still claim to have two predecessors, the first of which is the entry
  // edge from outside the loop. This is no longer true, so delete that
  // predecessor. Remove the corresponding Phi args too. This leaves
  // one-argument phis, meh.
  for (uint32_t cix = 1; cix < unrollFactor; cix++) {
    MBasicBlock* block = state.blockTable.get(cix, 0);
    MOZ_ASSERT(block->numPredecessors() == 2);  // from AnalyzeLoop
    block->erasePredecessor(0);
    MOZ_ASSERT(block->numPredecessors() == 1);  // duh
    for (MPhiIterator phiIter(block->phisBegin()); phiIter != block->phisEnd();
         phiIter++) {
      MPhi* phi = *phiIter;
      MOZ_ASSERT(phi->numOperands() == 2);  // from AnalyzeLoop
      phi->removeOperand(0);
      MOZ_ASSERT(phi->numOperands() == 1);
    }
  }

  // Fix up the phi nodes in the original header; the previous step did not do
  // that. Specifically, the second arg in each phi -- which are values that
  // flow in from the backedge block -- need to be the ones from the last body
  // copy; but currently they are from the original copy. At this point, the
  // `mapper` should be able to tell us the correct value.
  for (MPhiIterator phiIter(originalHeader->phisBegin());
       phiIter != originalHeader->phisEnd(); phiIter++) {
    MPhi* phi = *phiIter;
    MOZ_ASSERT(phi->numOperands() == 2);  // from AnalyzeLoop
    // phi arg[0] is defined before the loop. Ensured by AnalyzeLoop.
    MOZ_ASSERT(!state.blockTable.rowContains(0, phi->getOperand(0)->block()));
    // phi arg[1] is almost always defined in the loop (hence it is a
    // loop-carried value) but is very occasionally defined before the loop
    // (so it is unrelated to the loop). Proceed with maximum paranoia.
    MDefinition* operand1 = phi->getOperand(1);
    MDefinition* replacement = mapper.lookup(operand1);
    // If we didn't find a replacement (unlikely but possible) then
    // `operand1` must be defined before the loop.
    MOZ_ASSERT((replacement == nullptr) ==
               !state.blockTable.rowContains(0, operand1->block()));
    if (replacement) {
      phi->replaceOperand(1, replacement);
    }
  }

  // Now we need to fix up the exit target blocks, by adding sources of exiting
  // edges to their `predecessors_` vectors and updating their phi nodes
  // accordingly. Recall that those phi nodes were previously installed in
  // these blocks by AddClosingPhisForLoop.
  //
  // Currently the exit target blocks mention only predecessors in the original
  // loop, but they also need to mention equivalent predecessors in each of the
  // loop copies.
  for (uint32_t cix = 0; cix < unrollFactor; cix++) {
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* block = state.blockTable.get(cix, bix);
      MControlInstruction* lastIns = block->lastIns();
      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
        MBasicBlock* succ = lastIns->getSuccessor(i);
        if (!state.exitTargetBlocks.contains(succ)) {
          // This isn't an exiting edge -- not interesting.
          continue;
        }
        // If `cix` is zero (the original body), we expect `succ` to already
        // list `block` as a predecessor. If `cix` is nonzero (a body copy)
        // then we expect `succ` *not* to list it, and we must add it.
        mozilla::DebugOnly<bool> found = false;
        for (uint32_t j = 0; j < succ->numPredecessors(); j++) {
          if (block == succ->getPredecessor(j)) {
            found = true;
            break;
          }
        }
        MOZ_ASSERT((cix == 0) == found);
        if (cix > 0) {
          if (!succ->appendPredecessor(block)) {
            return false;
          }
          for (MPhiIterator phiIter(succ->phisBegin());
               phiIter != succ->phisEnd(); phiIter++) {
            MPhi* phi = *phiIter;
            // "+ 1" because we just added a predecessor
            MOZ_ASSERT(phi->numOperands() + 1 == succ->numPredecessors());
            MDefinition* exitingValue = phi->getOperand(0);
            MOZ_ASSERT(state.exitingValues.contains(exitingValue));
            // `exitingValue` is defined in the original loop body. We need to
            // find the equivalent value in copy `cix` so we can add it as an
            // argument to the phi. This bit is the one-and-only reason for
            // `valueTable`'s existence.
            mozilla::Maybe<size_t> colIndex =
                valueTable.findInRow(0, exitingValue);
            MOZ_ASSERT(colIndex.isSome());  // We must find it!
            MDefinition* equivValue = valueTable.get(cix, colIndex.value());
            if (!phi->addInputFallible(equivValue)) {
              return false;
            }
          }
        }
      }
    }
  }

  // At this point the control flow is correct. But we may now have critical
  // edges where none existed before. Consider a loop {Block1, Block2} with an
  // exit target Block9:
  //
  //   Block1
  //     stuff1
  //     Goto Block2
  //   Block2
  //     stuff2
  //     Test true->Block1(== continue) false->Block9(== break)
  //   ...
  //   Block9
  //     this is a loop-exit target
  //     preds = Block2
  //
  // After factor-of-2 unrolling:
  //
  //   Block1
  //     stuff1
  //     Goto Block2
  //   Block2
  //     stuff2
  //     Test true->Block3(== continue) false->Block9(== break)
  //   Block3
  //     stuff3
  //     Goto Block4
  //   Block4
  //     stuff4
  //     Test true->Block1(== continue) false->Block9(== break)
  //   ...
  //   Block9
  //     preds = Block2, Block4
  //
  // Now the exiting edges (Block2 to Block9 and Block4 to Block9) are critical.
  //
  // The obvious fix is to insert a new block on each exiting edge.

  mozilla::Vector<MBasicBlock*, 16, SystemAllocPolicy> splitterBlocks;
  for (uint32_t cix = 0; cix < unrollFactor; cix++) {
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* block = state.blockTable.get(cix, bix);
      // Inspect all successors of `block`. If any are exiting edges, create a
      // new block and route through that instead.
      MControlInstruction* lastIns = block->lastIns();
      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
        MBasicBlock* succ = lastIns->getSuccessor(i);
        if (!state.exitTargetBlocks.contains(succ)) {
          // This isn't an exiting edge -- not interesting.
          continue;
        }
        // Invent a new block.
        MBasicBlock* splitter =
            MBasicBlock::New(graph, info, block, MBasicBlock::Kind::SPLIT_EDGE);
        if (!splitter || !splitterBlocks.append(splitter)) {
          return false;
        }
        splitter->setLoopDepth(succ->loopDepth());
        // As with empty-block creation earlier in this routine, we don't
        // bother to set the block's ID since none of the logic depends on it,
        // and it will eventually be renumbered anyway by UnrollLoops.
        MGoto* jump = MGoto::New(graph.alloc(), succ);
        if (!jump) {
          return false;
        }
        splitter->insertAtEnd(jump);
        // For the target, fix up the `predecessor_` entry.
        mozilla::DebugOnly<bool> found = false;
        for (uint32_t j = 0; j < succ->numPredecessors(); j++) {
          if (succ->getPredecessor(j) == block) {
            succ->setPredecessor(j, splitter);
            found = true;
            break;
          }
        }
        // Finally, reroute the jump to `succ` via `splitter`.
        lastIns->replaceSuccessor(i, splitter);
        MOZ_ASSERT(found);
      }
    }
  }

  // Now the control flow is correct *and* we don't have any critical edges.
  // Fix up the `successorWithPhis_` and `positionInPhiSuccessor_` fields in
  // both the unrolled loop and the splitter blocks, thusly:
  //
  // For all blocks `B` in the unrolled loop, and for all splitter blocks, fix
  // up the `B.successorWithPhis_` and `B.positionInPhiSuccessor_` fields.
  // Each block may have at maximum one successor that has phi nodes, in which
  // case `B.successorWithPhis_` should point at that block. And
  // `B.positionInPhiSuccessor_` should indicate the index of B in that
  // successor's `predecessors_` vector.
  {
    // Make up a list of blocks to fix up, then process them.
    mozilla::Vector<MBasicBlock*, 32 + 16, SystemAllocPolicy> workList;
    for (uint32_t cix = 0; cix < unrollFactor; cix++) {
      for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
        MBasicBlock* block = state.blockTable.get(cix, bix);
        if (!workList.append(block)) {
          return false;
        }
      }
    }
    for (MBasicBlock* block : splitterBlocks) {
      if (!workList.append(block)) {
        return false;
      }
    }
    // Now process them.
    for (MBasicBlock* block : workList) {
      // Inspect `block`s successors.
      MBasicBlock* succWithPhis = nullptr;
      MControlInstruction* lastIns = block->lastIns();
      // Determine `succWithPhis`, if one exists.
      for (size_t i = 0; i < lastIns->numSuccessors(); i++) {
        MBasicBlock* succ = lastIns->getSuccessor(i);
        if (succ->phisEmpty()) {
          continue;
        }
        // NOTE(review): `succ` was already dereferenced by `phisEmpty()`
        // just above, so this assert is belt-and-braces only.
        MOZ_ASSERT(succ);
        // `succ` is a successor to `block` that has phis. So we can't already
        // have seen a (different) successor.
        if (!succWithPhis) {
          succWithPhis = succ;
        } else if (succWithPhis == succ) {
          // ignore duplicate successors
        } else {
          // More than one successor with phis. Structural invariants
          // invalidated. Give up.
          MOZ_CRASH();
        }
      }
      // Annotate `block`.
      if (!succWithPhis) {
        block->setSuccessorWithPhis(nullptr, 0);
      } else {
        MOZ_ASSERT(!succWithPhis->phisEmpty());
        // Figure out which of `succWithPhis`s predecessors `block` is.
        // Should we assert `succWithPhis->predecessors_` is duplicate-free?
        uint32_t predIx = UINT32_MAX;  // invalid
        for (uint32_t i = 0; i < succWithPhis->numPredecessors(); i++) {
          if (succWithPhis->getPredecessor(i) == block) {
            predIx = i;
            break;
          }
        }
        MOZ_ASSERT(predIx != UINT32_MAX);
        block->setSuccessorWithPhis(succWithPhis, predIx);
      }
    }
  }

  // Find and remove the MWasmInterruptCheck in all but the last iteration
  // (note `cix < unrollFactor - 1`: only the final body copy keeps its check).
  // We don't assume that there is an interrupt check, since
  // wasm::FunctionCompiler::fillArray, at least, generates a loop with no
  // check.
  for (uint32_t cix = 0; cix < unrollFactor - 1; cix++) {
    for (uint32_t vix = 0; vix < numValuesInOriginal; vix++) {
      MDefinition* ins = valueTable.get(cix, vix);
      if (!ins->isWasmInterruptCheck()) {
        continue;
      }
      MWasmInterruptCheck* ic = ins->toWasmInterruptCheck();
      ic->block()->discard(ic);
      valueTable.set(cix, vix, nullptr);
    }
  }

  // We're all done with unrolling. If peeling was also requested, make a
  // minor modification to the unrolled loop: change the backedge so that,
  // instead of jumping to the first body copy, make it jump to the second body
  // copy. This leaves the first body copy in a
  // will-only-ever-be-executed-once state, hence achieving peeling.
  if (state.doPeeling()) {
    MBasicBlock* backedge =
        state.blockTable.get(unrollFactor - 1, numBlocksInOriginal - 1);
    MBasicBlock* header0 = state.blockTable.get(0, 0);
    MBasicBlock* header1 = state.blockTable.get(1, 0);
    MOZ_ASSERT(header0 == originalHeader);

    // Fix up the back edge, to make it jump to the second copy of the loop
    // body
    MOZ_ASSERT(backedge->hasLastIns());  // should always hold (AnalyzeLoop)
    MOZ_ASSERT(backedge->lastIns()->isGoto());  // AnalyzeLoop ensures
    MGoto* backedgeG = backedge->lastIns()->toGoto();
    MOZ_ASSERT(backedgeG->target() == header0);  // AnalyzeLoop ensures
    backedgeG->setTarget(header1);
    header0->clearLoopHeader();
    header1->setLoopHeader();
    // The peeled-off copy is no longer part of the loop, so its blocks move
    // one loop-nesting level outwards.
    for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) {
      MBasicBlock* originalBlock = state.blockTable.get(0, bix);
      MOZ_ASSERT(originalBlock->loopDepth() > 0);
      originalBlock->setLoopDepth(originalBlock->loopDepth() - 1);
    }

    // Fix up the preds of the original header
    MOZ_ASSERT(header0->numPredecessors() == 2);
    MOZ_ASSERT(header0->getPredecessor(1) == backedge);
    header0->erasePredecessor(1);

    // Fix up the preds of the new header
    MOZ_ASSERT(header1->numPredecessors() == 1);
    if (!header1->appendPredecessor(backedge)) {
      return false;
    }

    // Fix up phis of both headers, by moving the second operand of each phi of
    // `header0` to be the second operand of the corresponding phi in
    // `header1`.
    MPhiIterator phiIter0(header0->phisBegin());
    MPhiIterator phiIter1(header1->phisBegin());
    while (true) {
      bool finished0 = phiIter0 == header0->phisEnd();
      mozilla::DebugOnly<bool> finished1 = phiIter1 == header1->phisEnd();
      // The two blocks must have the same number of phis.
      MOZ_ASSERT(finished0 == finished1);
      if (finished0) {
        break;
      }
      MPhi* phi0 = *phiIter0;
      MPhi* phi1 = *phiIter1;
      MOZ_ASSERT(phi0->numOperands() == 2);
      MOZ_ASSERT(phi1->numOperands() == 1);
      MDefinition* phi0arg1 = phi0->getOperand(1);
      phi0->removeOperand(1);
      if (!phi1->addInputFallible(phi0arg1)) {
        return false;
      }
      phiIter0++;
      phiIter1++;
    }

    // Fix the succ-w-phis entry for `backedge`. In very rare circumstances,
    // the loop may have no header phis, so this must be conditional.
    if (backedge->successorWithPhis()) {
      MOZ_ASSERT(backedge->successorWithPhis() == header0);
      MOZ_ASSERT(backedge->positionInPhiSuccessor() == 1);
      backedge->setSuccessorWithPhis(header1, 1);
    }
  }

#ifdef JS_JITSPEW
  if (JitSpewEnabled(JitSpew_UnrollDetails)) {
    DumpBlockTable(state.blockTable, "LOOP BLOCKS (final)");
    Fprinter& printer(JitSpewPrinter());
    printer.printf("<<<< SPLITTER BLOCKS\n");
    for (MBasicBlock* block : splitterBlocks) {
      DumpMIRBlock(printer, block, /*showDetails=*/true);
    }
    printer.printf(">>>>\n");
  }
#endif

  // Add the new blocks to the graph.
  {
    // We want to add them to the end of the existing loop.
1659 MBasicBlock* cursor = state.blockTable.get(0, numBlocksInOriginal - 1); 1660 for (uint32_t cix = 1; cix < unrollFactor; cix++) { 1661 for (uint32_t bix = 0; bix < numBlocksInOriginal; bix++) { 1662 MBasicBlock* addMe = state.blockTable.get(cix, bix); 1663 graph.insertBlockAfter(cursor, addMe); 1664 cursor = addMe; 1665 } 1666 } 1667 for (MBasicBlock* addMe : splitterBlocks) { 1668 graph.insertBlockAfter(cursor, addMe); 1669 cursor = addMe; 1670 } 1671 } 1672 1673 // UnrollLoops will renumber the blocks and redo the dominator tree once all 1674 // loops have been unrolled. 1675 1676 #ifdef JS_JITSPEW 1677 if (JitSpewEnabled(JitSpew_UnrollDetails)) { 1678 Fprinter& printer(JitSpewPrinter()); 1679 printer.printf("<<<< FUNCTION AFTER UNROLLING\n"); 1680 DumpMIRGraph(printer, graph, /*showDetails=*/true); 1681 printer.printf(">>>>\n"); 1682 } 1683 #endif 1684 1685 return true; 1686 } 1687 1688 // ===================================================================== 1689 // 1690 // FindInitialCandidates, which finds "initial candidates" for 1691 // unrolling/peeling. See SMDOC at the top of this file. 1692 1693 struct InitialCandidate { 1694 MBasicBlock* header; 1695 uint32_t numBlocks; 1696 float score; 1697 bool valid; 1698 InitialCandidate(MBasicBlock* header, uint32_t numBlocks) 1699 : header(header), numBlocks(numBlocks), score(0.0), valid(true) {} 1700 }; 1701 1702 using InitialCandidateVector = 1703 mozilla::Vector<InitialCandidate, 64, SystemAllocPolicy>; 1704 1705 [[nodiscard]] 1706 bool FindInitialCandidates(MIRGraph& graph, 1707 InitialCandidateVector& initialCandidates) { 1708 // Find initial candidate loops, and place them in `initialCandidates`. 1709 MOZ_ASSERT(initialCandidates.empty()); 1710 1711 // First, using a single scan over the blocks, find blocks that are the 1712 // headers of innermost loops. This uses the same logic to find innermost 1713 // loops as BacktrackingAllocator::init. 
1714 // 1715 // The general idea is: we traverse the entire graph in RPO order, but ignore 1716 // all blocks except loop headers and loop backedge blocks. At a header we 1717 // make a note of the backedge block it points at. Because the loops are 1718 // properly nested, this means that we will ignore backedges for 1719 // non-innermost loops, and, so, when we do come to a block that matches 1720 // `backedge`, we know we've identified an innermost loop. 1721 // 1722 // At the same time, release-assert that the blocks are numbered 1723 // sequentially. We need this for the loop-non-overlapping check below. 1724 mozilla::Vector<MBasicBlock*, 128, SystemAllocPolicy> initialHeaders; 1725 { 1726 MBasicBlock* backedge = nullptr; 1727 uint32_t expectedNextId = 0; 1728 for (auto rpoIter(graph.rpoBegin()), rpoIterEnd(graph.rpoEnd()); 1729 rpoIter != rpoIterEnd; ++rpoIter) { 1730 MBasicBlock* block = *rpoIter; 1731 MOZ_RELEASE_ASSERT(block->id() == expectedNextId); 1732 expectedNextId++; 1733 1734 if (block->isLoopHeader()) { 1735 backedge = block->backedge(); 1736 } 1737 if (block == backedge) { 1738 if (!initialHeaders.append(block->loopHeaderOfBackedge())) { 1739 return false; 1740 } 1741 } 1742 } 1743 } 1744 1745 // Now use MarkLoopBlocks to check each potential loop in more detail, and, 1746 // for each non-empty, non-OSR one, create an entry in initialCandidates for 1747 // it. 1748 for (MBasicBlock* header : initialHeaders) { 1749 MOZ_ASSERT(header->isLoopHeader()); 1750 1751 bool hasOsrEntry; 1752 size_t numBlocks = MarkLoopBlocks(graph, header, &hasOsrEntry); 1753 if (numBlocks == 0) { 1754 // Bogus; skip. 1755 continue; 1756 } 1757 UnmarkLoopBlocks(graph, header); 1758 if (hasOsrEntry) { 1759 // Unhandled; skip. 1760 continue; 1761 } 1762 1763 MOZ_RELEASE_ASSERT(numBlocks <= UINT32_MAX); 1764 if (!initialCandidates.append(InitialCandidate(header, numBlocks))) { 1765 return false; 1766 } 1767 } 1768 1769 // Sort the loops by header ID. 
1770 std::sort(initialCandidates.begin(), initialCandidates.end(), 1771 [](const InitialCandidate& cand1, const InitialCandidate& cand2) { 1772 return cand1.header->id() < cand2.header->id(); 1773 }); 1774 1775 // Now we can conveniently assert non-overlappingness. Note that this 1776 // depends on the property that each loop is a sequential sequence of block 1777 // IDs, as we release-asserted above. Presence of an overlap might indicate 1778 // that a non-innermost loop made it this far. 1779 for (size_t i = 1; i < initialCandidates.length(); i++) { 1780 MOZ_RELEASE_ASSERT(initialCandidates[i - 1].header->id() + 1781 initialCandidates[i - 1].numBlocks <= 1782 initialCandidates[i].header->id()); 1783 } 1784 1785 // Heuristics: calculate a "score" for each loop, indicating our desire to 1786 // unroll it. **Lower** values indicate a higher desirability for unrolling. 1787 // 1788 // A loop gets a lower score if: 1789 // * it is small 1790 // * it has few blocks 1791 // * it is deeply nested (currently ignored) 1792 1793 for (InitialCandidate& cand : initialCandidates) { 1794 uint32_t nBlocks = cand.numBlocks; 1795 uint32_t nInsns = 0; 1796 uint32_t nPhis = 0; 1797 1798 // Visit each block in the loop 1799 mozilla::DebugOnly<uint32_t> numBlocksVisited = 0; 1800 const MBasicBlock* backedge = cand.header->backedge(); 1801 1802 for (auto i(graph.rpoBegin(cand.header)); /**/; ++i) { 1803 MOZ_ASSERT(i != graph.rpoEnd(), 1804 "UnrollLoops: loop blocks overrun graph end"); 1805 MBasicBlock* block = *i; 1806 numBlocksVisited++; 1807 for (MInstructionIterator iter(block->begin()); iter != block->end(); 1808 iter++) { 1809 nInsns++; 1810 } 1811 for (MPhiIterator iter(block->phisBegin()); iter != block->phisEnd(); 1812 iter++) { 1813 nPhis++; 1814 } 1815 1816 if (block == backedge) { 1817 break; 1818 } 1819 } 1820 1821 MOZ_ASSERT(numBlocksVisited == nBlocks); 1822 1823 cand.score = 1824 10.0 * float(nBlocks) + 2.0 * float(nInsns) + 1.0 * float(nPhis); 1825 } 1826 1827 // Sort 
the loops by `score`. 1828 std::sort(initialCandidates.begin(), initialCandidates.end(), 1829 [](const InitialCandidate& cand1, const InitialCandidate& cand2) { 1830 return cand1.score < cand2.score; 1831 }); 1832 1833 return true; 1834 } 1835 1836 // ===================================================================== 1837 // 1838 // UnrollLoops, the top level of the unroller. 1839 1840 bool UnrollLoops(const MIRGenerator* mir, MIRGraph& graph, bool* changed) { 1841 *changed = false; 1842 1843 #ifdef JS_JITSPEW 1844 if (JitSpewEnabled(JitSpew_Unroll)) { 1845 JitSpew(JitSpew_Unroll, "BEGIN UnrollLoops"); 1846 } 1847 #endif 1848 1849 // Make up a vector of initial candidates, which are innermost loops only, 1850 // and not too big. 1851 1852 InitialCandidateVector initialCandidates; 1853 if (!FindInitialCandidates(graph, initialCandidates)) { 1854 return false; 1855 } 1856 1857 #ifdef JS_JITSPEW 1858 if (JitSpewEnabled(JitSpew_Unroll)) { 1859 for (const InitialCandidate& cand : initialCandidates) { 1860 MOZ_ASSERT(cand.valid); 1861 JitSpew(JitSpew_Unroll, " initial cand: score=%-5.1f blocks=%u-%u", 1862 cand.score, cand.header->id(), 1863 cand.header->id() + cand.numBlocks - 1); 1864 } 1865 } 1866 #endif 1867 1868 // Perform a more expensive analysis of the candidates, leading to a further 1869 // reduction in the set. For those remaining, an UnrollState is created and 1870 // initialized. Also a decision is made whether to unroll only, peel only, 1871 // or both. 1872 1873 mozilla::Vector<UnrollState, 32, SystemAllocPolicy> unrollStates; 1874 1875 for (const InitialCandidate& cand : initialCandidates) { 1876 if (cand.score > InitialCandidateThreshold) { 1877 // Give up. We just sorted them by `score`, so the rest will be even 1878 // more undesirable for unrolling. 1879 break; 1880 } 1881 1882 // Park the loop's blocks in a vector, so we can hand it off to 1883 // AnalyzeLoops for examination. 
    // This is the place where the original
    // block sequence, as seen by the entire rest of the machinery, is fixed.
    BlockVector originalBlocks;

    const MBasicBlock* backedge = cand.header->backedge();

    // Collect the loop's blocks, from header to backedge inclusive, in RPO.
    for (auto blockIter(graph.rpoBegin(cand.header)); /**/; ++blockIter) {
      MOZ_ASSERT(blockIter != graph.rpoEnd());
      MBasicBlock* block = *blockIter;
      if (!originalBlocks.append(block)) {
        return false;
      }
      if (block == backedge) {
        break;
      }
    }

    // Analyze the candidate in detail.  If unrolling and/or peeling is
    // approved, this also collects up for us, the loop's target block set and
    // exiting value set.
    BlockSet exitTargetBlocks;
    ValueSet exitingValues;

    AnalysisResult res =
        AnalyzeLoop(originalBlocks, &exitTargetBlocks, &exitingValues);

#ifdef JS_JITSPEW
    if (JitSpewEnabled(JitSpew_Unroll)) {
      MOZ_ASSERT(cand.valid);
      JitSpew(JitSpew_Unroll, " %13s: score=%-5.1f blocks=%u-%u",
              Name_of_AnalysisResult(res), cand.score, cand.header->id(),
              cand.header->id() + cand.numBlocks - 1);
    }
#endif

    switch (res) {
      case AnalysisResult::OOM:
        return false;
      case AnalysisResult::Peel:
      case AnalysisResult::Unroll:
      case AnalysisResult::PeelAndUnroll:
        break;
      case AnalysisResult::BadInvariants:
        // Structural invariants we rely on do not hold; this indicates a
        // problem upstream, not merely an unsuitable loop.
        MOZ_CRASH("js::jit::UnrollLoops: unexpected incoming MIR");
      case AnalysisResult::TooComplex:
      case AnalysisResult::Uncloneable:
      case AnalysisResult::TooLarge:
      case AnalysisResult::Unsuitable:
        // Ignore this loop
        continue;
      default:
        MOZ_CRASH();
    }

    // The candidate has been accepted for unrolling and/or peeling.  Make an
    // initial UnrollState for it and add it to our collection thereof.

    UnrollMode mode;
    if (res == AnalysisResult::Peel) {
      mode = UnrollMode::Peel;
    } else if (res == AnalysisResult::Unroll) {
      mode = UnrollMode::Unroll;
    } else if (res == AnalysisResult::PeelAndUnroll) {
      mode = UnrollMode::PeelAndUnroll;
    } else {
      MOZ_CRASH();
    }

    if (!unrollStates.emplaceBack(mode)) {
      return false;
    }
    UnrollState* state = &unrollStates.back();
    if (!state->exitTargetBlocks.copy(exitTargetBlocks) ||
        !state->exitingValues.copy(exitingValues)) {
      return false;
    }

    // Ignoring peeling, how many times should we unroll a loop?
    // For example, if the original loop was `loop { B; }` then with
    // `basicUnrollingFactor = 3`, the unrolled loop will be `loop { B; B; B;
    // }`.  If peeling is also requested then we will unroll one more time than
    // this, giving overall result `B; loop { B; B; B; }`.
    //
    // The pref value is clamped into the range [2 .. 8].
    uint32_t basicUnrollingFactor = JS::Prefs::wasm_unroll_factor();
    if (basicUnrollingFactor < 2) {
      // It needs to be at least 2, else we're not unrolling at all.
      basicUnrollingFactor = 2;
    } else if (basicUnrollingFactor > 8) {
      // Unrolling gives rapidly diminishing marginal gains.  Even beyond 4
      // there's little benefit to be had.
      basicUnrollingFactor = 8;
    }

    // Decide on the total number of duplicates of the loop body we'll need
    // (including the original).
    uint32_t unrollingFactor;
    if (res == AnalysisResult::Peel) {
      // 1 for the peeled iteration,
      // 1 for the loop (since we are not unrolling it)
      unrollingFactor = 1 + 1;
    } else if (res == AnalysisResult::Unroll) {
      // No peeled iteration, we unroll only
      unrollingFactor = 0 + basicUnrollingFactor;
    } else if (res == AnalysisResult::PeelAndUnroll) {
      // 1 peeled iteration, and we unroll the loop
      unrollingFactor = 1 + basicUnrollingFactor;
    } else {
      MOZ_CRASH();
    }

    // Set up UnrollState::blockTable and copy in the initial loop blocks.
    // Row 0 holds the original blocks; rows for the extra body copies are
    // filled in later by UnrollAndOrPeelLoop.
    if (!state->blockTable.init(unrollingFactor, originalBlocks.length())) {
      return false;
    }
    for (uint32_t bix = 0; bix < originalBlocks.length(); bix++) {
      state->blockTable.set(0, bix, originalBlocks[bix]);
    }
  }

  // Now, finally, we have an initialized vector of UnrollStates ready to go.
  // After this point, `originalBlocks` is no longer used.  Also, because of
  // the checking done in AnalyzeLoop, we know the transformations can't fail
  // (apart from OOMing).

  // Add "loop-closing phis" for values defined in the loop and used
  // afterwards.  See comments on AddClosingPhisForLoop.
  for (const UnrollState& state : unrollStates) {
    if (!AddClosingPhisForLoop(graph.alloc(), state)) {
      return false;
    }
  }

  // Actually do the unrolling and/or peeling.
  uint32_t numLoopsPeeled = 0;
  uint32_t numLoopsUnrolled = 0;
  uint32_t numLoopsPeeledAndUnrolled = 0;
  for (UnrollState& state : unrollStates) {
    if (!UnrollAndOrPeelLoop(graph, state)) {
      return false;
    }
    // Update stats.
    switch (state.mode) {
      case UnrollMode::Peel:
        numLoopsPeeled++;
        break;
      case UnrollMode::Unroll:
        numLoopsUnrolled++;
        break;
      case UnrollMode::PeelAndUnroll:
        numLoopsPeeledAndUnrolled++;
        break;
      default:
        MOZ_CRASH();
    }
  }

  // Do function-level fixups, if anything changed.  Unrolling invalidated the
  // block numbering and the dominator tree, so rebuild both.
  if (!unrollStates.empty()) {
    RenumberBlocks(graph);
    ClearDominatorTree(graph);
    if (!BuildDominatorTree(mir, graph)) {
      return false;
    }
  }

  uint32_t numLoopsChanged =
      numLoopsPeeled + numLoopsUnrolled + numLoopsPeeledAndUnrolled;

#ifdef JS_JITSPEW
  if (JitSpewEnabled(JitSpew_Unroll)) {
    if (numLoopsChanged == 0) {
      JitSpew(JitSpew_Unroll, "END UnrollLoops");
    } else {
      JitSpew(JitSpew_Unroll,
              "END UnrollLoops, %u processed (P=%u, U=%u, P&U=%u)",
              numLoopsChanged, numLoopsPeeled, numLoopsUnrolled,
              numLoopsPeeledAndUnrolled);
    }
  }
#endif

  *changed = numLoopsChanged > 0;
  return true;
}

}  // namespace jit
}  // namespace js