[ tor-browser ].git.dasho

commit 86cef46aa081c087e3129bb5e012311b9494500d
parent 7b29457f213ccab25b5e2ea05bae3210936f0c57
Author: André Bargull <andre.bargull@gmail.com>
Date:   Mon, 27 Oct 2025 09:00:00 +0000

Bug 1996345 - Part 1: Use three operand imul instruction when possible. r=spidermonkey-reviewers,iain

Improves codegen for:
```js
function f(x) {
  return (Math.imul(x, 10) + x)|0;
}
```

From:
```asm
movl       %eax, %ecx
imull      $10, %eax, %eax
addl       %ecx, %eax
```

To:
```asm
imull      $10, %eax, %ecx
addl       %eax, %ecx
```

For small constants use `lea` to match GCC/Clang codegen.

Differential Revision: https://phabricator.services.mozilla.com/D270021

Diffstat:
M js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js  | 14 +++++++-------
M js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js  | 42 ++++++++++++++----------------------------
M js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M js/src/jit/x86-shared/Lowering-x86-shared.cpp  | 13 +++++++++++--

4 files changed, 94 insertions(+), 53 deletions(-)
diff --git a/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js
@@ -37,7 +37,7 @@ let zero32 =
 codegenTestX64_adhoc(
     zero32,
     'f',
-    'xor %eax, %eax', {no_prefix:true});
+    'xor %eax, %eax');
 assertEq(wasmEvalText(zero32).exports.f(-37), 0)
 assertEq(wasmEvalText(zero32).exports.f(42), 0)
 
@@ -74,7 +74,7 @@ codegenTestX64_adhoc(
 assertEq(wasmEvalText(one64).exports.f(-37000000000n), -37000000000n)
 assertEq(wasmEvalText(one64).exports.f(42000000000n), 42000000000n)
 
-// Test that multiplication by two yields an add
+// Test that multiplication by two yields lea/add
 
 let double32 =
     `(module
@@ -83,7 +83,7 @@ let double32 =
 codegenTestX64_adhoc(
     double32,
     'f',
-    'add %eax, %eax', {no_prefix:true});
+    'lea \\(%rdi,%rdi,1\\), %eax');
 assertEq(wasmEvalText(double32).exports.f(-37), -74)
 assertEq(wasmEvalText(double32).exports.f(42), 84)
 
@@ -97,7 +97,7 @@ codegenTestX64_adhoc(
 assertEq(wasmEvalText(double64).exports.f(-37000000000n), -74000000000n)
 assertEq(wasmEvalText(double64).exports.f(42000000000n), 84000000000n)
 
-// Test that multiplication by four yields a shift
+// Test that multiplication by four yields lea/shift
 
 let quad32 =
     `(module
@@ -106,7 +106,7 @@ let quad32 =
 codegenTestX64_adhoc(
     quad32,
     'f',
-    'shl \\$0x02, %eax', {no_prefix:true});
+    'lea \\(,%rdi,4\\), %eax');
 assertEq(wasmEvalText(quad32).exports.f(-37), -148)
 assertEq(wasmEvalText(quad32).exports.f(42), 168)
 
@@ -120,7 +120,7 @@ codegenTestX64_adhoc(
 assertEq(wasmEvalText(quad64).exports.f(-37000000000n), -148000000000n)
 assertEq(wasmEvalText(quad64).exports.f(42000000000n), 168000000000n)
 
-// Test that multiplication by five yields a multiply
+// Test that multiplication by five yields lea/imul
 
 let quint32 =
     `(module
@@ -129,7 +129,7 @@ let quint32 =
 codegenTestX64_adhoc(
     quint32,
     'f',
-    'imul \\$0x05, %eax, %eax', {no_prefix:true});
+    'lea \\(%rdi,%rdi,4\\), %eax');
 assertEq(wasmEvalText(quint32).exports.f(-37), -37*5)
 assertEq(wasmEvalText(quint32).exports.f(42), 42*5)
 
diff --git a/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js b/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js
@@ -30,18 +30,7 @@ codegenTestMultiplatform_adhoc(
     `(module (func (export "mul32_zeroL") (param $p1 i32) (result i32)
        (i32.mul (i32.const 0) (local.get $p1))))`,
     "mul32_zeroL",
-    {x64:   // FIXME move folding to MIR level
-            // First we move edi to eax unnecessarily via ecx (bug 1752520),
-            // then we overwrite eax.  Presumably because the folding
-            // 0 * x => 0 is done at the LIR level, not the MIR level, hence
-            // the now-pointless WasmParameter node is not DCE'd away, since
-            // DCE only happens at the MIR level.  In fact all targets suffer
-            // from the latter problem, but on x86 no_prefix_x86:true
-            // hides it, and on arm32/64 the pointless move is correctly
-            // transformed by RA into a no-op.
-            `mov %edi, %ecx
-             mov %ecx, %eax
-             xor %eax, %eax`,
+    {x64:   `xor %eax, %eax`,
      x86:   `xor %eax, %eax`,
      arm64: `mov w0, wzr`,
      arm:   `mov r0, #0`},
@@ -52,7 +41,14 @@ codegenTestMultiplatform_adhoc(
        (i64.mul (i64.const 0) (local.get $p1))))`,
     "mul64_zeroL",
     // FIXME folding happened, zero-creation insns could be improved
-    {x64:   // Same shenanigans as above.  Also, on xor, REX.W is redundant.
+    {x64:   // First we move rdi to rax unnecessarily via rcx (bug 1752520),
+            // then we overwrite rax.  Presumably because the folding
+            // 0 * x => 0 is done at the LIR level, not the MIR level, hence
+            // the now-pointless WasmParameter node is not DCE'd away, since
+            // DCE only happens at the MIR level.  In fact all targets suffer
+            // from the latter problem, but on x86 no_prefix_x86:true
+            // hides it, and on arm32/64 the pointless move is correctly
+            // transformed by RA into a no-op.  Also, on xor, REX.W is redundant.
             `mov %rdi, %rcx
              mov %rcx, %rax
              xor %rax, %rax`,
@@ -118,9 +114,7 @@ codegenTestMultiplatform_adhoc(
     `(module (func (export "mul32_twoL") (param $p1 i32) (result i32)
        (i32.mul (i32.const 2) (local.get $p1))))`,
     "mul32_twoL",
-    {x64:   `mov %edi, %ecx
-             mov %ecx, %eax
-             add %eax, %eax`,
+    {x64:   `lea \\(%rdi,%rdi,1\\), %eax`,
      x86:   `movl 0x10\\(%rbp\\), %eax
              add %eax, %eax`,
      arm64: `add w0, w0, w0`,
@@ -148,9 +142,7 @@ codegenTestMultiplatform_adhoc(
     `(module (func (export "mul32_fourL") (param $p1 i32) (result i32)
        (i32.mul (i32.const 4) (local.get $p1))))`,
     "mul32_fourL",
-    {x64:   `mov %edi, %ecx
-             mov %ecx, %eax
-             shl \\$0x02, %eax`,
+    {x64:   `lea \\(,%rdi,4\\), %eax`,
      x86:   `movl 0x10\\(%rbp\\), %eax
              shl \\$0x02, %eax`,
      arm64: `lsl w0, w0, #2`,
@@ -187,9 +179,7 @@ codegenTestMultiplatform_adhoc(
     `(module (func (export "mul32_zeroR") (param $p1 i32) (result i32)
        (i32.mul (local.get $p1) (i32.const 0))))`,
     "mul32_zeroR",
-    {x64:   `mov %edi, %ecx
-             mov %ecx, %eax
-             xor %eax, %eax`,
+    {x64:   `xor %eax, %eax`,
      x86:   `xor %eax, %eax`,
      arm64: `mov w0, wzr`,
      arm:   `mov r0, #0`},
@@ -264,9 +254,7 @@ codegenTestMultiplatform_adhoc(
     `(module (func (export "mul32_twoR") (param $p1 i32) (result i32)
        (i32.mul (local.get $p1) (i32.const 2))))`,
     "mul32_twoR",
-    {x64:   `mov %edi, %ecx
-             mov %ecx, %eax
-             add %eax, %eax`,
+    {x64:   `lea \\(%rdi,%rdi,1\\), %eax`,
      x86:   `movl 0x10\\(%rbp\\), %eax
              add %eax, %eax`,
      arm64: `add w0, w0, w0`,
@@ -294,9 +282,7 @@ codegenTestMultiplatform_adhoc(
     `(module (func (export "mul32_fourR") (param $p1 i32) (result i32)
        (i32.mul (local.get $p1) (i32.const 4))))`,
     "mul32_fourR",
-    {x64:   `mov %edi, %ecx
-             mov %ecx, %eax
-             shl \\$0x02, %eax`,
+    {x64:   `lea \\(,%rdi,4\\), %eax`,
      x86:   `movl 0x10\\(%rbp\\), %eax
              shl \\$0x02, %eax`,
      arm64: `lsl w0, w0, #2`,
diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -746,8 +746,7 @@ void CodeGenerator::visitSubI64(LSubI64* lir) {
 void CodeGenerator::visitMulI(LMulI* ins) {
   Register lhs = ToRegister(ins->lhs());
   const LAllocation* rhs = ins->rhs();
-
-  MOZ_ASSERT(ToRegister(ins->output()) == lhs);
+  Register out = ToRegister(ins->output());
 
   MMul* mul = ins->mir();
   MOZ_ASSERT_IF(mul->mode() == MMul::Integer,
@@ -763,29 +762,74 @@ void CodeGenerator::visitMulI(LMulI* ins) {
       bailoutIf(bailoutCond, ins->snapshot());
     }
 
+    if (!mul->canOverflow()) {
+      switch (constant) {
+        case 2:
+          if (lhs == out) {
+            masm.addl(lhs, lhs);
+          } else {
+            masm.leal(Operand(lhs, lhs, TimesOne), out);
+          }
+          return;
+        case 3:
+          masm.leal(Operand(lhs, lhs, TimesTwo), out);
+          return;
+        case 4:
+          if (lhs == out) {
+            masm.shll(Imm32(2), lhs);
+          } else {
+            masm.leal(Operand(lhs, TimesFour, 0), out);
+          }
+          return;
+        case 5:
+          masm.leal(Operand(lhs, lhs, TimesFour), out);
+          return;
+        case 8:
+          if (lhs == out) {
+            masm.shll(Imm32(3), lhs);
+          } else {
+            masm.leal(Operand(lhs, TimesEight, 0), out);
+          }
+          return;
+        case 9:
+          masm.leal(Operand(lhs, lhs, TimesEight), out);
+          return;
+        default:
+          // Use shift if cannot overflow and constant is power of 2
+          int32_t shift = FloorLog2(constant);
+          if (constant > 0 && (1 << shift) == constant) {
+            if (lhs != out) {
+              masm.movl(lhs, out);
+            }
+            masm.shll(Imm32(shift), out);
+            return;
+          }
+      }
+    }
+
     switch (constant) {
       case -1:
-        masm.negl(lhs);
+        if (lhs != out) {
+          masm.movl(lhs, out);
+        }
+        masm.negl(out);
         break;
       case 0:
-        masm.xorl(lhs, lhs);
+        masm.xorl(out, out);
         return;  // escape overflow check;
       case 1:
-        // nop
+        if (lhs != out) {
+          masm.movl(lhs, out);
+        }
         return;  // escape overflow check;
       case 2:
-        masm.addl(lhs, lhs);
-        break;
-      default:
-        if (!mul->canOverflow() && constant > 0) {
-          // Use shift if cannot overflow and constant is power of 2
-          int32_t shift = FloorLog2(constant);
-          if ((1 << shift) == constant) {
-            masm.shll(Imm32(shift), lhs);
-            return;
-          }
+        if (lhs == out) {
+          masm.addl(lhs, lhs);
+          break;
         }
-        masm.imull(Imm32(constant), lhs);
+        [[fallthrough]];
+      default:
+        masm.imull(Imm32(constant), lhs, out);
     }
 
     // Bailout on overflow
@@ -793,6 +837,8 @@ void CodeGenerator::visitMulI(LMulI* ins) {
       bailoutIf(Assembler::Overflow, ins->snapshot());
     }
   } else {
+    MOZ_ASSERT(out == lhs);
+
     masm.imull(ToOperand(rhs), lhs);
 
     // Bailout on overflow
diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -185,12 +185,21 @@ void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, 0>* ins,
 
 void LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs,
                                       MDefinition* rhs) {
+  if (rhs->isConstant()) {
+    auto* lir = new (alloc()) LMulI(useRegisterAtStart(lhs),
+                                    useOrConstantAtStart(rhs), LAllocation());
+    if (mul->fallible()) {
+      assignSnapshot(lir, mul->bailoutKind());
+    }
+    define(lir, mul);
+    return;
+  }
+
   // Note: If we need a negative zero check, lhs is used twice.
   LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation();
   LMulI* lir = new (alloc())
       LMulI(useRegisterAtStart(lhs),
-            willHaveDifferentLIRNodes(lhs, rhs) ? useOrConstant(rhs)
-                                                : useOrConstantAtStart(rhs),
+            willHaveDifferentLIRNodes(lhs, rhs) ? use(rhs) : useAtStart(rhs),
             lhsCopy);
   if (mul->fallible()) {
     assignSnapshot(lir, mul->bailoutKind());

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js	\|	14	+++++++-------
M	js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js	\|	42	++++++++++++++----------------------------
M	js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp	\|	78	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	js/src/jit/x86-shared/Lowering-x86-shared.cpp	\|	13	+++++++++++--