commit 5f94800ce6841cba208fda2cefa3b35603147591
parent fc1621fc5ee0c4a2fe6de385297771edd8a77ee4
Author: André Bargull <andre.bargull@gmail.com>
Date: Mon, 27 Oct 2025 15:22:13 +0000
Bug 1996345 - Part 1: Use three operand imul instruction when possible. r=spidermonkey-reviewers,iain
Improves codegen for:
```js
function f(x) {
return (Math.imul(x, 10) + x)|0;
}
```
From:
```asm
movl %eax, %ecx
imull $10, %eax, %eax
addl %ecx, %eax
```
To:
```asm
imull $10, %eax, %ecx
addl %eax, %ecx
```
For small constants use `lea` to match GCC/Clang codegen.
Differential Revision: https://phabricator.services.mozilla.com/D270021
Diffstat:
4 files changed, 118 insertions(+), 56 deletions(-)
diff --git a/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js
@@ -4,6 +4,27 @@
// bug 1701164) so we avoid those with no_prefix/no_suffix flags when the issue
// comes up.
+// Handle calling convention differences.
+function codegenTestX64_adhoc_call(module_text, export_name, expected, options = {}) {
+ // Microsoft x64 calling convention passes the first four arguments in
+ // registers rcx, rdx, r8, r9 (in that order).
+ if (getBuildConfiguration("windows")) {
+ // Arguments passed on the stack are not yet supported.
+ assertEq(
+ expected.includes("%r8") || expected.includes("%r9"),
+ false,
+ "too many arguments"
+ );
+ expected = expected.replaceAll("%rcx", "%r9")
+ .replaceAll("%rdx", "%r8")
+ .replaceAll("%rsi", "%rdx")
+ .replaceAll("%rdi", "%rcx")
+ .replaceAll("%esi", "%edx")
+ .replaceAll("%edi", "%ecx");
+ }
+ codegenTestX64_adhoc(module_text, export_name, expected, options);
+}
+
// Test that multiplication by -1 yields negation.
let neg32 =
@@ -37,7 +58,7 @@ let zero32 =
codegenTestX64_adhoc(
zero32,
'f',
- 'xor %eax, %eax', {no_prefix:true});
+ 'xor %eax, %eax');
assertEq(wasmEvalText(zero32).exports.f(-37), 0)
assertEq(wasmEvalText(zero32).exports.f(42), 0)
@@ -74,16 +95,16 @@ codegenTestX64_adhoc(
assertEq(wasmEvalText(one64).exports.f(-37000000000n), -37000000000n)
assertEq(wasmEvalText(one64).exports.f(42000000000n), 42000000000n)
-// Test that multiplication by two yields an add
+// Test that multiplication by two yields lea/add
let double32 =
`(module
(func (export "f") (param i32) (result i32)
(i32.mul (local.get 0) (i32.const 2))))`;
-codegenTestX64_adhoc(
+codegenTestX64_adhoc_call(
double32,
'f',
- 'add %eax, %eax', {no_prefix:true});
+ 'lea \\(%rdi,%rdi,1\\), %eax');
assertEq(wasmEvalText(double32).exports.f(-37), -74)
assertEq(wasmEvalText(double32).exports.f(42), 84)
@@ -97,16 +118,16 @@ codegenTestX64_adhoc(
assertEq(wasmEvalText(double64).exports.f(-37000000000n), -74000000000n)
assertEq(wasmEvalText(double64).exports.f(42000000000n), 84000000000n)
-// Test that multiplication by four yields a shift
+// Test that multiplication by four yields lea/shift
let quad32 =
`(module
(func (export "f") (param i32) (result i32)
(i32.mul (local.get 0) (i32.const 4))))`;
-codegenTestX64_adhoc(
+codegenTestX64_adhoc_call(
quad32,
'f',
- 'shl \\$0x02, %eax', {no_prefix:true});
+ 'lea \\(,%rdi,4\\), %eax');
assertEq(wasmEvalText(quad32).exports.f(-37), -148)
assertEq(wasmEvalText(quad32).exports.f(42), 168)
@@ -120,16 +141,16 @@ codegenTestX64_adhoc(
assertEq(wasmEvalText(quad64).exports.f(-37000000000n), -148000000000n)
assertEq(wasmEvalText(quad64).exports.f(42000000000n), 168000000000n)
-// Test that multiplication by five yields a multiply
+// Test that multiplication by five yields lea/imul
let quint32 =
`(module
(func (export "f") (param i32) (result i32)
(i32.mul (local.get 0) (i32.const 5))))`;
-codegenTestX64_adhoc(
+codegenTestX64_adhoc_call(
quint32,
'f',
- 'imul \\$0x05, %eax, %eax', {no_prefix:true});
+ 'lea \\(%rdi,%rdi,4\\), %eax');
assertEq(wasmEvalText(quint32).exports.f(-37), -37*5)
assertEq(wasmEvalText(quint32).exports.f(42), 42*5)
diff --git a/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js b/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js
@@ -30,18 +30,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_zeroL") (param $p1 i32) (result i32)
(i32.mul (i32.const 0) (local.get $p1))))`,
"mul32_zeroL",
- {x64: // FIXME move folding to MIR level
- // First we move edi to eax unnecessarily via ecx (bug 1752520),
- // then we overwrite eax. Presumably because the folding
- // 0 * x => 0 is done at the LIR level, not the MIR level, hence
- // the now-pointless WasmParameter node is not DCE'd away, since
- // DCE only happens at the MIR level. In fact all targets suffer
- // from the latter problem, but on x86 no_prefix_x86:true
- // hides it, and on arm32/64 the pointless move is correctly
- // transformed by RA into a no-op.
- `mov %edi, %ecx
- mov %ecx, %eax
- xor %eax, %eax`,
+ {x64: `xor %eax, %eax`,
x86: `xor %eax, %eax`,
arm64: `mov w0, wzr`,
arm: `mov r0, #0`},
@@ -52,7 +41,14 @@ codegenTestMultiplatform_adhoc(
(i64.mul (i64.const 0) (local.get $p1))))`,
"mul64_zeroL",
// FIXME folding happened, zero-creation insns could be improved
- {x64: // Same shenanigans as above. Also, on xor, REX.W is redundant.
+ {x64: // First we move rdi to rax unnecessarily via rcx (bug 1752520),
+ // then we overwrite rax. Presumably because the folding
+ // 0 * x => 0 is done at the LIR level, not the MIR level, hence
+ // the now-pointless WasmParameter node is not DCE'd away, since
+ // DCE only happens at the MIR level. In fact all targets suffer
+ // from the latter problem, but on x86 no_prefix_x86:true
+ // hides it, and on arm32/64 the pointless move is correctly
+ // transformed by RA into a no-op. Also, on xor, REX.W is redundant.
`mov %rdi, %rcx
mov %rcx, %rax
xor %rax, %rax`,
@@ -118,9 +114,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_twoL") (param $p1 i32) (result i32)
(i32.mul (i32.const 2) (local.get $p1))))`,
"mul32_twoL",
- {x64: `mov %edi, %ecx
- mov %ecx, %eax
- add %eax, %eax`,
+ {x64: `lea \\(%rdi,%rdi,1\\), %eax`,
x86: `movl 0x10\\(%rbp\\), %eax
add %eax, %eax`,
arm64: `add w0, w0, w0`,
@@ -148,9 +142,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_fourL") (param $p1 i32) (result i32)
(i32.mul (i32.const 4) (local.get $p1))))`,
"mul32_fourL",
- {x64: `mov %edi, %ecx
- mov %ecx, %eax
- shl \\$0x02, %eax`,
+ {x64: `lea \\(,%rdi,4\\), %eax`,
x86: `movl 0x10\\(%rbp\\), %eax
shl \\$0x02, %eax`,
arm64: `lsl w0, w0, #2`,
@@ -187,9 +179,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_zeroR") (param $p1 i32) (result i32)
(i32.mul (local.get $p1) (i32.const 0))))`,
"mul32_zeroR",
- {x64: `mov %edi, %ecx
- mov %ecx, %eax
- xor %eax, %eax`,
+ {x64: `xor %eax, %eax`,
x86: `xor %eax, %eax`,
arm64: `mov w0, wzr`,
arm: `mov r0, #0`},
@@ -264,9 +254,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_twoR") (param $p1 i32) (result i32)
(i32.mul (local.get $p1) (i32.const 2))))`,
"mul32_twoR",
- {x64: `mov %edi, %ecx
- mov %ecx, %eax
- add %eax, %eax`,
+ {x64: `lea \\(%rdi,%rdi,1\\), %eax`,
x86: `movl 0x10\\(%rbp\\), %eax
add %eax, %eax`,
arm64: `add w0, w0, w0`,
@@ -294,9 +282,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_fourR") (param $p1 i32) (result i32)
(i32.mul (local.get $p1) (i32.const 4))))`,
"mul32_fourR",
- {x64: `mov %edi, %ecx
- mov %ecx, %eax
- shl \\$0x02, %eax`,
+ {x64: `lea \\(,%rdi,4\\), %eax`,
x86: `movl 0x10\\(%rbp\\), %eax
shl \\$0x02, %eax`,
arm64: `lsl w0, w0, #2`,
diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -746,8 +746,7 @@ void CodeGenerator::visitSubI64(LSubI64* lir) {
void CodeGenerator::visitMulI(LMulI* ins) {
Register lhs = ToRegister(ins->lhs());
const LAllocation* rhs = ins->rhs();
-
- MOZ_ASSERT(ToRegister(ins->output()) == lhs);
+ Register out = ToRegister(ins->output());
MMul* mul = ins->mir();
MOZ_ASSERT_IF(mul->mode() == MMul::Integer,
@@ -763,29 +762,74 @@ void CodeGenerator::visitMulI(LMulI* ins) {
bailoutIf(bailoutCond, ins->snapshot());
}
+ if (!mul->canOverflow()) {
+ switch (constant) {
+ case 2:
+ if (lhs == out) {
+ masm.addl(lhs, lhs);
+ } else {
+ masm.leal(Operand(lhs, lhs, TimesOne), out);
+ }
+ return;
+ case 3:
+ masm.leal(Operand(lhs, lhs, TimesTwo), out);
+ return;
+ case 4:
+ if (lhs == out) {
+ masm.shll(Imm32(2), lhs);
+ } else {
+ masm.leal(Operand(lhs, TimesFour, 0), out);
+ }
+ return;
+ case 5:
+ masm.leal(Operand(lhs, lhs, TimesFour), out);
+ return;
+ case 8:
+ if (lhs == out) {
+ masm.shll(Imm32(3), lhs);
+ } else {
+ masm.leal(Operand(lhs, TimesEight, 0), out);
+ }
+ return;
+ case 9:
+ masm.leal(Operand(lhs, lhs, TimesEight), out);
+ return;
+ default:
+        // Use a shift when the multiplication cannot overflow and the
+        // constant is a positive power of two.
+ int32_t shift = FloorLog2(constant);
+ if (constant > 0 && (1 << shift) == constant) {
+ if (lhs != out) {
+ masm.movl(lhs, out);
+ }
+ masm.shll(Imm32(shift), out);
+ return;
+ }
+ }
+ }
+
switch (constant) {
case -1:
- masm.negl(lhs);
+ if (lhs != out) {
+ masm.movl(lhs, out);
+ }
+ masm.negl(out);
break;
case 0:
- masm.xorl(lhs, lhs);
+ masm.xorl(out, out);
return; // escape overflow check;
case 1:
- // nop
+ if (lhs != out) {
+ masm.movl(lhs, out);
+ }
return; // escape overflow check;
case 2:
- masm.addl(lhs, lhs);
- break;
- default:
- if (!mul->canOverflow() && constant > 0) {
- // Use shift if cannot overflow and constant is power of 2
- int32_t shift = FloorLog2(constant);
- if ((1 << shift) == constant) {
- masm.shll(Imm32(shift), lhs);
- return;
- }
+ if (lhs == out) {
+ masm.addl(lhs, lhs);
+ break;
}
- masm.imull(Imm32(constant), lhs);
+ [[fallthrough]];
+ default:
+ masm.imull(Imm32(constant), lhs, out);
}
// Bailout on overflow
@@ -793,6 +837,8 @@ void CodeGenerator::visitMulI(LMulI* ins) {
bailoutIf(Assembler::Overflow, ins->snapshot());
}
} else {
+ MOZ_ASSERT(out == lhs);
+
masm.imull(ToOperand(rhs), lhs);
// Bailout on overflow
diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@@ -185,12 +185,21 @@ void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, 0>* ins,
void LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs,
MDefinition* rhs) {
+ if (rhs->isConstant()) {
+ auto* lir = new (alloc()) LMulI(useRegisterAtStart(lhs),
+ useOrConstantAtStart(rhs), LAllocation());
+ if (mul->fallible()) {
+ assignSnapshot(lir, mul->bailoutKind());
+ }
+ define(lir, mul);
+ return;
+ }
+
// Note: If we need a negative zero check, lhs is used twice.
LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation();
LMulI* lir = new (alloc())
LMulI(useRegisterAtStart(lhs),
- willHaveDifferentLIRNodes(lhs, rhs) ? useOrConstant(rhs)
- : useOrConstantAtStart(rhs),
+ willHaveDifferentLIRNodes(lhs, rhs) ? use(rhs) : useAtStart(rhs),
lhsCopy);
if (mul->fallible()) {
assignSnapshot(lir, mul->bailoutKind());