commit 130494460295b72c62f2d7c6763cb00c89189a85
parent 979b5c3d12aa2503aabdff77ec4c8f71ed3b9d37
Author: André Bargull <andre.bargull@gmail.com>
Date: Mon, 27 Oct 2025 09:00:01 +0000
Bug 1996345 - Part 4: Use three operand imul instruction for Int64 when possible. r=spidermonkey-reviewers,iain
Applies the changes from part 1 for Int64 multiplication.
Differential Revision: https://phabricator.services.mozilla.com/D270024
Diffstat:
5 files changed, 129 insertions(+), 60 deletions(-)
diff --git a/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js
@@ -47,7 +47,7 @@ let zero64 = `(module
codegenTestX64_adhoc(
zero64,
'f',
- 'xor %rax, %rax', {no_prefix:true});
+ 'xor %rax, %rax');
assertEq(wasmEvalText(zero64).exports.f(-37000000000n), 0n)
assertEq(wasmEvalText(zero64).exports.f(42000000000n), 0n)
@@ -74,7 +74,7 @@ codegenTestX64_adhoc(
assertEq(wasmEvalText(one64).exports.f(-37000000000n), -37000000000n)
assertEq(wasmEvalText(one64).exports.f(42000000000n), 42000000000n)
-// Test that multiplication by two yields lea/add
+// Test that multiplication by two yields lea
let double32 =
`(module
@@ -93,11 +93,11 @@ let double64 = `(module
codegenTestX64_adhoc(
double64,
'f',
- 'add %rax, %rax', {no_prefix:true});
+ 'lea \\(%rdi,%rdi,1\\), %rax');
assertEq(wasmEvalText(double64).exports.f(-37000000000n), -74000000000n)
assertEq(wasmEvalText(double64).exports.f(42000000000n), 84000000000n)
-// Test that multiplication by four yields lea/shift
+// Test that multiplication by four yields lea
let quad32 =
`(module
@@ -116,11 +116,11 @@ let quad64 = `(module
codegenTestX64_adhoc(
quad64,
'f',
- 'shl \\$0x02, %rax', {no_prefix:true});
+ 'lea \\(,%rdi,4\\), %rax');
assertEq(wasmEvalText(quad64).exports.f(-37000000000n), -148000000000n)
assertEq(wasmEvalText(quad64).exports.f(42000000000n), 168000000000n)
-// Test that multiplication by five yields lea/imul
+// Test that multiplication by five yields lea
let quint32 =
`(module
@@ -139,10 +139,46 @@ let quint64 = `(module
codegenTestX64_adhoc(
quint64,
'f',
- `imul \\$0x05, %rax, %rax`, {no_prefix:true})
+ `lea \\(%rdi,%rdi,4\\), %rax`)
assertEq(wasmEvalText(quint64).exports.f(-37000000000n), -37000000000n*5n)
assertEq(wasmEvalText(quint64).exports.f(42000000000n), 42000000000n*5n)
+// Test that multiplication by six yields imul
+
+let sext32 =
+ `(module
+ (func (export "f") (param i32) (result i32)
+ (i32.mul (local.get 0) (i32.const 6))))`;
+codegenTestX64_adhoc(
+ sext32,
+ 'f',
+ 'imul \\$0x06, %edi, %eax');
+assertEq(wasmEvalText(sext32).exports.f(-37), -37*6)
+assertEq(wasmEvalText(sext32).exports.f(42), 42*6)
+
+let sext64 = `(module
+ (func (export "f") (param i64) (result i64)
+ (i64.mul (local.get 0) (i64.const 6))))`
+codegenTestX64_adhoc(
+ sext64,
+ 'f',
+ `imul \\$0x06, %rdi, %rax`)
+assertEq(wasmEvalText(sext64).exports.f(-37000000000n), -37000000000n*6n)
+assertEq(wasmEvalText(sext64).exports.f(42000000000n), 42000000000n*6n)
+
+// Test that multiplication by UINT32_MAX yields imul
+
+let uint32max64 = `(module
+ (func (export "f") (param i64) (result i64)
+ (i64.mul (local.get 0) (i64.const 0xffffffff))))`
+codegenTestX64_adhoc(
+ uint32max64,
+ 'f',
+ `mov \\$-0x01, %r11d
+ imul %r11, %rax`, {no_prefix:true})
+assertEq(wasmEvalText(uint32max64).exports.f(-37000000000n), BigInt.asIntN(64, -37000000000n*0xffffffffn))
+assertEq(wasmEvalText(uint32max64).exports.f(42000000000n), BigInt.asIntN(64, 42000000000n*0xffffffffn))
+
// Test that 0-n yields negation.
let subneg32 =
diff --git a/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js b/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js
@@ -40,18 +40,8 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul64_zeroL") (param $p1 i64) (result i64)
(i64.mul (i64.const 0) (local.get $p1))))`,
"mul64_zeroL",
- // FIXME folding happened, zero-creation insns could be improved
- {x64: // First we move rdi to rax unnecessarily via rcx (bug 1752520),
- // then we overwrite rax. Presumably because the folding
- // 0 * x => 0 is done at the LIR level, not the MIR level, hence
- // the now-pointless WasmParameter node is not DCE'd away, since
- // DCE only happens at the MIR level. In fact all targets suffer
- // from the latter problem, but on x86 no_prefix_x86:true
- // hides it, and on arm32/64 the pointless move is correctly
- // transformed by RA into a no-op. Also, on xor, REX.W is redundant.
- `mov %rdi, %rcx
- mov %rcx, %rax
- xor %rax, %rax`,
+ // FIXME zero-creation insns could be improved
+ {x64: `xor %rax, %rax`, // REX.W is redundant
x86: `xor %eax, %eax
xor %edx, %edx`,
arm64: `mov x0, xzr`,
@@ -64,7 +54,14 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul32_oneL") (param $p1 i32) (result i32)
(i32.mul (i32.const 1) (local.get $p1))))`,
"mul32_oneL",
- {x64: `mov %edi, %ecx
+ {x64: // We move edi to eax unnecessarily via ecx (bug 1752520).
+ // Presumably because the folding 1 * x => x is done at the LIR
+ // level, not the MIR level, hence the now-pointless WasmParameter
+ // node is not DCE'd away, since DCE only happens at the MIR level.
+ // In fact all targets suffer from the latter problem, but on x86
+ // no_prefix_x86:true hides it, and on arm32/64 the pointless move
+ // is correctly transformed by RA into a no-op.
+ `mov %edi, %ecx
mov %ecx, %eax`,
x86: `movl 0x10\\(%rbp\\), %eax`,
arm64: ``,
@@ -125,9 +122,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul64_twoL") (param $p1 i64) (result i64)
(i64.mul (i64.const 2) (local.get $p1))))`,
"mul64_twoL",
- {x64: `mov %rdi, %rcx
- mov %rcx, %rax
- add %rax, %rax`,
+ {x64: `lea \\(%rdi,%rdi,1\\), %rax`,
x86: `movl 0x14\\(%rbp\\), %edx
movl 0x10\\(%rbp\\), %eax
add %eax, %eax
@@ -153,9 +148,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul64_fourL") (param $p1 i64) (result i64)
(i64.mul (i64.const 4) (local.get $p1))))`,
"mul64_fourL",
- {x64: `mov %rdi, %rcx
- mov %rcx, %rax
- shl \\$0x02, %rax`,
+ {x64: `lea \\(,%rdi,4\\), %rax`,
x86: `movl 0x14\\(%rbp\\), %edx
movl 0x10\\(%rbp\\), %eax
shld \\$0x02, %eax, %edx
@@ -189,9 +182,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul64_zeroR") (param $p1 i64) (result i64)
(i64.mul (local.get $p1) (i64.const 0))))`,
"mul64_zeroR",
- {x64: `mov %rdi, %rcx
- mov %rcx, %rax
- xor %rax, %rax`, // REX.W is redundant
+ {x64: `xor %rax, %rax`, // REX.W is redundant
x86: `xor %eax, %eax
xor %edx, %edx`,
arm64: `mov x0, xzr`,
@@ -265,9 +256,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul64_twoR") (param $p1 i64) (result i64)
(i64.mul (local.get $p1) (i64.const 2))))`,
"mul64_twoR",
- {x64: `mov %rdi, %rcx
- mov %rcx, %rax
- add %rax, %rax`,
+ {x64: `lea \\(%rdi,%rdi,1\\), %rax`,
x86: `movl 0x14\\(%rbp\\), %edx
movl 0x10\\(%rbp\\), %eax
add %eax, %eax
@@ -293,9 +282,7 @@ codegenTestMultiplatform_adhoc(
`(module (func (export "mul64_fourR") (param $p1 i64) (result i64)
(i64.mul (local.get $p1) (i64.const 4))))`,
"mul64_fourR",
- {x64: `mov %rdi, %rcx
- mov %rcx, %rax
- shl \\$0x02, %rax`,
+ {x64: `lea \\(,%rdi,4\\), %rax`,
x86: `movl 0x14\\(%rbp\\), %edx
movl 0x10\\(%rbp\\), %eax
shld \\$0x02, %eax, %edx
diff --git a/js/src/jit/LIROps.yaml b/js/src/jit/LIROps.yaml
@@ -1351,7 +1351,7 @@
operands:
lhs: Int64
rhs: Int64
-#if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_ARM)
+#if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_ARM)
num_temps: 1
#endif
defer_init: true
diff --git a/js/src/jit/x64/CodeGenerator-x64.cpp b/js/src/jit/x64/CodeGenerator-x64.cpp
@@ -123,41 +123,77 @@ void CodeGenerator::visitUnbox(LUnbox* unbox) {
}
void CodeGenerator::visitMulI64(LMulI64* lir) {
- Register64 lhs = ToRegister64(lir->lhs());
+ Register lhs = ToRegister64(lir->lhs()).reg;
LInt64Allocation rhs = lir->rhs();
-
- MOZ_ASSERT(ToOutRegister64(lir) == lhs);
+ Register out = ToOutRegister64(lir).reg;
if (IsConstant(rhs)) {
int64_t constant = ToInt64(rhs);
switch (constant) {
case -1:
- masm.neg64(lhs);
- return;
+ if (lhs != out) {
+ masm.movq(lhs, out);
+ }
+ masm.negq(out);
+ break;
case 0:
- masm.xor64(lhs, lhs);
- return;
+ masm.xorq(out, out);
+ break;
case 1:
- // nop
- return;
+ if (lhs != out) {
+ masm.movq(lhs, out);
+ }
+ break;
case 2:
- masm.add64(lhs, lhs);
- return;
- default:
- if (constant > 0) {
- // Use shift if constant is power of 2.
- int32_t shift = mozilla::FloorLog2(constant);
- if (int64_t(1) << shift == constant) {
- masm.lshift64(Imm32(shift), lhs);
- return;
+ if (lhs == out) {
+ masm.addq(lhs, lhs);
+ } else {
+ masm.lea(Operand(lhs, lhs, TimesOne), out);
+ }
+ break;
+ case 3:
+ masm.lea(Operand(lhs, lhs, TimesTwo), out);
+ break;
+ case 4:
+ if (lhs == out) {
+ masm.shlq(Imm32(2), lhs);
+ } else {
+ masm.lea(Operand(lhs, TimesFour, 0), out);
+ }
+ break;
+ case 5:
+ masm.lea(Operand(lhs, lhs, TimesFour), out);
+ break;
+ case 8:
+ if (lhs == out) {
+ masm.shlq(Imm32(3), lhs);
+ } else {
+ masm.lea(Operand(lhs, TimesEight, 0), out);
+ }
+ break;
+ case 9:
+ masm.lea(Operand(lhs, lhs, TimesEight), out);
+ break;
+ default: {
+ // Use shift if constant is power of 2.
+ int32_t shift = mozilla::FloorLog2(constant);
+ if (constant > 0 && (1 << shift) == constant) {
+ if (lhs != out) {
+ masm.movq(lhs, out);
}
+ masm.shlq(Imm32(shift), out);
+ } else if (int32_t(constant) == constant) {
+ masm.imulq(Imm32(constant), lhs, out);
+ } else {
+ MOZ_ASSERT(out == lhs);
+ masm.mul64(Imm64(constant), Register64(lhs));
}
- Register temp = ToTempRegisterOrInvalid(lir->temp0());
- masm.mul64(Imm64(constant), lhs, temp);
+ break;
+ }
}
} else {
- Register temp = ToTempRegisterOrInvalid(lir->temp0());
- masm.mul64(ToOperandOrRegister64(rhs), lhs, temp);
+ MOZ_ASSERT(out == lhs);
+ masm.imulq(ToOperandOrRegister64(rhs), lhs);
}
}
diff --git a/js/src/jit/x64/Lowering-x64.cpp b/js/src/jit/x64/Lowering-x64.cpp
@@ -62,12 +62,22 @@ void LIRGeneratorX64::lowerForALUInt64(
void LIRGeneratorX64::lowerForMulInt64(LMulI64* ins, MMul* mir,
MDefinition* lhs, MDefinition* rhs) {
- // X64 doesn't need a temp for 64bit multiplication.
+ // No input reuse needed when we can use imulq with an int32 immediate.
+ bool reuseInput = true;
+ if (rhs->isConstant()) {
+ int64_t constant = rhs->toConstant()->toInt64();
+ reuseInput = int32_t(constant) != constant;
+ }
+
ins->setLhs(useInt64RegisterAtStart(lhs));
ins->setRhs(willHaveDifferentLIRNodes(lhs, rhs)
? useInt64OrConstant(rhs)
: useInt64OrConstantAtStart(rhs));
- defineInt64ReuseInput(ins, mir, 0);
+ if (reuseInput) {
+ defineInt64ReuseInput(ins, mir, 0);
+ } else {
+ defineInt64(ins, mir);
+ }
}
void LIRGenerator::visitBox(MBox* box) {