tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 86cef46aa081c087e3129bb5e012311b9494500d
parent 7b29457f213ccab25b5e2ea05bae3210936f0c57
Author: André Bargull <andre.bargull@gmail.com>
Date:   Mon, 27 Oct 2025 09:00:00 +0000

Bug 1996345 - Part 1: Use three operand imul instruction when possible. r=spidermonkey-reviewers,iain

Improves codegen for:
```js
function f(x) {
  return (Math.imul(x, 10) + x)|0;
}
```

From:
```asm
movl       %eax, %ecx
imull      $10, %eax, %eax
addl       %ecx, %eax
```

To:
```asm
imull      $10, %eax, %ecx
addl       %eax, %ecx
```

For small constants use `lea` to match GCC/Clang codegen.

Differential Revision: https://phabricator.services.mozilla.com/D270021

Diffstat:
Mjs/src/jit-test/tests/wasm/binop-x64-ion-codegen.js | 14+++++++-------
Mjs/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js | 42++++++++++++++----------------------------
Mjs/src/jit/x86-shared/CodeGenerator-x86-shared.cpp | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mjs/src/jit/x86-shared/Lowering-x86-shared.cpp | 13+++++++++++--
4 files changed, 94 insertions(+), 53 deletions(-)

diff --git a/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/binop-x64-ion-codegen.js @@ -37,7 +37,7 @@ let zero32 = codegenTestX64_adhoc( zero32, 'f', - 'xor %eax, %eax', {no_prefix:true}); + 'xor %eax, %eax'); assertEq(wasmEvalText(zero32).exports.f(-37), 0) assertEq(wasmEvalText(zero32).exports.f(42), 0) @@ -74,7 +74,7 @@ codegenTestX64_adhoc( assertEq(wasmEvalText(one64).exports.f(-37000000000n), -37000000000n) assertEq(wasmEvalText(one64).exports.f(42000000000n), 42000000000n) -// Test that multiplication by two yields an add +// Test that multiplication by two yields lea/add let double32 = `(module @@ -83,7 +83,7 @@ let double32 = codegenTestX64_adhoc( double32, 'f', - 'add %eax, %eax', {no_prefix:true}); + 'lea \\(%rdi,%rdi,1\\), %eax'); assertEq(wasmEvalText(double32).exports.f(-37), -74) assertEq(wasmEvalText(double32).exports.f(42), 84) @@ -97,7 +97,7 @@ codegenTestX64_adhoc( assertEq(wasmEvalText(double64).exports.f(-37000000000n), -74000000000n) assertEq(wasmEvalText(double64).exports.f(42000000000n), 84000000000n) -// Test that multiplication by four yields a shift +// Test that multiplication by four yields lea/shift let quad32 = `(module @@ -106,7 +106,7 @@ let quad32 = codegenTestX64_adhoc( quad32, 'f', - 'shl \\$0x02, %eax', {no_prefix:true}); + 'lea \\(,%rdi,4\\), %eax'); assertEq(wasmEvalText(quad32).exports.f(-37), -148) assertEq(wasmEvalText(quad32).exports.f(42), 168) @@ -120,7 +120,7 @@ codegenTestX64_adhoc( assertEq(wasmEvalText(quad64).exports.f(-37000000000n), -148000000000n) assertEq(wasmEvalText(quad64).exports.f(42000000000n), 168000000000n) -// Test that multiplication by five yields a multiply +// Test that multiplication by five yields lea/imul let quint32 = `(module @@ -129,7 +129,7 @@ let quint32 = codegenTestX64_adhoc( quint32, 'f', - 'imul \\$0x05, %eax, %eax', {no_prefix:true}); + 'lea \\(%rdi,%rdi,4\\), %eax'); assertEq(wasmEvalText(quint32).exports.f(-37), -37*5) assertEq(wasmEvalText(quint32).exports.f(42), 42*5) diff --git a/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js b/js/src/jit-test/tests/wasm/ion-adhoc-multiplatform.js @@ -30,18 +30,7 @@ codegenTestMultiplatform_adhoc( `(module (func (export "mul32_zeroL") (param $p1 i32) (result i32) (i32.mul (i32.const 0) (local.get $p1))))`, "mul32_zeroL", - {x64: // FIXME move folding to MIR level - // First we move edi to eax unnecessarily via ecx (bug 1752520), - // then we overwrite eax. Presumably because the folding - // 0 * x => 0 is done at the LIR level, not the MIR level, hence - // the now-pointless WasmParameter node is not DCE'd away, since - // DCE only happens at the MIR level. In fact all targets suffer - // from the latter problem, but on x86 no_prefix_x86:true - // hides it, and on arm32/64 the pointless move is correctly - // transformed by RA into a no-op. - `mov %edi, %ecx - mov %ecx, %eax - xor %eax, %eax`, + {x64: `xor %eax, %eax`, x86: `xor %eax, %eax`, arm64: `mov w0, wzr`, arm: `mov r0, #0`}, @@ -52,7 +41,14 @@ codegenTestMultiplatform_adhoc( (i64.mul (i64.const 0) (local.get $p1))))`, "mul64_zeroL", // FIXME folding happened, zero-creation insns could be improved - {x64: // Same shenanigans as above. Also, on xor, REX.W is redundant. + {x64: // First we move rdi to rax unnecessarily via rcx (bug 1752520), + // then we overwrite rax. Presumably because the folding + // 0 * x => 0 is done at the LIR level, not the MIR level, hence + // the now-pointless WasmParameter node is not DCE'd away, since + // DCE only happens at the MIR level. In fact all targets suffer + // from the latter problem, but on x86 no_prefix_x86:true + // hides it, and on arm32/64 the pointless move is correctly + // transformed by RA into a no-op. Also, on xor, REX.W is redundant. `mov %rdi, %rcx mov %rcx, %rax xor %rax, %rax`, @@ -118,9 +114,7 @@ codegenTestMultiplatform_adhoc( `(module (func (export "mul32_twoL") (param $p1 i32) (result i32) (i32.mul (i32.const 2) (local.get $p1))))`, "mul32_twoL", - {x64: `mov %edi, %ecx - mov %ecx, %eax - add %eax, %eax`, + {x64: `lea \\(%rdi,%rdi,1\\), %eax`, x86: `movl 0x10\\(%rbp\\), %eax add %eax, %eax`, arm64: `add w0, w0, w0`, @@ -148,9 +142,7 @@ codegenTestMultiplatform_adhoc( `(module (func (export "mul32_fourL") (param $p1 i32) (result i32) (i32.mul (i32.const 4) (local.get $p1))))`, "mul32_fourL", - {x64: `mov %edi, %ecx - mov %ecx, %eax - shl \\$0x02, %eax`, + {x64: `lea \\(,%rdi,4\\), %eax`, x86: `movl 0x10\\(%rbp\\), %eax shl \\$0x02, %eax`, arm64: `lsl w0, w0, #2`, @@ -187,9 +179,7 @@ codegenTestMultiplatform_adhoc( `(module (func (export "mul32_zeroR") (param $p1 i32) (result i32) (i32.mul (local.get $p1) (i32.const 0))))`, "mul32_zeroR", - {x64: `mov %edi, %ecx - mov %ecx, %eax - xor %eax, %eax`, + {x64: `xor %eax, %eax`, x86: `xor %eax, %eax`, arm64: `mov w0, wzr`, arm: `mov r0, #0`}, @@ -264,9 +254,7 @@ codegenTestMultiplatform_adhoc( `(module (func (export "mul32_twoR") (param $p1 i32) (result i32) (i32.mul (local.get $p1) (i32.const 2))))`, "mul32_twoR", - {x64: `mov %edi, %ecx - mov %ecx, %eax - add %eax, %eax`, + {x64: `lea \\(%rdi,%rdi,1\\), %eax`, x86: `movl 0x10\\(%rbp\\), %eax add %eax, %eax`, arm64: `add w0, w0, w0`, @@ -294,9 +282,7 @@ codegenTestMultiplatform_adhoc( `(module (func (export "mul32_fourR") (param $p1 i32) (result i32) (i32.mul (local.get $p1) (i32.const 4))))`, "mul32_fourR", - {x64: `mov %edi, %ecx - mov %ecx, %eax - shl \\$0x02, %eax`, + {x64: `lea \\(,%rdi,4\\), %eax`, x86: `movl 0x10\\(%rbp\\), %eax shl \\$0x02, %eax`, arm64: `lsl w0, w0, #2`, diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp @@ -746,8 +746,7 @@ void CodeGenerator::visitSubI64(LSubI64* lir) { void CodeGenerator::visitMulI(LMulI* ins) { Register lhs = ToRegister(ins->lhs()); const LAllocation* rhs = ins->rhs(); - - MOZ_ASSERT(ToRegister(ins->output()) == lhs); + Register out = ToRegister(ins->output()); MMul* mul = ins->mir(); MOZ_ASSERT_IF(mul->mode() == MMul::Integer, @@ -763,29 +762,74 @@ void CodeGenerator::visitMulI(LMulI* ins) { bailoutIf(bailoutCond, ins->snapshot()); } + if (!mul->canOverflow()) { + switch (constant) { + case 2: + if (lhs == out) { + masm.addl(lhs, lhs); + } else { + masm.leal(Operand(lhs, lhs, TimesOne), out); + } + return; + case 3: + masm.leal(Operand(lhs, lhs, TimesTwo), out); + return; + case 4: + if (lhs == out) { + masm.shll(Imm32(2), lhs); + } else { + masm.leal(Operand(lhs, TimesFour, 0), out); + } + return; + case 5: + masm.leal(Operand(lhs, lhs, TimesFour), out); + return; + case 8: + if (lhs == out) { + masm.shll(Imm32(3), lhs); + } else { + masm.leal(Operand(lhs, TimesEight, 0), out); + } + return; + case 9: + masm.leal(Operand(lhs, lhs, TimesEight), out); + return; + default: + // Use shift if cannot overflow and constant is power of 2 + int32_t shift = FloorLog2(constant); + if (constant > 0 && (1 << shift) == constant) { + if (lhs != out) { + masm.movl(lhs, out); + } + masm.shll(Imm32(shift), out); + return; + } + } + } + switch (constant) { case -1: - masm.negl(lhs); + if (lhs != out) { + masm.movl(lhs, out); + } + masm.negl(out); break; case 0: - masm.xorl(lhs, lhs); + masm.xorl(out, out); return; // escape overflow check; case 1: - // nop + if (lhs != out) { + masm.movl(lhs, out); + } return; // escape overflow check; case 2: - masm.addl(lhs, lhs); - break; - default: - if (!mul->canOverflow() && constant > 0) { - // Use shift if cannot overflow and constant is power of 2 - int32_t shift = FloorLog2(constant); - if ((1 << shift) == constant) { - masm.shll(Imm32(shift), lhs); - return; - } + if (lhs == out) { + masm.addl(lhs, lhs); + break; } - masm.imull(Imm32(constant), lhs); + [[fallthrough]]; + default: + masm.imull(Imm32(constant), lhs, out); } // Bailout on overflow @@ -793,6 +837,8 @@ void CodeGenerator::visitMulI(LMulI* ins) { bailoutIf(Assembler::Overflow, ins->snapshot()); } } else { + MOZ_ASSERT(out == lhs); + masm.imull(ToOperand(rhs), lhs); // Bailout on overflow diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp @@ -185,12 +185,21 @@ void LIRGeneratorX86Shared::lowerForFPU(LInstructionHelper<1, 2, 0>* ins, void LIRGeneratorX86Shared::lowerMulI(MMul* mul, MDefinition* lhs, MDefinition* rhs) { + if (rhs->isConstant()) { + auto* lir = new (alloc()) LMulI(useRegisterAtStart(lhs), + useOrConstantAtStart(rhs), LAllocation()); + if (mul->fallible()) { + assignSnapshot(lir, mul->bailoutKind()); + } + define(lir, mul); + return; + } + // Note: If we need a negative zero check, lhs is used twice. LAllocation lhsCopy = mul->canBeNegativeZero() ? use(lhs) : LAllocation(); LMulI* lir = new (alloc()) LMulI(useRegisterAtStart(lhs), - willHaveDifferentLIRNodes(lhs, rhs) ? useOrConstant(rhs) - : useOrConstantAtStart(rhs), + willHaveDifferentLIRNodes(lhs, rhs) ? use(rhs) : useAtStart(rhs), lhsCopy); if (mul->fallible()) { assignSnapshot(lir, mul->bailoutKind());