From 9565f16d84a8a739f732dafc3b2fc039ea0c0f3d Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:33:11 -0400 Subject: [PATCH 01/17] RegisterAllocationPass: simplify Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index 0707abf03b..2a3b25848c 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -639,7 +639,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { // Remap sources last, since AssignReg can shuffle. for (auto s = 0; s < IR::GetRAArgs(IROp->Op); ++s) { - Ref Remapped = SSAToNewSSA[IR->GetID(IR->GetNode(IROp->Args[s])).Value]; + Ref Remapped = SSAToNewSSA[IROp->Args[s].ID().Value]; if (Remapped != nullptr) { IREmit->ReplaceNodeArgument(CodeNode, s, Remapped); From aa548bd19c237e0cb609314f24e141ed0c9bb158 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:35:06 -0400 Subject: [PATCH 02/17] RegisterAllocationPass: track if we need a remap should be faster on average. Signed-off-by: Alyssa Rosenzweig --- .../IR/Passes/RegisterAllocationPass.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index 2a3b25848c..b54d548578 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -96,6 +96,9 @@ class ConstrainedRAPass final : public RegisterAllocationPass { // Inverse of SSAToNewSSA. Since it's indexed by new nodes, it grows. fextl::vector NewSSAToSSA; + // Whether we have a non-identity mapping. 
+ bool AnyRemapped; + // Map of assigned registers. Grows. fextl::vector SSAToReg; @@ -127,6 +130,7 @@ class ConstrainedRAPass final : public RegisterAllocationPass { SSAToNewSSA[OldID] = New; NewSSAToSSA[NewID] = Old; + AnyRemapped = true; LOGMAN_THROW_A_FMT(Map(Old) == New && Unmap(New) == Old, "Post-condition"); LOGMAN_THROW_A_FMT(Unmap(Old) == Old, "Invariant1"); @@ -468,6 +472,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { NextUses.resize(IR->GetSSACount(), 0); SpillSlotCount = 0; AnySpilled = false; + AnyRemapped = false; // Next-use distance relative to the block end of each source, last first. fextl::vector SourcesNextUses; @@ -638,11 +643,13 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { } // Remap sources last, since AssignReg can shuffle. - for (auto s = 0; s < IR::GetRAArgs(IROp->Op); ++s) { - Ref Remapped = SSAToNewSSA[IROp->Args[s].ID().Value]; + if (AnyRemapped) { + for (auto s = 0; s < IR::GetRAArgs(IROp->Op); ++s) { + Ref Remapped = SSAToNewSSA[IROp->Args[s].ID().Value]; - if (Remapped != nullptr) { - IREmit->ReplaceNodeArgument(CodeNode, s, Remapped); + if (Remapped != nullptr) { + IREmit->ReplaceNodeArgument(CodeNode, s, Remapped); + } } } From 8a551b9e642a55ecc87231db2d46353de743ad8c Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:39:18 -0400 Subject: [PATCH 03/17] RegisterAllocationPass: rm leftover comment Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index b54d548578..6321a3ce6a 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -426,8 +426,7 @@ class ConstrainedRAPass final : public RegisterAllocationPass { RegisterClassType ClassType = GetRegClassFromNode(IR, IROp); 
RegisterClass* Class = &Classes[ClassType]; - // Spill to make room in the register file. Free registers need not be - // contiguous, we'll shuffle later. + // Spill to make room in the register file. if (!Class->Available) { IREmit->SetWriteCursorBefore(CodeNode); SpillReg(Class, Pivot); From 5aaa18e7a2ba43dc162378362cdbdee088c6ae3e Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:43:15 -0400 Subject: [PATCH 04/17] RegisterAllocationPass: remove Class->Allocated not needed anymore. Signed-off-by: Alyssa Rosenzweig --- .../IR/Passes/RegisterAllocationPass.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index 6321a3ce6a..e430df17fa 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -28,13 +28,10 @@ namespace { uint32_t Available; uint32_t Count; - // If bit R of Allocated is 1, then RegToSSA[R] is the Old node + // If bit R of Available is 0, then RegToSSA[R] is the Old node // currently allocated to R. Else, RegToSSA[R] is UNDEFINED, no need to // clear this when freeing registers. Ref RegToSSA[32]; - - // Allocated base registers. Similar to ~Available except for pairs. 
- uint32_t Allocated; }; IR::RegisterClassType GetRegClassFromNode(IR::IRListView* IR, IR::IROp_Header* IROp) { @@ -202,7 +199,7 @@ class ConstrainedRAPass final : public RegisterAllocationPass { PhysicalRegister Reg = SSAToReg[IR->GetID(Map(Old)).Value]; RegisterClass* Class = GetClass(Reg); - return (Class->Allocated & GetRegBits(Reg)) && Class->RegToSSA[Reg.Reg] == Old; + return (Class->Available & GetRegBits(Reg)) == 0 && Class->RegToSSA[Reg.Reg] == Old; }; void FreeReg(PhysicalRegister Reg) { @@ -210,10 +207,8 @@ class ConstrainedRAPass final : public RegisterAllocationPass { uint32_t RegBits = GetRegBits(Reg); LOGMAN_THROW_AA_FMT(!(Class->Available & RegBits), "Register double-free"); - LOGMAN_THROW_AA_FMT((Class->Allocated & RegBits), "Register double-free"); Class->Available |= RegBits; - Class->Allocated &= ~RegBits; }; bool HasSource(IROp_Header* I, Ref Old) { @@ -287,8 +282,9 @@ class ConstrainedRAPass final : public RegisterAllocationPass { Ref Candidate = nullptr; uint32_t BestDistance = UINT32_MAX; uint8_t BestReg = ~0; + uint32_t Allocated = ((1u << Class->Count) - 1) & ~Class->Available; - foreach_bit(i, Class->Allocated) { + foreach_bit(i, Allocated) { Ref Old = Class->RegToSSA[i]; LOGMAN_THROW_AA_FMT(Old != nullptr, "Invariant3"); @@ -354,10 +350,8 @@ class ConstrainedRAPass final : public RegisterAllocationPass { uint32_t RegBits = GetRegBits(Reg); LOGMAN_THROW_AA_FMT((Class->Available & RegBits) == RegBits, "Precondition"); - LOGMAN_THROW_AA_FMT(!(Class->Allocated & RegBits), "Precondition"); Class->Available &= ~RegBits; - Class->Allocated |= (1u << Reg.Reg); Class->RegToSSA[Reg.Reg] = Unmap(Node); if (Index >= SSAToReg.size()) { @@ -480,7 +474,6 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { // At the start of each block, all registers are available. 
for (auto& Class : Classes) { Class.Available = (1u << Class.Count) - 1; - Class.Allocated = 0; } SourcesNextUses.clear(); @@ -518,7 +511,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { } // Record preferred registers for SRA. We also record the Node accessing - // each register, used below. Since we initialized Class->Allocated = 0, + // each register, used below. Since we initialized Class->Available, // RegToSSA is otherwise undefined so we can stash our temps there. if (auto Node = DecodeSRANode(IROp, CodeNode); Node != nullptr) { auto Reg = DecodeSRAReg(IROp, Node); @@ -571,7 +564,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { auto Reg = DecodeSRAReg(IROp, Node); RegisterClass* Class = &Classes[Reg.Class]; - if (Class->Allocated & (1u << Reg.Reg)) { + if (!(Class->Available & (1u << Reg.Reg))) { Ref Old = Class->RegToSSA[Reg.Reg]; LOGMAN_THROW_A_FMT(IsOld(Old), "RegToSSA invariant"); From 8819fa88d5c0042a95f3ad277f7383407726619a Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:49:31 -0400 Subject: [PATCH 05/17] RegisterAllocationPass: avoid allocating remaps shaves 6ms off. Signed-off-by: Alyssa Rosenzweig --- .../IR/Passes/RegisterAllocationPass.cpp | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index e430df17fa..f6cfbcd2b8 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -87,7 +87,7 @@ class ConstrainedRAPass final : public RegisterAllocationPass { // // SSAToNewSSA tracks the current remapping. nullptr indicates no remapping. // - // Since its indexed by Old nodes, SSAToNewSSA does not grow. + // Since it's indexed by Old nodes, SSAToNewSSA does not grow after allocation. fextl::vector SSAToNewSSA; // Inverse of SSAToNewSSA. 
Since it's indexed by new nodes, it grows. @@ -100,19 +100,27 @@ class ConstrainedRAPass final : public RegisterAllocationPass { fextl::vector SSAToReg; bool IsOld(Ref Node) { - return IR->GetID(Node).Value < SSAToNewSSA.size(); + return IR->GetID(Node).Value < PreferredReg.size(); }; // Return the New node (if it exists) for an Old node, else the Old node. Ref Map(Ref Old) { LOGMAN_THROW_A_FMT(IsOld(Old), "Pre-condition"); - return SSAToNewSSA[IR->GetID(Old).Value] ?: Old; + if (SSAToNewSSA.empty()) { + return Old; + } else { + return SSAToNewSSA[IR->GetID(Old).Value] ?: Old; + } }; // Return the Old node for a possibly-remapped node. Ref Unmap(Ref Node) { - return NewSSAToSSA[IR->GetID(Node).Value] ?: Node; + if (NewSSAToSSA.empty()) { + return Node; + } else { + return NewSSAToSSA[IR->GetID(Node).Value] ?: Node; + } }; // Record a remapping of Old to New. @@ -125,6 +133,10 @@ class ConstrainedRAPass final : public RegisterAllocationPass { LOGMAN_THROW_A_FMT(NewID >= NewSSAToSSA.size(), "Brand new SSA def"); NewSSAToSSA.resize(NewID + 1, 0); + if (SSAToNewSSA.empty()) { + SSAToNewSSA.resize(PreferredReg.size(), nullptr); + } + SSAToNewSSA[OldID] = New; NewSSAToSSA[NewID] = Old; AnyRemapped = true; @@ -458,9 +470,8 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { auto IR_ = IREmit->ViewIR(); IR = &IR_; + // SSAToNewSSA, NewSSAToSSA allocated on first-use PreferredReg.resize(IR->GetSSACount(), PhysicalRegister::Invalid()); - SSAToNewSSA.resize(IR->GetSSACount(), nullptr); - NewSSAToSSA.resize(IR->GetSSACount(), nullptr); SSAToReg.resize(IR->GetSSACount(), PhysicalRegister::Invalid()); NextUses.resize(IR->GetSSACount(), 0); SpillSlotCount = 0; From ec24fc3d5d84e51c9d7f9f9857277800e45ed00e Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:52:28 -0400 Subject: [PATCH 06/17] RegisterAllocationPass: infer AnyRemapped Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/IR/Passes/RegisterAllocationPass.cpp | 7 +------ 1 file changed, 1 
insertion(+), 6 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index f6cfbcd2b8..3a425f19c9 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -93,9 +93,6 @@ class ConstrainedRAPass final : public RegisterAllocationPass { // Inverse of SSAToNewSSA. Since it's indexed by new nodes, it grows. fextl::vector NewSSAToSSA; - // Whether we have a non-identity mapping. - bool AnyRemapped; - // Map of assigned registers. Grows. fextl::vector SSAToReg; @@ -139,7 +136,6 @@ class ConstrainedRAPass final : public RegisterAllocationPass { SSAToNewSSA[OldID] = New; NewSSAToSSA[NewID] = Old; - AnyRemapped = true; LOGMAN_THROW_A_FMT(Map(Old) == New && Unmap(New) == Old, "Post-condition"); LOGMAN_THROW_A_FMT(Unmap(Old) == Old, "Invariant1"); @@ -476,7 +472,6 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { NextUses.resize(IR->GetSSACount(), 0); SpillSlotCount = 0; AnySpilled = false; - AnyRemapped = false; // Next-use distance relative to the block end of each source, last first. fextl::vector SourcesNextUses; @@ -646,7 +641,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { } // Remap sources last, since AssignReg can shuffle. - if (AnyRemapped) { + if (!SSAToNewSSA.empty()) { for (auto s = 0; s < IR::GetRAArgs(IROp->Op); ++s) { Ref Remapped = SSAToNewSSA[IROp->Args[s].ID().Value]; From c17858bfebf8ac8a6685a66cd38a8a7e78007c91 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:26:02 -0400 Subject: [PATCH 07/17] RegisterAllocationPass: don't lookup sources lot of silly chasing. not needed. 
Signed-off-by: Alyssa Rosenzweig --- .../Interface/IR/Passes/RegisterAllocationPass.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index 3a425f19c9..594b2cd8a7 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -508,7 +508,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { const uint8_t NumArgs = IR::GetRAArgs(IROp->Op); for (int8_t i = NumArgs - 1; i >= 0; --i) { const auto& Arg = IROp->Args[i]; - if (IsValidArg(Arg)) { + if (!Arg.IsInvalid()) { const uint32_t Index = Arg.ID().Value; SourcesNextUses.push_back(NextUses[Index]); @@ -618,7 +618,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { } for (auto s = 0; s < IR::GetRAArgs(IROp->Op); ++s) { - if (!IsValidArg(IROp->Args[s])) { + if (IROp->Args[s].IsInvalid()) { continue; } @@ -626,10 +626,13 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { LOGMAN_THROW_AA_FMT(SourceIndex >= 0, "Consistent source count"); Ref Old = IR->GetNode(IROp->Args[s]); - LOGMAN_THROW_A_FMT(IsInRegisterFile(Old), "sources in file"); if (!SourcesNextUses[SourceIndex]) { - FreeReg(SSAToReg[IR->GetID(Map(Old)).Value]); + auto Reg = SSAToReg[IR->GetID(Map(Old)).Value]; + if (!Reg.IsInvalid()) { + LOGMAN_THROW_A_FMT(IsInRegisterFile(Old), "sources in file"); + FreeReg(Reg); + } } NextUses[IR->GetID(Old).Value] = SourcesNextUses[SourceIndex]; From d16969bdce8d84b454a2091adf0f5948e43843aa Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 10:57:35 -0400 Subject: [PATCH 08/17] RegisterAllocationPass: simplify Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/IR/Passes/RegisterAllocationPass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp 
b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index 594b2cd8a7..ffd72c1d46 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -625,17 +625,17 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { SourceIndex--; LOGMAN_THROW_AA_FMT(SourceIndex >= 0, "Consistent source count"); - Ref Old = IR->GetNode(IROp->Args[s]); - if (!SourcesNextUses[SourceIndex]) { + Ref Old = IR->GetNode(IROp->Args[s]); auto Reg = SSAToReg[IR->GetID(Map(Old)).Value]; + if (!Reg.IsInvalid()) { LOGMAN_THROW_A_FMT(IsInRegisterFile(Old), "sources in file"); FreeReg(Reg); } } - NextUses[IR->GetID(Old).Value] = SourcesNextUses[SourceIndex]; + NextUses[IROp->Args[s].ID().Value] = SourcesNextUses[SourceIndex]; } // Assign destinations. From 319f1e66cbab275fc2f2d62c7b14cb4e91281583 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:04:56 -0400 Subject: [PATCH 09/17] RegisterAllocationPass: eliminate a silly sxtb Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp index ffd72c1d46..b93fcde297 100644 --- a/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RegisterAllocationPass.cpp @@ -506,7 +506,7 @@ void ConstrainedRAPass::Run(IREmitter* IREmit_) { // of SourcesNextUses is consistent. The forward pass can then iterate // forwards and just flip the order. 
const uint8_t NumArgs = IR::GetRAArgs(IROp->Op); - for (int8_t i = NumArgs - 1; i >= 0; --i) { + for (int i = NumArgs - 1; i >= 0; --i) { const auto& Arg = IROp->Args[i]; if (!Arg.IsInvalid()) { const uint32_t Index = Arg.ID().Value; From f175b525f45aa6142afaf0950350f72055309b61 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:11:02 -0400 Subject: [PATCH 10/17] OpcodeDispatcher/Flags: inline constants Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/OpcodeDispatcher/Flags.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp index 4ae055ee44..3237f057b6 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp @@ -116,7 +116,7 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) { // instead. if (FlagsMask & (1 << FEXCore::X86State::RFLAG_PF_RAW_LOC)) { // Set every bit except the bottommost. - auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false, false), _Constant(~1ull)); + auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false, false), _InlineConstant(~1ull)); // Rotate the bottom bit to the appropriate location for PF, so we get // something like 111P1111. Then invert that to get 000p0000. Then OR that @@ -129,13 +129,13 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) { if (GetNZ) { static_assert(FEXCore::X86State::RFLAG_SF_RAW_LOC == (FEXCore::X86State::RFLAG_ZF_RAW_LOC + 1)); auto NZCV = GetNZCV(); - auto NZ = _And(OpSize::i64Bit, NZCV, _Constant(0b11u << 30)); + auto NZ = _And(OpSize::i64Bit, NZCV, _InlineConstant(0b11u << 30)); Original = _Orlshr(OpSize::i64Bit, Original, NZ, 31 - FEXCore::X86State::RFLAG_SF_RAW_LOC); } // The constant is OR'ed in at the end, to avoid a pointless or xzr, #2. 
if ((1U << X86State::RFLAG_RESERVED_LOC) & FlagsMask) { - Original = _Or(OpSize::i64Bit, Original, _Constant(2)); + Original = _Or(OpSize::i64Bit, Original, _InlineConstant(2)); } return Original; @@ -266,8 +266,8 @@ Ref OpDispatchBuilder::IncrementByCarry(OpSize OpSize, Ref Src) { } Ref OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, Ref Src1, Ref Src2) { - auto Zero = _Constant(0); - auto One = _Constant(1); + auto Zero = _InlineConstant(0); + auto One = _InlineConstant(1); auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; Ref Res; @@ -303,8 +303,8 @@ Ref OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, Ref Src1, Ref Src2) { } Ref OpDispatchBuilder::CalculateFlags_SBB(uint8_t SrcSize, Ref Src1, Ref Src2) { - auto Zero = _Constant(0); - auto One = _Constant(1); + auto Zero = _InlineConstant(0); + auto One = _InlineConstant(1); auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit; CalculateAF(Src1, Src2); @@ -408,7 +408,7 @@ void OpDispatchBuilder::CalculateFlags_MUL(uint8_t SrcSize, Ref Res, Ref High) { // If High = SignBit, then sets to nZCv. Else sets to nzcV. Since SF/ZF // undefined, this does what we need after inverting carry. 
- auto Zero = _Constant(0); + auto Zero = _InlineConstant(0); _CondSubNZCV(OpSize::i64Bit, Zero, Zero, CondClassType {COND_EQ}, 0x1 /* nzcV */); CFInverted = true; } @@ -417,7 +417,7 @@ void OpDispatchBuilder::CalculateFlags_UMUL(Ref High) { HandleNZCVWrite(); InvalidatePF_AF(); - auto Zero = _Constant(0); + auto Zero = _InlineConstant(0); OpSize Size = IR::SizeToOpSize(GetOpSize(High)); // CF and OF are set if the result of the operation can't be fit in to the destination register From 6cc6181261f2d737322f5861d84a7dfb30d81c00 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:12:33 -0400 Subject: [PATCH 11/17] OpcodeDispatcher/Flags: dont emit zero Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp index 3237f057b6..1d8db1cae9 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp @@ -76,16 +76,13 @@ Ref OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) { // Calculate flags early. CalculateDeferredFlags(); - Ref Original = _Constant(0); - // SF/ZF and N/Z are together on both arm64 and x86_64, so we special case that. bool GetNZ = (FlagsMask & (1 << FEXCore::X86State::RFLAG_SF_RAW_LOC)) && (FlagsMask & (1 << FEXCore::X86State::RFLAG_ZF_RAW_LOC)); // Handle CF first, since it's at bit 0 and hence doesn't need shift or OR. 
- if (FlagsMask & (1 << FEXCore::X86State::RFLAG_CF_RAW_LOC)) { - static_assert(FEXCore::X86State::RFLAG_CF_RAW_LOC == 0); - Original = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); - } + LOGMAN_THROW_A_FMT(FlagsMask & (1 << FEXCore::X86State::RFLAG_CF_RAW_LOC), "CF always handled"); + static_assert(FEXCore::X86State::RFLAG_CF_RAW_LOC == 0); + Ref Original = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC); for (size_t i = 0; i < FlagOffsets.size(); ++i) { const auto FlagOffset = FlagOffsets[i]; From 967a74cda9280e187783f6f90a2c0a1d7c0f000b Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:25:01 -0400 Subject: [PATCH 12/17] OpcodeDispatcher: manually inline constants less work for constprop. Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/OpcodeDispatcher.cpp | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 681d4072e0..a77b5b3c0b 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -348,7 +348,7 @@ void OpDispatchBuilder::SBBOp(OpcodeArgs, uint32_t SrcIndex) { void OpDispatchBuilder::SALCOp(OpcodeArgs) { CalculateDeferredFlags(); - auto Result = NZCVSelect(OpSize::i32Bit, {COND_UGE} /* CF = 1 */, _Constant(0xffffffff), _Constant(0)); + auto Result = NZCVSelect(OpSize::i32Bit, {COND_UGE} /* CF = 1 */, _InlineConstant(0xffffffff), _InlineConstant(0)); StoreResult(GPRClass, Op, Result, -1); } @@ -493,7 +493,7 @@ void OpDispatchBuilder::POPAOp(OpcodeArgs) { StoreGPRRegister(X86State::REG_RBP, Pop(Size, SP), Size); // Skip loading RSP because it'll be correct at the end - SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _Constant(Size))); + SP = _RMWHandle(_Add(OpSize::i64Bit, SP, _InlineConstant(Size))); StoreGPRRegister(X86State::REG_RBX, Pop(Size, SP), Size); StoreGPRRegister(X86State::REG_RDX, Pop(Size, SP), Size); @@ -611,7 
+611,7 @@ Ref OpDispatchBuilder::SelectPF(bool Invert, IR::OpSize ResultSize, Ref TrueValu // Because we're only clobbering NZCV internally, we ignore all carry flag // shenanigans and just use the raw test and raw select. - _TestNZ(OpSize::i32Bit, Cmp, _Constant(1)); + _TestNZ(OpSize::i32Bit, Cmp, _InlineConstant(1)); return _NZCVSelect(ResultSize, {COND_NEQ}, TrueValue, FalseValue); } @@ -876,7 +876,7 @@ void OpDispatchBuilder::LoopOp(OpcodeArgs) { uint64_t Target = Op->PC + Op->InstSize + Op->Src[1].Literal(); Ref CondReg = LoadSource_WithOpSize(GPRClass, Op, Op->Src[0], SrcSize, Op->Flags); - CondReg = _Sub(OpSize, CondReg, _Constant(SrcSize * 8, 1)); + CondReg = _Sub(OpSize, CondReg, _InlineConstant(1)); StoreResult(GPRClass, Op, Op->Src[0], CondReg, -1); // If LOOPE then jumps to target if RCX != 0 && ZF == 1 @@ -884,7 +884,7 @@ void OpDispatchBuilder::LoopOp(OpcodeArgs) { // // To handle efficiently, smash RCX to zero if ZF is wrong (1 csel). if (CheckZF) { - CondReg = NZCVSelect(OpSize, {ZFTrue ? COND_EQ : COND_NEQ}, CondReg, _Constant(0)); + CondReg = NZCVSelect(OpSize, {ZFTrue ? COND_EQ : COND_NEQ}, CondReg, _InlineConstant(0)); } CalculateDeferredFlags(); @@ -1154,7 +1154,7 @@ void OpDispatchBuilder::SAHFOp(OpcodeArgs) { Src = _Andn(OpSize::i64Bit, Src, _Constant(0b101000)); // Set the bit that is always set here - Src = _Or(OpSize::i64Bit, Src, _Constant(0b10)); + Src = _Or(OpSize::i64Bit, Src, _InlineConstant(0b10)); // Store the lower 8 bits in to RFLAGS SetPackedRFLAG(true, Src); @@ -1437,9 +1437,9 @@ void OpDispatchBuilder::SHLDOp(OpcodeArgs) { // x86 masks the shift by 0x3F or 0x1F depending on size of op. 
if (Size == 64) { - Shift = _And(OpSize::i64Bit, Shift, _Constant(0x3F)); + Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x3F)); } else { - Shift = _And(OpSize::i64Bit, Shift, _Constant(0x1F)); + Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x1F)); } // a64 masks the bottom bits, so if we're using a native 32/64-bit shift, we @@ -1510,9 +1510,9 @@ void OpDispatchBuilder::SHRDOp(OpcodeArgs) { // x86 masks the shift by 0x3F or 0x1F depending on size of op if (Size == 64) { - Shift = _And(OpSize::i64Bit, Shift, _Constant(0x3F)); + Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x3F)); } else { - Shift = _And(OpSize::i64Bit, Shift, _Constant(0x1F)); + Shift = _And(OpSize::i64Bit, Shift, _InlineConstant(0x1F)); } auto ShiftLeft = _Sub(OpSize::i64Bit, _Constant(Size), Shift); @@ -1608,7 +1608,7 @@ void OpDispatchBuilder::RotateOp(OpcodeArgs, bool Left, bool IsImmediate, bool I Src = _Constant(UnmaskedConst & Mask); } else { UnmaskedSrc = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true}); - Src = _And(OpSize::i64Bit, UnmaskedSrc, _Constant(Mask)); + Src = _And(OpSize::i64Bit, UnmaskedSrc, _InlineConstant(Mask)); } // We fill the upper bits so we allow garbage on load. @@ -1643,7 +1643,7 @@ void OpDispatchBuilder::RotateOp(OpcodeArgs, bool Left, bool IsImmediate, bool I // We deferred the masking for 8-bit to the flag section, do it here. 
if (Size == 8) { - Src = _And(OpSize::i64Bit, UnmaskedSrc, _Constant(0x1F)); + Src = _And(OpSize::i64Bit, UnmaskedSrc, _InlineConstant(0x1F)); } _RotateFlags(OpSizeFromSrc(Op), Res, Src, Left); @@ -1730,7 +1730,7 @@ void OpDispatchBuilder::BLSMSKBMIOp(OpcodeArgs) { auto Size = OpSizeFromSrc(Op); auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); - auto Result = _Xor(Size, _Sub(Size, Src, _Constant(1)), Src); + auto Result = _Xor(Size, _Sub(Size, Src, _InlineConstant(1)), Src); StoreResult(GPRClass, Op, Result, -1); InvalidatePF_AF(); @@ -1752,7 +1752,7 @@ void OpDispatchBuilder::BLSRBMIOp(OpcodeArgs) { auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto Size = OpSizeFromSrc(Op); - auto Result = _And(Size, _Sub(Size, Src, _Constant(1)), Src); + auto Result = _And(Size, _Sub(Size, Src, _InlineConstant(1)), Src); StoreResult(GPRClass, Op, Result, -1); auto Zero = _Constant(0); @@ -1817,8 +1817,8 @@ void OpDispatchBuilder::BZHI(OpcodeArgs) { auto Result = _NZCVSelect(IR::SizeToOpSize(Size), {COND_NEQ}, Src, MaskResult); StoreResult(GPRClass, Op, Result, -1); - auto Zero = _Constant(0); - auto One = _Constant(1); + auto Zero = _InlineConstant(0); + auto One = _InlineConstant(1); auto CFInv = _NZCVSelect(OpSize::i32Bit, {COND_EQ}, One, Zero); InvalidatePF_AF(); @@ -1849,7 +1849,7 @@ void OpDispatchBuilder::RORX(OpcodeArgs) { auto* Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); auto* Result = Src; if (DoRotation) [[likely]] { - Result = _Ror(OpSizeFromSrc(Op), Src, _Constant(Amount)); + Result = _Ror(OpSizeFromSrc(Op), Src, _InlineConstant(Amount)); } StoreResult(GPRClass, Op, Result, -1); @@ -2057,7 +2057,7 @@ void OpDispatchBuilder::RCROp(OpcodeArgs) { return; } - Ref SrcMasked = _And(OpSize, Src, _Constant(Size, Mask)); + Ref SrcMasked = _And(OpSize, Src, _InlineConstant(Mask)); Calculate_ShiftVariable( Op, SrcMasked, [this, Op, Size, OpSize]() { @@ 
-2282,7 +2282,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) { return; } - Ref SrcMasked = _And(OpSize, Src, _Constant(Size, Mask)); + Ref SrcMasked = _And(OpSize, Src, _InlineConstant(Mask)); Calculate_ShiftVariable( Op, SrcMasked, [this, Op, Size, OpSize]() { @@ -2305,7 +2305,7 @@ void OpDispatchBuilder::RCLOp(OpcodeArgs) { SetCFDirect(NewCF, 0, true); // Since Shift != 0 we can inject the CF. Shift absorbs the masking. - Ref CFShl = _Sub(OpSize, Src, _Constant(Size, 1)); + Ref CFShl = _Sub(OpSize, Src, _InlineConstant(1)); auto TmpCF = _Lshl(OpSize, CF, CFShl); Res = _Or(OpSize, Res, TmpCF); From e2d58809edf39db2c64f8e4d4d84615961a6c634 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:34:10 -0400 Subject: [PATCH 13/17] OpcodeDispatcher: rm dead Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index a77b5b3c0b..076231bf00 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -803,11 +803,6 @@ void OpDispatchBuilder::CondJUMPRCXOp(OpcodeArgs) { uint8_t JcxGPRSize = CTX->GetGPRSize(); JcxGPRSize = (Op->Flags & X86Tables::DecodeFlags::FLAG_ADDRESS_SIZE) ? (JcxGPRSize >> 1) : JcxGPRSize; - IRPair TakeBranch; - IRPair DoNotTakeBranch; - TakeBranch = _Constant(1); - DoNotTakeBranch = _Constant(0); - uint64_t Target = Op->PC + Op->InstSize + Op->Src[0].Literal(); Ref CondReg = LoadGPRRegister(X86State::REG_RCX, JcxGPRSize); From 0650dd1992f9f0f916a4a84219d11167f9670b65 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:39:55 -0400 Subject: [PATCH 14/17] ConstProp: defer initialization for blocks that don't need pooling. hopefully that's most of them. 
Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/IR/Passes/ConstProp.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index df97cda3e6..8cfe155709 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -139,7 +139,10 @@ class ConstProp final : public FEXCore::IR::Pass { // Constants are pooled per block. void ConstProp::HandleConstantPools(IREmitter* IREmit, const IRListView& CurrentIR) { const uint32_t SSACount = CurrentIR.GetSSACount(); - fextl::vector Remap(SSACount, NULL); + + // Allocation/initialization deferred until first use, since many multiblocks + // don't have constants leftover after all inlining. + fextl::vector Remap {}; for (auto [BlockNode, BlockIROp] : CurrentIR.GetBlocks()) { for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { @@ -151,11 +154,15 @@ void ConstProp::HandleConstantPools(IREmitter* IREmit, const IRListView& Current uint32_t Value = CurrentIR.GetID(CodeNode).Value; LOGMAN_THROW_A_FMT(Value < SSACount, "def not yet remapped"); + if (Remap.empty()) { + Remap.resize(SSACount, NULL); + } + Remap[Value] = it->second; } else { ConstPool[Op->Constant] = CodeNode; } - } else { + } else if (!Remap.empty()) { const uint8_t NumArgs = IR::GetArgs(IROp->Op); for (uint8_t i = 0; i < NumArgs; ++i) { if (IROp->Args[i].IsInvalid()) { From 36d0a6707097bf3cb0671e85b35469461b3c1f4f Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 11:46:56 -0400 Subject: [PATCH 15/17] ConstProp: don't pool with a hash table hash tables have high overhead! but individual blocks are small, so the quadratic search is _much_ faster in practice. knocks 8% off node. 
Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/IR/Passes/ConstProp.cpp | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index 8cfe155709..e31427dd7b 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -75,8 +75,6 @@ class ConstProp final : public FEXCore::IR::Pass { void HandleConstantPools(IREmitter* IREmit, const IRListView& CurrentIR); void ConstantPropagation(IREmitter* IREmit, const IRListView& CurrentIR, Ref CodeNode, IROp_Header* IROp); - fextl::unordered_map ConstPool; - bool SupportsTSOImm9 {}; const FEXCore::CPUIDEmu* CPUID; @@ -144,23 +142,42 @@ void ConstProp::HandleConstantPools(IREmitter* IREmit, const IRListView& Current // don't have constants leftover after all inlining. fextl::vector Remap {}; + struct Entry { + int64_t Value; + Ref R; + }; + + + fextl::vector Pool {}; + for (auto [BlockNode, BlockIROp] : CurrentIR.GetBlocks()) { + Pool.clear(); + for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { if (IROp->Op == OP_CONSTANT) { auto Op = IROp->C(); - auto it = ConstPool.find(Op->Constant); - - if (it != ConstPool.end()) { - uint32_t Value = CurrentIR.GetID(CodeNode).Value; - LOGMAN_THROW_A_FMT(Value < SSACount, "def not yet remapped"); - - if (Remap.empty()) { - Remap.resize(SSACount, NULL); + bool Found = false; + + // Search for the constant. This is O(n^2) but n is small since it's + // local and most constants are inlined. In practice, it ends up much + // faster than a hash table. 
+ for (auto K : Pool) { + if (K.Value == Op->Constant) { + uint32_t Value = CurrentIR.GetID(CodeNode).Value; + LOGMAN_THROW_A_FMT(Value < SSACount, "def not yet remapped"); + + if (Remap.empty()) { + Remap.resize(SSACount, nullptr); + } + + Remap[Value] = K.R; + Found = true; + break; } + } - Remap[Value] = it->second; - } else { - ConstPool[Op->Constant] = CodeNode; + if (!Found) { + Pool.push_back({.Value = Op->Constant, .R = CodeNode}); } } else if (!Remap.empty()) { const uint8_t NumArgs = IR::GetArgs(IROp->Op); @@ -179,7 +196,6 @@ void ConstProp::HandleConstantPools(IREmitter* IREmit, const IRListView& Current } } } - ConstPool.clear(); } } From 6ac7ba388f384c853c2477fd5bdae905775ad5c3 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 12:01:24 -0400 Subject: [PATCH 16/17] ConstProp: rm leftover Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/IR/Passes/ConstProp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index e31427dd7b..b0816cb3ca 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -189,7 +189,7 @@ void ConstProp::HandleConstantPools(IREmitter* IREmit, const IRListView& Current uint32_t Value = IROp->Args[i].ID().Value; LOGMAN_THROW_A_FMT(Value < SSACount, "src not yet remapped"); - Ref New = Value < SSACount ? Remap[Value] : NULL; + Ref New = Remap[Value]; if (New) { IREmit->ReplaceNodeArgument(CodeNode, i, New); } From 44484a7b05cfd5ec3cd9db4dbb6b22e115b7f301 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 2 Oct 2024 13:59:03 -0400 Subject: [PATCH 17/17] ConstProp: drop non-loadbearing opts every pattern costs us JIT time, but not every pattern is doing anything. especially with the flag rework of 2023-2024, a lot of patterns just don't make sense anymore. constant folding in particular isn't too useful now. 
only instcountci change is AAD which i'm contractually forbidden from caring about. Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/IR/Passes/ConstProp.cpp | 114 +----------------- .../FlagM/Primary_32Bit.json | 5 +- .../InstructionCountCI/Primary_32Bit.json | 5 +- 3 files changed, 10 insertions(+), 114 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index b0816cb3ca..5ead4ffe79 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -52,17 +52,6 @@ static bool IsImmLogical(uint64_t imm, unsigned width) { return ARMEmitter::Emitter::IsImmLogical(imm, width); } -static bool IsBfeAlreadyDone(IREmitter* IREmit, OrderedNodeWrapper src, uint64_t Width) { - auto IROp = IREmit->GetOpHeader(src); - if (IROp->Op == OP_BFE) { - auto Op = IROp->C(); - if (Width >= Op->Width) { - return true; - } - } - return false; -} - class ConstProp final : public FEXCore::IR::Pass { public: explicit ConstProp(bool SupportsTSOImm9, const FEXCore::CPUIDEmu* CPUID) @@ -294,16 +283,6 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current uint64_t NewConstant = (Constant1 & Constant2) & getMask(IROp); IREmit->ReplaceWithConstant(CodeNode, NewConstant); Replaced = true; - } else if (Constant2 == 1) { - // happens from flag calcs - auto val = IREmit->GetOpHeader(IROp->Args[0]); - - uint64_t Constant3; - if (val->Op == OP_SELECT && IREmit->IsValueConstant(val->Args[2], &Constant2) && IREmit->IsValueConstant(val->Args[3], &Constant3) && - Constant2 == 1 && Constant3 == 0) { - IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(IROp->Args[0])); - Replaced = true; - } } else if (IROp->Args[0].ID() == IROp->Args[1].ID() || (Constant2 & getMask(IROp)) == getMask(IROp)) { // AND with same value results in original value IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(IROp->Args[0])); @@ -316,28 +295,13 @@ void 
ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current break; } case OP_OR: { - uint64_t Constant1 {}; - uint64_t Constant2 {}; - - if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) { - uint64_t NewConstant = Constant1 | Constant2; - IREmit->ReplaceWithConstant(CodeNode, NewConstant); - } else if (IROp->Args[0].ID() == IROp->Args[1].ID()) { - // OR with same value results in original value - IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(IROp->Args[0])); - } else { - InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); }); - } + InlineIf(IREmit, CurrentIR, CodeNode, IROp, 1, [&IROp](uint64_t X) { return IsImmLogical(X, IROp->Size * 8); }); break; } case OP_XOR: { uint64_t Constant1 {}; - uint64_t Constant2 {}; - if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) { - uint64_t NewConstant = Constant1 ^ Constant2; - IREmit->ReplaceWithConstant(CodeNode, NewConstant); - } else if (IROp->Args[0].ID() == IROp->Args[1].ID()) { + if (IROp->Args[0].ID() == IROp->Args[1].ID()) { // XOR with same value results to zero IREmit->SetWriteCursor(CodeNode); IREmit->ReplaceAllUsesWith(CodeNode, IREmit->_Constant(0)); @@ -405,19 +369,9 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current break; } case OP_LSHR: { - uint64_t Constant1 {}; uint64_t Constant2 {}; - if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) { - // Shifts mask the shift amount by 63 or 31 depending on operating size; - // The source is masked, which will produce a correctly masked - // destination. Masking the destination without the source instead will - // right-shift garbage into the upper bits instead of zeroes. - Constant1 &= getMask(IROp); - Constant2 &= (IROp->Size == 8 ? 
63 : 31); - uint64_t NewConstant = (Constant1 >> Constant2); - IREmit->ReplaceWithConstant(CodeNode, NewConstant); - } else if (IREmit->IsValueConstant(IROp->Args[1], &Constant2) && Constant2 == 0) { + if (IREmit->IsValueConstant(IROp->Args[1], &Constant2) && Constant2 == 0) { IREmit->SetWriteCursor(CodeNode); Ref Arg = CurrentIR.GetNode(IROp->Args[0]); IREmit->ReplaceAllUsesWith(CodeNode, Arg); @@ -430,46 +384,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current auto Op = IROp->C(); uint64_t Constant; - // Is this value already BFE'd? - if (IsBfeAlreadyDone(IREmit, Op->Src, Op->Width)) { - IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(Op->Src)); - break; - } - - // Is this value already ZEXT'd? - if (Op->lsb == 0) { - // LoadMem, LoadMemTSO & LoadContext ZExt - auto source = Op->Src; - auto sourceHeader = IREmit->GetOpHeader(source); - - if (Op->Width >= (sourceHeader->Size * 8) && - (sourceHeader->Op == OP_LOADMEM || sourceHeader->Op == OP_LOADMEMTSO || sourceHeader->Op == OP_LOADCONTEXT)) { - // Load mem / load ctx zexts, no need to vmem - IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(source)); - break; - } - } - if (IROp->Size <= 8 && IREmit->IsValueConstant(Op->Src, &Constant)) { uint64_t SourceMask = Op->Width == 64 ? 
~0ULL : ((1ULL << Op->Width) - 1); SourceMask <<= Op->lsb; uint64_t NewConstant = (Constant & SourceMask) >> Op->lsb; IREmit->ReplaceWithConstant(CodeNode, NewConstant); - } else if (IROp->Size == CurrentIR.GetOp(IROp->Args[0])->Size && Op->Width == (IROp->Size * 8) && Op->lsb == 0) { - // A BFE that extracts all bits results in original value - // XXX - This is broken for now - see https://github.com/FEX-Emu/FEX/issues/351 - // IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(IROp->Args[0])); - } else if (Op->Width == 1 && Op->lsb == 0) { - // common from flag codegen - auto val = IREmit->GetOpHeader(IROp->Args[0]); - - uint64_t Constant2 {}; - uint64_t Constant3 {}; - if (val->Op == OP_SELECT && IREmit->IsValueConstant(val->Args[2], &Constant2) && IREmit->IsValueConstant(val->Args[3], &Constant3) && - Constant2 == 1 && Constant3 == 0) { - IREmit->ReplaceAllUsesWith(CodeNode, CurrentIR.GetNode(IROp->Args[0])); - } } break; @@ -494,18 +414,10 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current } case OP_BFI: { auto Op = IROp->C(); - uint64_t ConstantDest {}; uint64_t ConstantSrc {}; - bool DestIsConstant = IREmit->IsValueConstant(IROp->Args[0], &ConstantDest); bool SrcIsConstant = IREmit->IsValueConstant(IROp->Args[1], &ConstantSrc); - if (DestIsConstant && SrcIsConstant) { - uint64_t SourceMask = Op->Width == 64 ? ~0ULL : ((1ULL << Op->Width) - 1); - uint64_t NewConstant = ConstantDest & ~(SourceMask << Op->lsb); - NewConstant |= (ConstantSrc & SourceMask) << Op->lsb; - - IREmit->ReplaceWithConstant(CodeNode, NewConstant); - } else if (SrcIsConstant && HasConsecutiveBits(ConstantSrc, Op->Width)) { + if (SrcIsConstant && HasConsecutiveBits(ConstantSrc, Op->Width)) { // We are trying to insert constant, if it is a bitfield of only set bits then we can orr or and it. IREmit->SetWriteCursor(CodeNode); uint64_t SourceMask = Op->Width == 64 ? 
~0ULL : ((1ULL << Op->Width) - 1); @@ -522,24 +434,6 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current } break; } - case OP_MUL: { - uint64_t Constant1 {}; - uint64_t Constant2 {}; - - if (IREmit->IsValueConstant(IROp->Args[0], &Constant1) && IREmit->IsValueConstant(IROp->Args[1], &Constant2)) { - uint64_t NewConstant = (Constant1 * Constant2) & getMask(IROp); - IREmit->ReplaceWithConstant(CodeNode, NewConstant); - } else if (IREmit->IsValueConstant(IROp->Args[1], &Constant2) && std::popcount(Constant2) == 1) { - if (IROp->Size == 4 || IROp->Size == 8) { - uint64_t amt = std::countr_zero(Constant2); - IREmit->SetWriteCursor(CodeNode); - auto shift = IREmit->_Lshl(IR::SizeToOpSize(IROp->Size), CurrentIR.GetNode(IROp->Args[0]), IREmit->_Constant(amt)); - IREmit->ReplaceAllUsesWith(CodeNode, shift); - } - } - break; - } - case OP_VMOV: { // elim from load mem auto source = IROp->Args[0]; diff --git a/unittests/InstructionCountCI/FlagM/Primary_32Bit.json b/unittests/InstructionCountCI/FlagM/Primary_32Bit.json index 73698a9d22..50eccd1ed0 100644 --- a/unittests/InstructionCountCI/FlagM/Primary_32Bit.json +++ b/unittests/InstructionCountCI/FlagM/Primary_32Bit.json @@ -352,14 +352,15 @@ ] }, "db 0xd5, 0x40": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 8, "Comment": [ "aad with a different immediate byte base", "0xd5" ], "ExpectedArm64ASM": [ "lsr w20, w4, #8", - "lsl x20, x20, #6", + "mov w21, #0x40", + "mul x20, x20, x21", "add x20, x4, x20", "and x26, x20, #0xff", "bfxil w4, w26, #0, #16", diff --git a/unittests/InstructionCountCI/Primary_32Bit.json b/unittests/InstructionCountCI/Primary_32Bit.json index b0945d1a90..bae7f66116 100644 --- a/unittests/InstructionCountCI/Primary_32Bit.json +++ b/unittests/InstructionCountCI/Primary_32Bit.json @@ -373,14 +373,15 @@ ] }, "db 0xd5, 0x40": { - "ExpectedInstructionCount": 9, + "ExpectedInstructionCount": 10, "Comment": [ "aad with a different immediate byte base", "0xd5" ], 
"ExpectedArm64ASM": [ "lsr w20, w4, #8", - "lsl x20, x20, #6", + "mov w21, #0x40", + "mul x20, x20, x21", "add x20, x4, x20", "and x26, x20, #0xff", "bfxil w4, w26, #0, #16",