[MachineSink] Extend loop sinking capability (llvm#117247)
The current MIR cycle sinking capabilities are rather limited. They only
support sinking copies into a single successor block, and only within
certain limits.

This opt-in feature adds a more aggressive option that is not subject to
those restrictions. It tries to "sink" by duplicating any top-level
preheader instruction (that we are sure is safe to sink) into any user
block, then performs dead-code cleanup. In particular, this is useful in
high register pressure (RP) situations when loop bodies have control flow.
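
As a schematic illustration (hypothetical, simplified MIR; the opcode,
block names, and register numbers are invented for the example), the
transform clones a safe preheader def into every using block inside the
cycle and erases the dead original, so the value no longer stays live
across the whole cycle:

    ; Before: %0 is defined in the preheader and live across the cycle.
    preheader:
      %0 = MOV 42              ; safe to duplicate
    body.a:                    ; inside the cycle
      ... = USE %0
    body.b:                    ; inside the cycle
      ... = USE %0

    ; After: each user block gets its own clone with a fresh vreg, and
    ; the now-dead preheader instruction is deleted.
    body.a:
      %1 = MOV 42
      ... = USE %1
    body.b:
      %2 = MOV 42
      ... = USE %2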
jrbyrnes authored Jan 24, 2025
1 parent 24f177d commit acb7859
Showing 5 changed files with 1,530 additions and 168 deletions.
267 changes: 176 additions & 91 deletions llvm/lib/CodeGen/MachineSink.cpp
@@ -45,6 +45,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -113,6 +114,8 @@ STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");

using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

namespace {

class MachineSinking : public MachineFunctionPass {
@@ -128,6 +131,7 @@ class MachineSinking : public MachineFunctionPass {
const MachineBranchProbabilityInfo *MBPI = nullptr;
AliasAnalysis *AA = nullptr;
RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;

// Remember which edges have been considered for breaking.
SmallSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>, 8>
@@ -161,6 +165,8 @@ class MachineSinking : public MachineFunctionPass {
/// would re-order assignments.
using SeenDbgUser = PointerIntPair<MachineInstr *, 1>;

using SinkItem = std::pair<MachineInstr *, MachineBasicBlock *>;

/// Record of DBG_VALUE uses of vregs in a block, so that we can identify
/// debug instructions to sink.
SmallDenseMap<unsigned, TinyPtrVector<SeenDbgUser>> SeenDbgUsers;
@@ -255,7 +261,10 @@ class MachineSinking : public MachineFunctionPass {

void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
SmallVectorImpl<MachineInstr *> &Candidates);
bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);

bool
aggressivelySinkIntoCycle(MachineCycle *Cycle, MachineInstr &I,
DenseMap<SinkItem, MachineInstr *> &SunkInstrs);

bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -271,11 +280,14 @@
GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const;

std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB);
std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB,
bool UseCache = true);

bool registerPressureSetExceedsLimit(unsigned NRegs,
const TargetRegisterClass *RC,
const MachineBasicBlock &MBB);

bool registerPressureExceedsLimit(const MachineBasicBlock &MBB);
};

} // end anonymous namespace
@@ -680,6 +692,10 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
if (MI.isMetaInstruction()) {
LLVM_DEBUG(dbgs() << "CycleSink: not sinking meta instruction\n");
continue;
}
if (!TII->shouldSink(MI)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
"target\n");
@@ -775,31 +791,62 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {

if (SinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
continue;
}
SmallVector<MachineInstr *, 8> Candidates;
FindCycleSinkCandidates(Cycle, Preheader, Candidates);

// Walk the candidates in reverse order so that we start with the use
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
unsigned i = 0;
for (MachineInstr *I : llvm::reverse(Candidates)) {
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
"be analysed.");
break;
SchedModel.init(STI);
bool HasHighPressure;

DenseMap<SinkItem, MachineInstr *> SunkInstrs;

enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
++Stage, SunkInstrs.clear()) {
HasHighPressure = false;

for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
continue;
}
SmallVector<MachineInstr *, 8> Candidates;
FindCycleSinkCandidates(Cycle, Preheader, Candidates);

unsigned i = 0;

// Walk the candidates in reverse order so that we start with the use
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
for (MachineInstr *I : llvm::reverse(Candidates)) {
// CycleSinkStage::COPY: Sink a limited number of copies
if (Stage == CycleSinkStage::COPY) {
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs()
<< "CycleSink: Limit reached of instructions to "
"be analyzed.");
break;
}

if (!I->isCopy())
continue;
}

if (!SinkIntoCycle(Cycle, *I))
break;
EverMadeChange = true;
++NumCycleSunk;
// CycleSinkStage::LOW_LATENCY: sink unlimited number of instructions
// which the target specifies as low-latency
if (Stage == CycleSinkStage::LOW_LATENCY &&
!TII->hasLowDefLatency(SchedModel, *I, 0))
continue;

if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
break;
EverMadeChange = true;
++NumCycleSunk;
}

// Recalculate the pressure after sinking
if (!HasHighPressure)
HasHighPressure = registerPressureExceedsLimit(*Preheader);
}
if (!HasHighPressure)
break;
}
}

@@ -1055,13 +1102,15 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
}

std::vector<unsigned> &
MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB,
bool UseCache) {
// Currently, to save compile time, an MBB's register pressure will not change
// within one ProcessBlock iteration because of CachedRegisterPressure, but its
// register pressure does change after sinking any instruction into it.
// FIXME: need an accurate and cheap register pressure estimate model here.

auto RP = CachedRegisterPressure.find(&MBB);
if (RP != CachedRegisterPressure.end())
if (UseCache && RP != CachedRegisterPressure.end())
return RP->second;

RegionPressure Pressure;
@@ -1085,6 +1134,12 @@ MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
}

RPTracker.closeRegion();

if (RP != CachedRegisterPressure.end()) {
CachedRegisterPressure[&MBB] = RPTracker.getPressure().MaxSetPressure;
return CachedRegisterPressure[&MBB];
}

auto It = CachedRegisterPressure.insert(
std::make_pair(&MBB, RPTracker.getPressure().MaxSetPressure));
return It.first->second;
@@ -1103,6 +1158,21 @@ bool MachineSinking::registerPressureSetExceedsLimit(
return false;
}

// Recalculate RP and check if any pressure set exceeds the set limit.
bool MachineSinking::registerPressureExceedsLimit(
const MachineBasicBlock &MBB) {
std::vector<unsigned> BBRegisterPressure = getBBRegisterPressure(MBB, false);

for (unsigned PS = 0; PS < BBRegisterPressure.size(); ++PS) {
if (BBRegisterPressure[PS] >=
TRI->getRegPressureSetLimit(*MBB.getParent(), PS)) {
return true;
}
}

return false;
}

/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -1581,83 +1651,98 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}

/// Sink instructions into cycles if profitable. This especially tries to
/// prevent register spills caused by register pressure if there is little to no
/// overhead moving instructions into cycles.
bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) {
LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I);
/// Aggressively sink instructions into cycles. This will aggressively try to
/// sink all instructions in the top-most preheaders in an attempt to reduce RP.
/// In particular, it will sink into multiple successor blocks without limits
/// based on the amount of sinking, or the type of ops being sunk (so long as
/// they are safe to sink).
bool MachineSinking::aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
DenseMap<SinkItem, MachineInstr *> &SunkInstrs) {
// TODO: support instructions with multiple defs
if (I.getNumDefs() > 1)
return false;

LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
MachineBasicBlock *SinkBlock = nullptr;
bool CanSink = true;
const MachineOperand &MO = I.getOperand(0);

for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing use: " << MI);
if (!Cycle->contains(MI.getParent())) {
LLVM_DEBUG(dbgs() << "CycleSink: Use not in cycle, can't sink.\n");
CanSink = false;
break;
}
SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;

// FIXME: Come up with a proper cost model that estimates whether sinking
// the instruction (and thus possibly executing it on every cycle
// iteration) is more expensive than a register.
// For now assumes that copies are cheap and thus almost always worth it.
if (!MI.isCopy()) {
LLVM_DEBUG(dbgs() << "CycleSink: Use is not a copy\n");
CanSink = false;
break;
MachineOperand &DefMO = I.getOperand(0);
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
}

for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
MachineInstr *MI = Entry.second;
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
if (MI->isPHI()) {
LLVM_DEBUG(
dbgs() << "AggressiveCycleSink: Not attempting to sink for PHI.\n");
continue;
}
if (!SinkBlock) {
SinkBlock = MI.getParent();
LLVM_DEBUG(dbgs() << "CycleSink: Setting sink block to: "
<< printMBBReference(*SinkBlock) << "\n");
// We cannot sink before the prologue
if (MI->isPosition() || TII->isBasicBlockPrologue(*MI)) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Use is BasicBlock prologue, "
"can't sink.\n");
continue;
}
SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent());
if (!SinkBlock) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't find nearest dominator\n");
CanSink = false;
break;
if (!Cycle->contains(MI->getParent())) {
LLVM_DEBUG(
dbgs() << "AggressiveCycleSink: Use not in cycle, can't sink.\n");
continue;
}
LLVM_DEBUG(dbgs() << "CycleSink: Setting nearest common dom block: "
<< printMBBReference(*SinkBlock) << "\n");
}

if (!CanSink) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n");
return false;
}
if (!SinkBlock) {
LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n");
return false;
}
if (SinkBlock == Preheader) {
LLVM_DEBUG(
dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
return false;
}
if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) {
LLVM_DEBUG(
dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n");
return false;
}
MachineBasicBlock *SinkBlock = MI->getParent();
MachineInstr *NewMI = nullptr;
SinkItem MapEntry(&I, SinkBlock);

auto SI = SunkInstrs.find(MapEntry);

// Check for the case in which we have already sunk a copy of this
// instruction into the user block.
if (SI != SunkInstrs.end()) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
<< printMBBReference(*SinkBlock) << "\n");
NewMI = SI->second;
}

LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n");
SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
I);
// Create a copy of the instruction in the use block.
if (!NewMI) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
<< printMBBReference(*SinkBlock) << "\n");

NewMI = I.getMF()->CloneMachineInstr(&I);
if (DefMO.getReg().isVirtual()) {
const TargetRegisterClass *TRC = MRI->getRegClass(DefMO.getReg());
Register DestReg = MRI->createVirtualRegister(TRC);
NewMI->substituteRegister(DefMO.getReg(), DestReg, DefMO.getSubReg(),
*TRI);
}
SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
NewMI);
SunkInstrs.insert({MapEntry, NewMI});
}

// Conservatively clear any kill flags on uses of sunk instruction
for (MachineOperand &MO : I.operands()) {
if (MO.isReg() && MO.readsReg())
// Conservatively clear any kill flags on uses of sunk instruction
for (MachineOperand &MO : NewMI->all_uses()) {
assert(MO.isReg() && MO.isUse());
RegsToClearKillFlags.insert(MO.getReg());
}
}

// The instruction is moved from its basic block, so do not retain the
// debug information.
assert(!I.isDebugInstr() && "Should not sink debug inst");
I.setDebugLoc(DebugLoc());
// The instruction is moved from its basic block, so do not retain the
// debug information.
assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
NewMI->setDebugLoc(DebugLoc());

// Replace the use with the newly created virtual register.
RegSubRegPair &UseReg = Entry.first;
MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
UseReg.SubReg, *TRI);
}
// If we have replaced all uses, then delete the dead instruction
if (I.isDead(*MRI))
I.eraseFromParent();
return true;
}
