Skip to content

[AMDGPU] NFC: Add BBLiveOutMap & LiveOut Cache #93089

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 50 additions & 8 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ static cl::opt<bool>
"Wave Limited (amdgpu-limit-wave-threshold)."),
cl::init(false));

static cl::opt<bool> GCNTrackers(
"amdgpu-use-amdgpu-trackers", cl::Hidden,
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));

const unsigned ScheduleMetrics::ScaleFactor = 100;

GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
Expand Down Expand Up @@ -484,7 +489,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
RegionLiveOuts(this, /*IsLiveOut=*/true) {

LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
if (RelaxedOcc) {
Expand Down Expand Up @@ -526,6 +532,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
return RPTracker.moveMaxPressure();
}

static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
MachineBasicBlock::iterator RegionEnd) {
auto REnd = RegionEnd == RegionBegin->getParent()->end()
? std::prev(RegionEnd)
: RegionEnd;
return &*skipDebugInstructionsBackward(REnd, RegionBegin);
}

void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);
Expand Down Expand Up @@ -600,20 +614,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
GCNScheduleDAGMILive::getRegionLiveInMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> BBStarters;
BBStarters.reserve(Regions.size());
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
auto *BB = I->first->getParent();
do {
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
BBStarters.push_back(MI);
RegionFirstMIs.push_back(MI);
do {
++I;
} while (I != E && I->first->getParent() == BB);
} while (I != E);
return getLiveRegMap(BBStarters, false /*After*/, *LIS);
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveOutMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionLastMIs;
RegionLastMIs.reserve(Regions.size());
for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));

return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}

void RegionPressureMap::buildLiveRegMap() {
IdxToInstruction.clear();

BBLiveRegMap =
IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
for (unsigned I = 0; I < DAG->Regions.size(); I++) {
MachineInstr *RegionKey =
IsLiveOut
? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
: &*DAG->Regions[I].first;
IdxToInstruction[I] = RegionKey;
}
}

void GCNScheduleDAGMILive::finalizeSchedule() {
Expand All @@ -639,8 +678,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
if (!Regions.empty()) {
BBLiveInMap = getRegionLiveInMap();
if (GCNTrackers)
RegionLiveOuts.buildLiveRegMap();
}

GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
Expand Down
42 changes: 41 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,13 +163,40 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
return OS;
}

class GCNScheduleDAGMILive;
class RegionPressureMap {
GCNScheduleDAGMILive *DAG;
// The live in/out pressure as indexed by the first or last MI in the region
// before scheduling.
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveRegMap;
Comment on lines +169 to +171
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm worried this is making the pressure tracking even more complicated and expensive. LiveRegSet already felt wrong, and this adds another level on top of it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is adding an instance of DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> . However, this PR is not introducing this new level above GCNRPTracker::LiveRegSet, as it already exists in our code

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
, so I'm not sure that that particular class is unduly complicating our tracking. Also, this cache is used to avoid recalculation of the liveouts across stages. Obviously, there are some costs to creating the map, but I assume it is making things less expensive in general.

Perhaps more importantly, I don't think there is an alternative to this besides deleting the cache. Implementing a LIS->getLiveRegsAt(SlotIndex) with cached results would probably be counterproductive due to the maintenance required for each call to SlotIndex insertMachineInstrInMaps during scheduling. We could introduce and cache GCNPressureDiffs (instead of the new trackers) which would use our more accurate RP costs to encode the PDiff. But this is ultimately just a different way to build the cache, and I'm not sure this is worth a complete redesign (especially since there may be issues with inconsistent state between tracker and pdiffs).

// The mapping of RegionIDx to key instruction
DenseMap<unsigned, MachineInstr *> IdxToInstruction;
// Whether we are calculating LiveOuts or LiveIns
bool IsLiveOut;

public:
RegionPressureMap() {}
RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
: DAG(GCNDAG), IsLiveOut(LiveOut) {}
// Build the Instr->LiveReg and RegionIdx->Instr maps
void buildLiveRegMap();

// Retrieve the LiveReg for a given RegionIdx
GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) {
assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end());
MachineInstr *Key = IdxToInstruction[RegionIdx];
return BBLiveRegMap[Key];
}
};

class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
friend class UnclusteredHighRPStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;
friend class ILPInitialScheduleStage;
friend class RegionPressureMap;

const GCNSubtarget &ST;

Expand Down Expand Up @@ -211,9 +238,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;

// The map of the initial first region instruction to region live in registers
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;

DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
// Calculate the map of the initial first region instruction to region live in
// registers
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const;

// Calculate the map of the initial last region instruction to region live out
// registers
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
getRegionLiveOutMap() const;

// The live out registers per region. These are internally stored as a map of
// the initial last region instruction to region live out registers, but can
// be retreived with the regionIdx by calls to getLiveRegsForRegionIdx.
RegionPressureMap RegionLiveOuts;

// Return current region pressure.
GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
Expand Down
Loading