1//===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "SchedClassResolution.h"
10#include "BenchmarkResult.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/MC/MCAsmInfo.h"
13#include "llvm/MCA/Support.h"
14#include "llvm/Support/FormatVariadic.h"
15#include <vector>
16
17namespace llvm {
18namespace exegesis {
19
20// Return the non-redundant list of WriteProcRes used by the given sched class.
21// The scheduling model for LLVM is such that each instruction has a certain
22// number of uops which consume resources which are described by WriteProcRes
23// entries. Each entry describe how many cycles are spent on a specific ProcRes
24// kind.
25// For example, an instruction might have 3 uOps, one dispatching on P0
26// (ProcResIdx=1) and two on P06 (ProcResIdx = 7).
27// Note that LLVM additionally denormalizes resource consumption to include
28// usage of super resources by subresources. So in practice if there exists a
29// P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
30// P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed
31// by P06 are also consumed by P016. In the figure below, parenthesized cycles
32// denote implied usage of superresources by subresources:
33// P0 P06 P016
34// uOp1 1 (1) (1)
35// uOp2 1 (1)
36// uOp3 1 (1)
37// =============================
38// 1 3 3
39// Eventually we end up with three entries for the WriteProcRes of the
40// instruction:
41// {ProcResIdx=1, Cycles=1} // P0
42// {ProcResIdx=7, Cycles=3} // P06
43// {ProcResIdx=10, Cycles=3} // P016
44//
45// Note that in this case, P016 does not contribute any cycles, so it would
46// be removed by this function.
47// FIXME: Merge this with the equivalent in llvm-mca.
48static SmallVector<MCWriteProcResEntry, 8>
49getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc,
50 const MCSubtargetInfo &STI) {
51 SmallVector<MCWriteProcResEntry, 8> Result;
52 const auto &SM = STI.getSchedModel();
53 const unsigned NumProcRes = SM.getNumProcResourceKinds();
54
55 // Collect resource masks.
56 SmallVector<uint64_t> ProcResourceMasks(NumProcRes);
57 mca::computeProcResourceMasks(SM, Masks: ProcResourceMasks);
58
59 // Sort entries by smaller resources for (basic) topological ordering.
60 using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>;
61 SmallVector<ResourceMaskAndEntry, 8> ResourceMaskAndEntries;
62 for (const auto *WPR = STI.getWriteProcResBegin(SC: &SCDesc),
63 *const WPREnd = STI.getWriteProcResEnd(SC: &SCDesc);
64 WPR != WPREnd; ++WPR) {
65 uint64_t Mask = ProcResourceMasks[WPR->ProcResourceIdx];
66 ResourceMaskAndEntries.push_back(Elt: {Mask, WPR});
67 }
68 sort(C&: ResourceMaskAndEntries,
69 Comp: [](const ResourceMaskAndEntry &A, const ResourceMaskAndEntry &B) {
70 unsigned popcntA = popcount(Value: A.first);
71 unsigned popcntB = popcount(Value: B.first);
72 return std::tie(args&: popcntA, args: A.first) < std::tie(args&: popcntB, args: B.first);
73 });
74
75 SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
76 for (const ResourceMaskAndEntry &Entry : ResourceMaskAndEntries) {
77 const MCWriteProcResEntry *WPR = Entry.second;
78 const MCProcResourceDesc *const ProcResDesc =
79 SM.getProcResource(ProcResourceIdx: WPR->ProcResourceIdx);
80 // TODO: Handle AcquireAtAtCycle in llvm-exegesis and llvm-mca. See
81 // https://github.com/llvm/llvm-project/issues/62680 and
82 // https://github.com/llvm/llvm-project/issues/62681
83 assert(WPR->AcquireAtCycle == 0 &&
84 "`llvm-exegesis` does not handle AcquireAtCycle > 0");
85 if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
86 // This is a ProcResUnit.
87 Result.push_back(
88 Elt: {.ProcResourceIdx: WPR->ProcResourceIdx, .ReleaseAtCycle: WPR->ReleaseAtCycle, .AcquireAtCycle: WPR->AcquireAtCycle});
89 ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->ReleaseAtCycle;
90 } else {
91 // This is a ProcResGroup. First see if it contributes any cycles or if
92 // it has cycles just from subunits.
93 float RemainingCycles = WPR->ReleaseAtCycle;
94 for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
95 SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
96 ++SubResIdx) {
97 RemainingCycles -= ProcResUnitUsage[*SubResIdx];
98 }
99 if (RemainingCycles < 0.01f) {
100 // The ProcResGroup contributes no cycles of its own.
101 continue;
102 }
103 // The ProcResGroup contributes `RemainingCycles` cycles of its own.
104 Result.push_back(Elt: {.ProcResourceIdx: WPR->ProcResourceIdx,
105 .ReleaseAtCycle: static_cast<uint16_t>(std::round(x: RemainingCycles)),
106 .AcquireAtCycle: WPR->AcquireAtCycle});
107 // Spread the remaining cycles over all subunits.
108 for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
109 SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
110 ++SubResIdx) {
111 ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
112 }
113 }
114 }
115 return Result;
116}
117
118// Distributes a pressure budget as evenly as possible on the provided subunits
119// given the already existing port pressure distribution.
120//
121// The algorithm is as follows: while there is remaining pressure to
122// distribute, find the subunits with minimal pressure, and distribute
123// remaining pressure equally up to the pressure of the unit with
124// second-to-minimal pressure.
125// For example, let's assume we want to distribute 2*P1256
126// (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
127// DensePressure = P0 P1 P2 P3 P4 P5 P6 P7
128// 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5
129// RemainingPressure = 2.0
130// We sort the subunits by pressure:
131// Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)]
132// We'll first start by the subunits with minimal pressure, which are at
133// the beginning of the sorted array. In this example there is one (P2).
134// The subunit with second-to-minimal pressure is the next one in the
135// array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
136// from the budget.
137// Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)]
138// RemainingPressure = 1.9
139// We repeat this process: distribute 0.2 pressure on each of the minimal
140// P2 and P1, decrease budget by 2*0.2:
141// Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)]
142// RemainingPressure = 1.5
143// There are no second-to-minimal subunits so we just share the remaining
144// budget (1.5 cycles) equally:
145// Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)]
146// RemainingPressure = 0.0
147// We stop as there is no remaining budget to distribute.
148static void distributePressure(float RemainingPressure,
149 SmallVector<uint16_t, 32> Subunits,
150 SmallVector<float, 32> &DensePressure) {
151 // Find the number of subunits with minimal pressure (they are at the
152 // front).
153 sort(C&: Subunits, Comp: [&DensePressure](const uint16_t A, const uint16_t B) {
154 return DensePressure[A] < DensePressure[B];
155 });
156 const auto getPressureForSubunit = [&DensePressure,
157 &Subunits](size_t I) -> float & {
158 return DensePressure[Subunits[I]];
159 };
160 size_t NumMinimalSU = 1;
161 while (NumMinimalSU < Subunits.size() &&
162 getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
163 ++NumMinimalSU;
164 }
165 while (RemainingPressure > 0.0f) {
166 if (NumMinimalSU == Subunits.size()) {
167 // All units are minimal, just distribute evenly and be done.
168 for (size_t I = 0; I < NumMinimalSU; ++I) {
169 getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
170 }
171 return;
172 }
173 // Distribute the remaining pressure equally.
174 const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
175 const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
176 assert(MinimalPressure < SecondToMinimalPressure);
177 const float Increment = SecondToMinimalPressure - MinimalPressure;
178 if (RemainingPressure <= NumMinimalSU * Increment) {
179 // There is not enough remaining pressure.
180 for (size_t I = 0; I < NumMinimalSU; ++I) {
181 getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
182 }
183 return;
184 }
185 // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
186 for (size_t I = 0; I < NumMinimalSU; ++I) {
187 getPressureForSubunit(I) = SecondToMinimalPressure;
188 RemainingPressure -= SecondToMinimalPressure;
189 }
190 while (NumMinimalSU < Subunits.size() &&
191 getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
192 ++NumMinimalSU;
193 }
194 }
195}
196
197std::vector<std::pair<uint16_t, float>>
198computeIdealizedProcResPressure(const MCSchedModel &SM,
199 SmallVector<MCWriteProcResEntry, 8> WPRS) {
200 // DensePressure[I] is the port pressure for Proc Resource I.
201 SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
202 sort(C&: WPRS, Comp: [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) {
203 return A.ProcResourceIdx < B.ProcResourceIdx;
204 });
205 for (const MCWriteProcResEntry &WPR : WPRS) {
206 // Get units for the entry.
207 const MCProcResourceDesc *const ProcResDesc =
208 SM.getProcResource(ProcResourceIdx: WPR.ProcResourceIdx);
209 if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
210 // This is a ProcResUnit.
211 DensePressure[WPR.ProcResourceIdx] += WPR.ReleaseAtCycle;
212 } else {
213 // This is a ProcResGroup.
214 SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
215 ProcResDesc->SubUnitsIdxBegin +
216 ProcResDesc->NumUnits);
217 distributePressure(RemainingPressure: WPR.ReleaseAtCycle, Subunits, DensePressure);
218 }
219 }
220 // Turn dense pressure into sparse pressure by removing zero entries.
221 std::vector<std::pair<uint16_t, float>> Pressure;
222 for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
223 if (DensePressure[I] > 0.0f)
224 Pressure.emplace_back(args&: I, args&: DensePressure[I]);
225 }
226 return Pressure;
227}
228
229ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI,
230 unsigned ResolvedSchedClassId,
231 bool WasVariant)
232 : SchedClassId(ResolvedSchedClassId),
233 SCDesc(STI.getSchedModel().getSchedClassDesc(SchedClassIdx: ResolvedSchedClassId)),
234 WasVariant(WasVariant),
235 NonRedundantWriteProcRes(getNonRedundantWriteProcRes(SCDesc: *SCDesc, STI)),
236 IdealizedProcResPressure(computeIdealizedProcResPressure(
237 SM: STI.getSchedModel(), WPRS: NonRedundantWriteProcRes)) {
238 assert((SCDesc == nullptr || !SCDesc->isVariant()) &&
239 "ResolvedSchedClass should never be variant");
240}
241
242static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI,
243 const MCInstrInfo &InstrInfo,
244 unsigned SchedClassId,
245 const MCInst &MCI) {
246 const auto &SM = STI.getSchedModel();
247 while (SchedClassId && SM.getSchedClassDesc(SchedClassIdx: SchedClassId)->isVariant()) {
248 SchedClassId = STI.resolveVariantSchedClass(SchedClass: SchedClassId, MI: &MCI, MCII: &InstrInfo,
249 CPUID: SM.getProcessorID());
250 }
251 return SchedClassId;
252}
253
254std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/>
255ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo,
256 const MCInstrInfo &InstrInfo,
257 const MCInst &MCI) {
258 unsigned SchedClassId = InstrInfo.get(Opcode: MCI.getOpcode()).getSchedClass();
259 const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel()
260 .getSchedClassDesc(SchedClassIdx: SchedClassId)
261 ->isVariant();
262 SchedClassId =
263 ResolveVariantSchedClassId(STI: SubtargetInfo, InstrInfo, SchedClassId, MCI);
264 return std::make_pair(x&: SchedClassId, y: WasVariant);
265}
266
267// Returns a ProxResIdx by id or name.
268static unsigned findProcResIdx(const MCSubtargetInfo &STI,
269 const StringRef NameOrId) {
270 // Interpret the key as an ProcResIdx.
271 unsigned ProcResIdx = 0;
272 if (to_integer(S: NameOrId, Num&: ProcResIdx, Base: 10))
273 return ProcResIdx;
274 // Interpret the key as a ProcRes name.
275 const auto &SchedModel = STI.getSchedModel();
276 for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) {
277 if (NameOrId == SchedModel.getProcResource(ProcResourceIdx: I)->Name)
278 return I;
279 }
280 return 0;
281}
282
283std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint(
284 Benchmark::ModeE Mode, const MCSubtargetInfo &STI,
285 ArrayRef<PerInstructionStats> Representative) const {
286 const size_t NumMeasurements = Representative.size();
287
288 std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
289
290 if (Mode == Benchmark::Latency) {
291 assert(NumMeasurements == 1 && "Latency is a single measure.");
292 BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0];
293
294 // Find the latency.
295 LatencyMeasure.PerInstructionValue = 0.0;
296
297 for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) {
298 const MCWriteLatencyEntry *const WLE =
299 STI.getWriteLatencyEntry(SC: SCDesc, DefIdx: I);
300 LatencyMeasure.PerInstructionValue =
301 std::max<double>(a: LatencyMeasure.PerInstructionValue, b: WLE->Cycles);
302 }
303 } else if (Mode == Benchmark::Uops) {
304 for (auto I : zip(t&: SchedClassPoint, u&: Representative)) {
305 BenchmarkMeasure &Measure = std::get<0>(t&: I);
306 const PerInstructionStats &Stats = std::get<1>(t&: I);
307
308 StringRef Key = Stats.key();
309 uint16_t ProcResIdx = findProcResIdx(STI, NameOrId: Key);
310 if (ProcResIdx > 0) {
311 // Find the pressure on ProcResIdx `Key`.
312 const auto ProcResPressureIt =
313 find_if(Range: IdealizedProcResPressure,
314 P: [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
315 return WPR.first == ProcResIdx;
316 });
317 Measure.PerInstructionValue =
318 ProcResPressureIt == IdealizedProcResPressure.end()
319 ? 0.0
320 : ProcResPressureIt->second;
321 } else if (Key == "NumMicroOps") {
322 Measure.PerInstructionValue = SCDesc->NumMicroOps;
323 } else {
324 errs() << "expected `key` to be either a ProcResIdx or a ProcRes "
325 "name, got "
326 << Key << "\n";
327 return {};
328 }
329 }
330 } else if (Mode == Benchmark::InverseThroughput) {
331 assert(NumMeasurements == 1 && "Inverse Throughput is a single measure.");
332 BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0];
333
334 RThroughputMeasure.PerInstructionValue =
335 MCSchedModel::getReciprocalThroughput(STI, SCDesc: *SCDesc);
336 } else {
337 llvm_unreachable("unimplemented measurement matching mode");
338 }
339
340 return SchedClassPoint;
341}
342
343} // namespace exegesis
344} // namespace llvm
345