//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to avoid breaking
// instructions such as the x87 ones.

// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read from and write
// to the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8, rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write the same operand, e.g. ADC16rr reads and
// writes EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
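// For instance (illustrative snippet, not generated verbatim by this file),
// even with distinct explicit registers:
//    adc cx, dx
//    adc si, di
// the instructions stay serialized through the implicit read-modify-write of
// EFLAGS.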

namespace llvm {
namespace exegesis {

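// Returns true if the instruction has at least one variable whose operands
// are tied together, i.e. an explicit register that is both read and written
// by the same instruction.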
static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      return true;
  return false;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

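// Assigns each instruction's memory operands a different offset from the
// scratch space pointer, so that repeated instructions access distinct
// addresses. If there are fewer than kMinNumDifferentAddresses instructions,
// the list is padded with copies of the existing ones at fresh offsets.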
void ParallelSnippetGenerator::instantiateMemoryOperands(
    const MCRegister ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (!ScratchSpacePointerInReg)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

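// How explicit register operands are assigned across the generated
// instructions: pick a fresh random register every time, reuse one fixed
// register per operand position, or reuse a single fixed register for all
// positions (see getDescription() below).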
enum class RegRandomizationStrategy : uint8_t {
  PickRandomRegs,
  SingleStaticRegPerOperand,
  SingleStaticReg,

  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};

} // namespace exegesis

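// Mark the strategy enum as iterable so that it can be enumerated with
// llvm::enum_seq (see llvm/ADT/Sequence.h).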
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};

namespace exegesis {

const char *getDescription(RegRandomizationStrategy S) {
  switch (S) {
  case RegRandomizationStrategy::PickRandomRegs:
    return "randomizing registers";
  case RegRandomizationStrategy::SingleStaticRegPerOperand:
    return "one unique register for each position";
  case RegRandomizationStrategy::SingleStaticReg:
    return "reusing the same register for all positions";
  }
  llvm_unreachable("Unknown RegRandomizationStrategy enum");
}

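// Picks a register for the explicit register operand Op of IT, avoiding
// overlap between defs and uses across the whole snippet. Returns
// std::nullopt_t when no suitable register is left, an MCOperand when the
// value assigned in the first generated instruction should be reused, or a
// freshly chosen Register.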
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using the same register for multiple uses is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, the important bit: if we have used some register for a def,
    // then we cannot use that same register for *any* use,
    // be it an untied use or a use tied to a def.
    // But def'ing the same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}

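// Fills in all still-unset explicit register operands of IT, using the
// def/use bookkeeping above so that the new instance does not overlap with
// the already generated Instructions. Returns std::nullopt as soon as one
// operand cannot be assigned.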
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);

    if (std::holds_alternative<std::nullopt_t>(R))
      return {};

    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      MCOp = std::get<MCOperand>(R);
    else {
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}

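// Generates as many non-aliasing instances of IT as registers allow. For
// instructions without tied operands a single instance is enough; otherwise
// instances are added until no more independent register assignments exist.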
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg().id());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg().id());
      }
    }
  }
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);

  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;

  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}

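// Emits one code template per applicable strategy. Instructions with no
// register self-aliasing, or with implicit aliasing we cannot work around,
// get a single-instruction template instead.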
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : MCRegister();
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, we don't need to statically unroll to
  // saturate the backedge, so the template we produce will have only a
  // single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm