//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be
// the availability of the CPU resources (e.g. execution ports) needed to
// execute them, instead of the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read from and write to
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write the same operand, e.g. ADC16rr reads from
// and writes to EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

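// Returns true if any of the instruction's variables ties a use operand to a
// def operand, i.e. the instruction reads and writes the same register.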
static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      return true;
  return false;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

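// Assigns each instruction in the snippet a distinct offset into the scratch
// space so that the memory accesses do not alias, then duplicates the
// instructions until at least kMinNumDifferentAddresses addresses are in use.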
void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

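// How registers are picked for explicit register operands while building a
// snippet: draw a fresh random register every time, keep one static register
// per operand position, or reuse a single static register everywhere.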
enum class RegRandomizationStrategy : uint8_t {
  PickRandomRegs,
  SingleStaticRegPerOperand,
  SingleStaticReg,

  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};

} // namespace exegesis

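// Mark RegRandomizationStrategy as iterable so it can be traversed with
// llvm::enum_seq; the specialization has to live in the llvm namespace.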
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};

namespace exegesis {

static const char *getDescription(RegRandomizationStrategy S) {
  switch (S) {
  case RegRandomizationStrategy::PickRandomRegs:
    return "randomizing registers";
  case RegRandomizationStrategy::SingleStaticRegPerOperand:
    return "one unique register for each position";
  case RegRandomizationStrategy::SingleStaticReg:
    return "reusing the same register for all positions";
  }
  llvm_unreachable("Unknown RegRandomizationStrategy enum");
}

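// Picks a register for the explicit register operand Op of IT such that defs
// and uses within the snippet do not overlap. Returns std::nullopt_t when no
// suitable register is left, an MCOperand when reusing the assignment of a
// previously generated instruction, and a Register when a new one was picked.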
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using the same register for multiple uses is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, the important bit: if we have used some register for a def,
    // then we cannot use that same register for *any* use,
    // be it either an untied use, or a use tied to a def.
    // But def-ing the same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}

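// Fills in all still-unassigned explicit register operands of IT using the
// given strategy, recording the picked registers in Uses/Defs. Returns
// std::nullopt if some operand cannot be assigned a register.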
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);

    if (std::holds_alternative<std::nullopt_t>(R))
      return {};

    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      MCOp = std::get<MCOperand>(R);
    else {
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}

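// Generates as many independently register-assigned copies of IT as the
// strategy and the register file allow. Instructions without tied operands
// need only a single copy; otherwise we keep adding copies until we run out
// of registers.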
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);

  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;

  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}

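// Produces one code template per applicable register randomization strategy.
// Instructions that do not self-alias (parallel) or that self-alias through
// implicit operands (serial) short-circuit into a single template that simply
// repeats the given variant.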
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate backedge",
  // and the template we will produce will have only a single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
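  // Pick the strategies worth trying for this instruction: fully random
  // registers only with tied operands or at least three untied use registers,
  // one static register per operand position only with at least two such
  // registers, and a single static register always.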
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm