ParallelSnippetGenerator.cpp source code [llvm_projects/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp]

1	//===-- ParallelSnippetGenerator.cpp ----------------------------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "ParallelSnippetGenerator.h"
10
11	#include "BenchmarkRunner.h"
12	#include "MCInstrDescView.h"
13	#include "Target.h"
14
15	// FIXME: Load constants into registers (e.g. with fld1) to not break
16	// instructions like x87.
17
18	// Ideally we would like the only limitation on executing instructions to be the
19	// availability of the CPU resources (e.g. execution ports) needed to execute
20	// them, instead of the availability of their data dependencies.
21
22	// To achieve that, one approach is to generate instructions that do not have
23	// data dependencies between them.
24	//
25	// For some instructions, this is trivial:
26	// mov rax, qword ptr [rsi]
27	// mov rax, qword ptr [rsi]
28	// mov rax, qword ptr [rsi]
29	// mov rax, qword ptr [rsi]
30	// For the above snippet, haswell just renames rax four times and executes the
31	// four instructions two at a time on P23 and P0126.
32	//
33	// For some instructions, we just need to make sure that the source is
34	// different from the destination. For example, IDIV8r reads from GPR and
35	// writes to AX. We just need to ensure that the Var is assigned a
36	// register which is different from AX:
37	// idiv bx
38	// idiv bx
39	// idiv bx
40	// idiv bx
41	// The above snippet will be able to fully saturate the ports, while the same
42	// with ax would issue one uop every `latency(IDIV8r)` cycles.
43	//
44	// Some instructions make this harder because they both read and write from
45	// the same register:
46	// inc rax
47	// inc rax
48	// inc rax
49	// inc rax
50	// This has a data dependency from each instruction to the next, limit the
51	// number of instructions that can be issued in parallel.
52	// It turns out that this is not a big issue on recent Intel CPUs because they
53	// have heuristics to balance port pressure. In the snippet above, subsequent
54	// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
55	// might end up executing them all on P0 (just because they can), or try
56	// avoiding P5 because it's usually under high pressure from vector
57	// instructions.
58	// This issue is even more important for high-latency instructions because
59	// they increase the idle time of the CPU, e.g. :
60	// imul rax, rbx
61	// imul rax, rbx
62	// imul rax, rbx
63	// imul rax, rbx
64	//
65	// To avoid that, we do the renaming statically by generating as many
66	// independent exclusive assignments as possible (until all possible registers
67	// are exhausted) e.g.:
68	// imul rax, rbx
69	// imul rcx, rbx
70	// imul rdx, rbx
71	// imul r8, rbx
72	//
73	// Some instruction even make the above static renaming impossible because
74	// they implicitly read and write from the same operand, e.g. ADC16rr reads
75	// and writes from EFLAGS.
76	// In that case we just use a greedy register assignment and hope for the
77	// best.
78
79	namespace llvm {
80	namespace exegesis {
81
82	static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
83	SmallVector<const Variable *, `8`> Result;
84	for (const auto &Var : Instr.Variables)
85	if (Var.hasTiedOperands())
86	return true;
87	return false;
88	}
89
90	ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
91
92	void ParallelSnippetGenerator::instantiateMemoryOperands(
93	const unsigned ScratchSpacePointerInReg,
94	std::vector<InstructionTemplate> &Instructions) const {
95	if (ScratchSpacePointerInReg == `0`)
96	return; // no memory operands.
97	const auto &ET = State.getExegesisTarget();
98	const unsigned MemStep = ET.getMaxMemoryAccessSize();
99	const size_t OriginalInstructionsSize = Instructions.size();
100	size_t I = `0`;
101	for (InstructionTemplate &IT : Instructions) {
102	ET.fillMemoryOperands(IT, Reg: ScratchSpacePointerInReg, Offset: I * MemStep);
103	++I;
104	}
105
106	while (Instructions.size() < kMinNumDifferentAddresses) {
107	InstructionTemplate IT = Instructions [I % OriginalInstructionsSize];
108	ET.fillMemoryOperands(IT, Reg: ScratchSpacePointerInReg, Offset: I * MemStep);
109	++I;
110	Instructions.push_back(x: std::move(IT));
111	}
112	assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
113	"not enough scratch space");
114	}
115
116	enum class RegRandomizationStrategy : uint8_t {
117	PickRandomRegs,
118	SingleStaticRegPerOperand,
119	SingleStaticReg,
120
121	FIRST = PickRandomRegs,
122	LAST = SingleStaticReg,
123	};
124
125	} // namespace exegesis
126
127	template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
128	static constexpr bool is_iterable = true;
129	};
130
131	namespace exegesis {
132
133	const char *getDescription(RegRandomizationStrategy S) {
134	switch (S) {
135	case RegRandomizationStrategy::PickRandomRegs:
136	return "randomizing registers";
137	case RegRandomizationStrategy::SingleStaticRegPerOperand:
138	return "one unique register for each position";
139	case RegRandomizationStrategy::SingleStaticReg:
140	return "reusing the same register for all positions";
141	}
142	llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
143	}
144
145	static std::variant<std::nullopt_t, MCOperand, Register>
146	generateSingleRegisterForInstrAvoidingDefUseOverlap(
147	const LLVMState &State, const BitVector &ForbiddenRegisters,
148	const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
149	const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
150	const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
151	RegRandomizationStrategy S) {
152	const Instruction &Instr = IT.getInstr();
153	assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
154	!IT.getValueFor(Op).isValid());
155	assert((!Op.isUse() \|\| !Op.isTied()) &&
156	"Not expecting to see a tied use reg");
157
158	if (Op.isUse()) {
159	switch (S) {
160	case RegRandomizationStrategy::PickRandomRegs:
161	break;
162	case RegRandomizationStrategy::SingleStaticReg:
163	case RegRandomizationStrategy::SingleStaticRegPerOperand: {
164	if (!Instructions.empty())
165	return Instructions.front().getValueFor(Op);
166	if (S != RegRandomizationStrategy::SingleStaticReg)
167	break;
168	BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
169	const BitVector UseAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Uses);
170	if (std::optional<int> CommonBit =
171	getFirstCommonBit(A: PossibleRegisters, B: UseAliases))
172	return *CommonBit;
173	break;
174	}
175	}
176	}
177
178	BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
179	remove(A&: PossibleRegisters, B: ForbiddenRegisters);
180
181	if (Op.isDef()) {
182	remove(A&: PossibleRegisters, B: ImplicitUseAliases);
183	const BitVector UseAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Uses);
184	remove(A&: PossibleRegisters, B: UseAliases);
185	}
186
187	if (Op.isUse()) {
188	remove(A&: PossibleRegisters, B: ImplicitDefAliases);
189	// NOTE: in general, using same reg for multiple Use's is fine.
190	if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
191	const BitVector UseAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Uses);
192	remove(A&: PossibleRegisters, B: UseAliases);
193	}
194	}
195
196	bool IsDefWithTiedUse =
197	Instr.Variables [Op.getVariableIndex()].hasTiedOperands();
198	if (Op.isUse() \|\| IsDefWithTiedUse) {
199	// Now, important bit: if we have used some register for def,
200	// then we can not use that same register for any* use,*
201	// be it either an untied use, or an use tied to a def.
202	// But def-ing same regs is fine, as long as there are no uses!
203	const BitVector DefsAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Defs);
204	remove(A&: PossibleRegisters, B: DefsAliases);
205	}
206
207	if (!PossibleRegisters.any())
208	return std::nullopt;
209
210	return randomBit(Vector: PossibleRegisters);
211	}
212
213	static std::optional<InstructionTemplate>
214	generateSingleSnippetForInstrAvoidingDefUseOverlap(
215	const LLVMState &State, const BitVector &ForbiddenRegisters,
216	const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
217	BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
218	const ArrayRef<InstructionTemplate> Instructions,
219	RegRandomizationStrategy S) {
220	const Instruction &Instr = IT.getInstr();
221	for (const Operand &Op : Instr.Operands) {
222	if (!Op.isReg() \|\| !Op.isExplicit() \|\| Op.isMemory() \|\|
223	IT.getValueFor(Op).isValid())
224	continue;
225	assert((!Op.isUse() \|\| !Op.isTied()) && "Will not get tied uses.");
226
227	std::variant<std::nullopt_t, MCOperand, Register> R =
228	generateSingleRegisterForInstrAvoidingDefUseOverlap(
229	State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
230	Uses, Defs, IT, Op, Instructions, S);
231
232	if (std::holds_alternative<std::nullopt_t>(v: R))
233	return {};
234
235	MCOperand MCOp;
236	if (std::holds_alternative<MCOperand>(v: R))
237	MCOp = std::get<MCOperand>(v&: R);
238	else {
239	Register RandomReg = std::get<Register>(v&: R);
240	if (Op.isDef())
241	Defs.set(RandomReg);
242	if (Op.isUse())
243	Uses.set(RandomReg);
244	MCOp = MCOperand::createReg(Reg: RandomReg);
245	}
246	IT.getValueFor(Op) = MCOp;
247	}
248	return IT;
249	}
250
251	static std::vector<InstructionTemplate>
252	generateSnippetForInstrAvoidingDefUseOverlap(
253	const LLVMState &State, const InstructionTemplate &IT,
254	RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
255	// We don't want to accidentally serialize the instruction,
256	// so we must be sure that we don't pick a def that is an implicit use,
257	// or a use that is an implicit def, so record implicit regs now.
258	BitVector ImplicitUses(State.getRegInfo().getNumRegs());
259	BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
260	for (const auto &Op : IT.getInstr().Operands) {
261	if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
262	assert(Op.isImplicitReg() && "Not an implicit register operand?");
263	if (Op.isUse())
264	ImplicitUses.set(Op.getImplicitReg());
265	else {
266	assert(Op.isDef() && "Not a use and not a def?");
267	ImplicitDefs.set(Op.getImplicitReg());
268	}
269	}
270	}
271	const BitVector ImplicitUseAliases =
272	getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: ImplicitUses);
273	const BitVector ImplicitDefAliases =
274	getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: ImplicitDefs);
275
276	BitVector Defs(State.getRegInfo().getNumRegs());
277	BitVector Uses(State.getRegInfo().getNumRegs());
278	std::vector<InstructionTemplate> Instructions;
279
280	while (true) {
281	std::optional<InstructionTemplate> TmpIT =
282	generateSingleSnippetForInstrAvoidingDefUseOverlap(
283	State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
284	Uses, Defs, IT, Instructions, S);
285	if (!TmpIT)
286	return Instructions;
287	Instructions.push_back(x: std::move(*TmpIT));
288	if (!hasVariablesWithTiedOperands(Instr: IT.getInstr()))
289	return Instructions;
290	assert(Instructions.size() <= `128` && "Stuck in endless loop?");
291	}
292	}
293
294	Expected<std::vector<CodeTemplate>>
295	ParallelSnippetGenerator::generateCodeTemplates(
296	InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
297	const Instruction &Instr = Variant.getInstr();
298	CodeTemplate CT;
299	CT.ScratchSpacePointerInReg =
300	Instr.hasMemoryOperands()
301	? State.getExegesisTarget().getScratchMemoryRegister(
302	State.getTargetMachine().getTargetTriple())
303	: `0`;
304	const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
305	if (SelfAliasing.empty()) {
306	CT.Info = "instruction is parallel, repeating a random one.";
307	CT.Instructions.push_back(x: std::move(Variant));
308	instantiateMemoryOperands(ScratchSpacePointerInReg: CT.ScratchSpacePointerInReg, Instructions&: CT.Instructions);
309	return getSingleton(CT: std::move(CT));
310	}
311	if (SelfAliasing.hasImplicitAliasing()) {
312	CT.Info = "instruction is serial, repeating a random one.";
313	CT.Instructions.push_back(x: std::move(Variant));
314	instantiateMemoryOperands(ScratchSpacePointerInReg: CT.ScratchSpacePointerInReg, Instructions&: CT.Instructions);
315	return getSingleton(CT: std::move(CT));
316	}
317	std::vector<CodeTemplate> Result;
318	bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
319	// If there are no tied operands, then we don't want to "saturate backedge",
320	// and the template we will produce will have only a single instruction.
321	unsigned NumUntiedUseRegs = count_if(Range: Instr.Operands, P: [](const Operand &Op) {
322	return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
323	!Op.isTied();
324	});
325	SmallVector<RegRandomizationStrategy, `3`> Strategies;
326	if (HasTiedOperands \|\| NumUntiedUseRegs >= `3`)
327	Strategies.push_back(Elt: RegRandomizationStrategy::PickRandomRegs);
328	if (NumUntiedUseRegs >= `2`)
329	Strategies.push_back(Elt: RegRandomizationStrategy::SingleStaticRegPerOperand);
330	Strategies.push_back(Elt: RegRandomizationStrategy::SingleStaticReg);
331	for (RegRandomizationStrategy S : Strategies) {
332	CodeTemplate CurrCT = CT.clone();
333	CurrCT.Info =
334	Twine ("instruction has ")
335	.concat(Suffix: HasTiedOperands ? "" : "no ")
336	.concat(Suffix: "tied variables, avoiding "
337	"Read-After-Write issue, picking random def and use "
338	"registers not aliasing each other, for uses, ")
339	.concat(Suffix: getDescription(S))
340	.str();
341	CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
342	State, IT: Variant, S, ForbiddenRegisters);
343	if (CurrCT.Instructions.empty())
344	return make_error<StringError>(
345	Args: Twine ("Failed to produce any snippet via: ").concat(Suffix: CurrCT.Info),
346	Args: inconvertibleErrorCode());
347	instantiateMemoryOperands(ScratchSpacePointerInReg: CurrCT.ScratchSpacePointerInReg,
348	Instructions&: CurrCT.Instructions);
349	Result.push_back(x: std::move(CurrCT));
350	}
351	return Result;
352	}
353
354	constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
355
356	} // namespace exegesis
357	} // namespace llvm
358

Browse the source code of llvm_projects/llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp