1 | //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "ParallelSnippetGenerator.h" |
10 | |
11 | #include "BenchmarkRunner.h" |
12 | #include "MCInstrDescView.h" |
13 | #include "Target.h" |
14 | |
15 | // FIXME: Load constants into registers (e.g. with fld1) to not break |
16 | // instructions like x87. |
17 | |
18 | // Ideally we would like the only limitation on executing instructions to be the |
19 | // availability of the CPU resources (e.g. execution ports) needed to execute |
20 | // them, instead of the availability of their data dependencies. |
21 | |
22 | // To achieve that, one approach is to generate instructions that do not have |
23 | // data dependencies between them. |
24 | // |
25 | // For some instructions, this is trivial: |
26 | // mov rax, qword ptr [rsi] |
27 | // mov rax, qword ptr [rsi] |
28 | // mov rax, qword ptr [rsi] |
29 | // mov rax, qword ptr [rsi] |
30 | // For the above snippet, haswell just renames rax four times and executes the |
31 | // four instructions two at a time on P23 and P0126. |
32 | // |
33 | // For some instructions, we just need to make sure that the source is |
34 | // different from the destination. For example, IDIV8r reads from GPR and |
35 | // writes to AX. We just need to ensure that the Var is assigned a |
36 | // register which is different from AX: |
37 | // idiv bx |
38 | // idiv bx |
39 | // idiv bx |
40 | // idiv bx |
41 | // The above snippet will be able to fully saturate the ports, while the same |
42 | // with ax would issue one uop every `latency(IDIV8r)` cycles. |
43 | // |
44 | // Some instructions make this harder because they both read and write from |
45 | // the same register: |
46 | // inc rax |
47 | // inc rax |
48 | // inc rax |
49 | // inc rax |
50 | // This has a data dependency from each instruction to the next, limit the |
51 | // number of instructions that can be issued in parallel. |
52 | // It turns out that this is not a big issue on recent Intel CPUs because they |
53 | // have heuristics to balance port pressure. In the snippet above, subsequent |
54 | // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs |
55 | // might end up executing them all on P0 (just because they can), or try |
56 | // avoiding P5 because it's usually under high pressure from vector |
57 | // instructions. |
58 | // This issue is even more important for high-latency instructions because |
59 | // they increase the idle time of the CPU, e.g. : |
60 | // imul rax, rbx |
61 | // imul rax, rbx |
62 | // imul rax, rbx |
63 | // imul rax, rbx |
64 | // |
65 | // To avoid that, we do the renaming statically by generating as many |
66 | // independent exclusive assignments as possible (until all possible registers |
67 | // are exhausted) e.g.: |
68 | // imul rax, rbx |
69 | // imul rcx, rbx |
70 | // imul rdx, rbx |
71 | // imul r8, rbx |
72 | // |
73 | // Some instruction even make the above static renaming impossible because |
74 | // they implicitly read and write from the same operand, e.g. ADC16rr reads |
75 | // and writes from EFLAGS. |
76 | // In that case we just use a greedy register assignment and hope for the |
77 | // best. |
78 | |
79 | namespace llvm { |
80 | namespace exegesis { |
81 | |
82 | static bool hasVariablesWithTiedOperands(const Instruction &Instr) { |
83 | SmallVector<const Variable *, 8> Result; |
84 | for (const auto &Var : Instr.Variables) |
85 | if (Var.hasTiedOperands()) |
86 | return true; |
87 | return false; |
88 | } |
89 | |
90 | ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; |
91 | |
92 | void ParallelSnippetGenerator::instantiateMemoryOperands( |
93 | const unsigned ScratchSpacePointerInReg, |
94 | std::vector<InstructionTemplate> &Instructions) const { |
95 | if (ScratchSpacePointerInReg == 0) |
96 | return; // no memory operands. |
97 | const auto &ET = State.getExegesisTarget(); |
98 | const unsigned MemStep = ET.getMaxMemoryAccessSize(); |
99 | const size_t OriginalInstructionsSize = Instructions.size(); |
100 | size_t I = 0; |
101 | for (InstructionTemplate &IT : Instructions) { |
102 | ET.fillMemoryOperands(IT, Reg: ScratchSpacePointerInReg, Offset: I * MemStep); |
103 | ++I; |
104 | } |
105 | |
106 | while (Instructions.size() < kMinNumDifferentAddresses) { |
107 | InstructionTemplate IT = Instructions[I % OriginalInstructionsSize]; |
108 | ET.fillMemoryOperands(IT, Reg: ScratchSpacePointerInReg, Offset: I * MemStep); |
109 | ++I; |
110 | Instructions.push_back(x: std::move(IT)); |
111 | } |
112 | assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize && |
113 | "not enough scratch space" ); |
114 | } |
115 | |
116 | enum class RegRandomizationStrategy : uint8_t { |
117 | PickRandomRegs, |
118 | SingleStaticRegPerOperand, |
119 | SingleStaticReg, |
120 | |
121 | FIRST = PickRandomRegs, |
122 | LAST = SingleStaticReg, |
123 | }; |
124 | |
125 | } // namespace exegesis |
126 | |
127 | template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> { |
128 | static constexpr bool is_iterable = true; |
129 | }; |
130 | |
131 | namespace exegesis { |
132 | |
133 | const char *getDescription(RegRandomizationStrategy S) { |
134 | switch (S) { |
135 | case RegRandomizationStrategy::PickRandomRegs: |
136 | return "randomizing registers" ; |
137 | case RegRandomizationStrategy::SingleStaticRegPerOperand: |
138 | return "one unique register for each position" ; |
139 | case RegRandomizationStrategy::SingleStaticReg: |
140 | return "reusing the same register for all positions" ; |
141 | } |
142 | llvm_unreachable("Unknown UseRegRandomizationStrategy enum" ); |
143 | } |
144 | |
145 | static std::variant<std::nullopt_t, MCOperand, Register> |
146 | generateSingleRegisterForInstrAvoidingDefUseOverlap( |
147 | const LLVMState &State, const BitVector &ForbiddenRegisters, |
148 | const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases, |
149 | const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT, |
150 | const Operand &Op, const ArrayRef<InstructionTemplate> Instructions, |
151 | RegRandomizationStrategy S) { |
152 | const Instruction &Instr = IT.getInstr(); |
153 | assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() && |
154 | !IT.getValueFor(Op).isValid()); |
155 | assert((!Op.isUse() || !Op.isTied()) && |
156 | "Not expecting to see a tied use reg" ); |
157 | |
158 | if (Op.isUse()) { |
159 | switch (S) { |
160 | case RegRandomizationStrategy::PickRandomRegs: |
161 | break; |
162 | case RegRandomizationStrategy::SingleStaticReg: |
163 | case RegRandomizationStrategy::SingleStaticRegPerOperand: { |
164 | if (!Instructions.empty()) |
165 | return Instructions.front().getValueFor(Op); |
166 | if (S != RegRandomizationStrategy::SingleStaticReg) |
167 | break; |
168 | BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits(); |
169 | const BitVector UseAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Uses); |
170 | if (std::optional<int> CommonBit = |
171 | getFirstCommonBit(A: PossibleRegisters, B: UseAliases)) |
172 | return *CommonBit; |
173 | break; |
174 | } |
175 | } |
176 | } |
177 | |
178 | BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits(); |
179 | remove(A&: PossibleRegisters, B: ForbiddenRegisters); |
180 | |
181 | if (Op.isDef()) { |
182 | remove(A&: PossibleRegisters, B: ImplicitUseAliases); |
183 | const BitVector UseAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Uses); |
184 | remove(A&: PossibleRegisters, B: UseAliases); |
185 | } |
186 | |
187 | if (Op.isUse()) { |
188 | remove(A&: PossibleRegisters, B: ImplicitDefAliases); |
189 | // NOTE: in general, using same reg for multiple Use's is fine. |
190 | if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) { |
191 | const BitVector UseAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Uses); |
192 | remove(A&: PossibleRegisters, B: UseAliases); |
193 | } |
194 | } |
195 | |
196 | bool IsDefWithTiedUse = |
197 | Instr.Variables[Op.getVariableIndex()].hasTiedOperands(); |
198 | if (Op.isUse() || IsDefWithTiedUse) { |
199 | // Now, important bit: if we have used some register for def, |
200 | // then we can not use that same register for *any* use, |
201 | // be it either an untied use, or an use tied to a def. |
202 | // But def-ing same regs is fine, as long as there are no uses! |
203 | const BitVector DefsAliases = getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: Defs); |
204 | remove(A&: PossibleRegisters, B: DefsAliases); |
205 | } |
206 | |
207 | if (!PossibleRegisters.any()) |
208 | return std::nullopt; |
209 | |
210 | return randomBit(Vector: PossibleRegisters); |
211 | } |
212 | |
213 | static std::optional<InstructionTemplate> |
214 | generateSingleSnippetForInstrAvoidingDefUseOverlap( |
215 | const LLVMState &State, const BitVector &ForbiddenRegisters, |
216 | const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases, |
217 | BitVector &Uses, BitVector &Defs, InstructionTemplate IT, |
218 | const ArrayRef<InstructionTemplate> Instructions, |
219 | RegRandomizationStrategy S) { |
220 | const Instruction &Instr = IT.getInstr(); |
221 | for (const Operand &Op : Instr.Operands) { |
222 | if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() || |
223 | IT.getValueFor(Op).isValid()) |
224 | continue; |
225 | assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses." ); |
226 | |
227 | std::variant<std::nullopt_t, MCOperand, Register> R = |
228 | generateSingleRegisterForInstrAvoidingDefUseOverlap( |
229 | State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases, |
230 | Uses, Defs, IT, Op, Instructions, S); |
231 | |
232 | if (std::holds_alternative<std::nullopt_t>(v: R)) |
233 | return {}; |
234 | |
235 | MCOperand MCOp; |
236 | if (std::holds_alternative<MCOperand>(v: R)) |
237 | MCOp = std::get<MCOperand>(v&: R); |
238 | else { |
239 | Register RandomReg = std::get<Register>(v&: R); |
240 | if (Op.isDef()) |
241 | Defs.set(RandomReg); |
242 | if (Op.isUse()) |
243 | Uses.set(RandomReg); |
244 | MCOp = MCOperand::createReg(Reg: RandomReg); |
245 | } |
246 | IT.getValueFor(Op) = MCOp; |
247 | } |
248 | return IT; |
249 | } |
250 | |
251 | static std::vector<InstructionTemplate> |
252 | generateSnippetForInstrAvoidingDefUseOverlap( |
253 | const LLVMState &State, const InstructionTemplate &IT, |
254 | RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) { |
255 | // We don't want to accidentally serialize the instruction, |
256 | // so we must be sure that we don't pick a def that is an implicit use, |
257 | // or a use that is an implicit def, so record implicit regs now. |
258 | BitVector ImplicitUses(State.getRegInfo().getNumRegs()); |
259 | BitVector ImplicitDefs(State.getRegInfo().getNumRegs()); |
260 | for (const auto &Op : IT.getInstr().Operands) { |
261 | if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) { |
262 | assert(Op.isImplicitReg() && "Not an implicit register operand?" ); |
263 | if (Op.isUse()) |
264 | ImplicitUses.set(Op.getImplicitReg()); |
265 | else { |
266 | assert(Op.isDef() && "Not a use and not a def?" ); |
267 | ImplicitDefs.set(Op.getImplicitReg()); |
268 | } |
269 | } |
270 | } |
271 | const BitVector ImplicitUseAliases = |
272 | getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: ImplicitUses); |
273 | const BitVector ImplicitDefAliases = |
274 | getAliasedBits(RegInfo: State.getRegInfo(), SourceBits: ImplicitDefs); |
275 | |
276 | BitVector Defs(State.getRegInfo().getNumRegs()); |
277 | BitVector Uses(State.getRegInfo().getNumRegs()); |
278 | std::vector<InstructionTemplate> Instructions; |
279 | |
280 | while (true) { |
281 | std::optional<InstructionTemplate> TmpIT = |
282 | generateSingleSnippetForInstrAvoidingDefUseOverlap( |
283 | State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases, |
284 | Uses, Defs, IT, Instructions, S); |
285 | if (!TmpIT) |
286 | return Instructions; |
287 | Instructions.push_back(x: std::move(*TmpIT)); |
288 | if (!hasVariablesWithTiedOperands(Instr: IT.getInstr())) |
289 | return Instructions; |
290 | assert(Instructions.size() <= 128 && "Stuck in endless loop?" ); |
291 | } |
292 | } |
293 | |
294 | Expected<std::vector<CodeTemplate>> |
295 | ParallelSnippetGenerator::generateCodeTemplates( |
296 | InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { |
297 | const Instruction &Instr = Variant.getInstr(); |
298 | CodeTemplate CT; |
299 | CT.ScratchSpacePointerInReg = |
300 | Instr.hasMemoryOperands() |
301 | ? State.getExegesisTarget().getScratchMemoryRegister( |
302 | State.getTargetMachine().getTargetTriple()) |
303 | : 0; |
304 | const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters); |
305 | if (SelfAliasing.empty()) { |
306 | CT.Info = "instruction is parallel, repeating a random one." ; |
307 | CT.Instructions.push_back(x: std::move(Variant)); |
308 | instantiateMemoryOperands(ScratchSpacePointerInReg: CT.ScratchSpacePointerInReg, Instructions&: CT.Instructions); |
309 | return getSingleton(CT: std::move(CT)); |
310 | } |
311 | if (SelfAliasing.hasImplicitAliasing()) { |
312 | CT.Info = "instruction is serial, repeating a random one." ; |
313 | CT.Instructions.push_back(x: std::move(Variant)); |
314 | instantiateMemoryOperands(ScratchSpacePointerInReg: CT.ScratchSpacePointerInReg, Instructions&: CT.Instructions); |
315 | return getSingleton(CT: std::move(CT)); |
316 | } |
317 | std::vector<CodeTemplate> Result; |
318 | bool HasTiedOperands = hasVariablesWithTiedOperands(Instr); |
319 | // If there are no tied operands, then we don't want to "saturate backedge", |
320 | // and the template we will produce will have only a single instruction. |
321 | unsigned NumUntiedUseRegs = count_if(Range: Instr.Operands, P: [](const Operand &Op) { |
322 | return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() && |
323 | !Op.isTied(); |
324 | }); |
325 | SmallVector<RegRandomizationStrategy, 3> Strategies; |
326 | if (HasTiedOperands || NumUntiedUseRegs >= 3) |
327 | Strategies.push_back(Elt: RegRandomizationStrategy::PickRandomRegs); |
328 | if (NumUntiedUseRegs >= 2) |
329 | Strategies.push_back(Elt: RegRandomizationStrategy::SingleStaticRegPerOperand); |
330 | Strategies.push_back(Elt: RegRandomizationStrategy::SingleStaticReg); |
331 | for (RegRandomizationStrategy S : Strategies) { |
332 | CodeTemplate CurrCT = CT.clone(); |
333 | CurrCT.Info = |
334 | Twine("instruction has " ) |
335 | .concat(Suffix: HasTiedOperands ? "" : "no " ) |
336 | .concat(Suffix: "tied variables, avoiding " |
337 | "Read-After-Write issue, picking random def and use " |
338 | "registers not aliasing each other, for uses, " ) |
339 | .concat(Suffix: getDescription(S)) |
340 | .str(); |
341 | CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap( |
342 | State, IT: Variant, S, ForbiddenRegisters); |
343 | if (CurrCT.Instructions.empty()) |
344 | return make_error<StringError>( |
345 | Args: Twine("Failed to produce any snippet via: " ).concat(Suffix: CurrCT.Info), |
346 | Args: inconvertibleErrorCode()); |
347 | instantiateMemoryOperands(ScratchSpacePointerInReg: CurrCT.ScratchSpacePointerInReg, |
348 | Instructions&: CurrCT.Instructions); |
349 | Result.push_back(x: std::move(CurrCT)); |
350 | } |
351 | return Result; |
352 | } |
353 | |
354 | constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses; |
355 | |
356 | } // namespace exegesis |
357 | } // namespace llvm |
358 | |