//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// snippet with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read from and write to
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write the same operand, e.g. ADC16rr reads from
// and writes to EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

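// Returns true if any of the instruction's variables has tied operands, i.e.
// a use that is constrained to the same register as a def.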
static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      return true;
  return false;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

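// Points the memory operands of each generated instruction at a distinct
// offset within the scratch space, duplicating instructions until at least
// kMinNumDifferentAddresses different addresses are used.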
void ParallelSnippetGenerator::instantiateMemoryOperands(
    const MCRegister ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (!ScratchSpacePointerInReg)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

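// Strategies for assigning explicit register operands when generating
// multiple independent instances of an instruction.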
enum class RegRandomizationStrategy : uint8_t {
  PickRandomRegs,
  SingleStaticRegPerOperand,
  SingleStaticReg,

  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};

} // namespace exegesis

template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};

namespace exegesis {

const char *getDescription(RegRandomizationStrategy S) {
  switch (S) {
  case RegRandomizationStrategy::PickRandomRegs:
    return "randomizing registers";
  case RegRandomizationStrategy::SingleStaticRegPerOperand:
    return "one unique register for each position";
  case RegRandomizationStrategy::SingleStaticReg:
    return "reusing the same register for all positions";
  }
  llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
}

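// Picks a register for the explicit register operand Op of IT. Returns
// std::nullopt when no admissible register remains, an MCOperand when the
// value already chosen for this operand in a previous instance is reused, or
// a Register selected subject to the def/use overlap constraints.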
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using the same reg for multiple uses is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, the important bit: if we have used some register for a def,
    // then we cannot use that same register for *any* use,
    // be it either an untied use, or a use tied to a def.
    // But def-ing the same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}

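// Assigns all still-unset explicit register operands of IT according to
// strategy S, recording the chosen registers in Uses/Defs. Returns
// std::nullopt as soon as one operand cannot be assigned.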
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);

    if (std::holds_alternative<std::nullopt_t>(R))
      return {};

    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      MCOp = std::get<MCOperand>(R);
    else {
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}

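// Instantiates IT repeatedly, choosing registers so that the defs of one
// instance do not overlap the uses of any other. Instructions without tied
// operands only need a single instance; otherwise instances are produced
// until the register supply is exhausted.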
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def; record the implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg().id());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg().id());
      }
    }
  }
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);

  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;

  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}

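// Produces either a single template repeating the given variant (when the
// instruction has no self-aliasing to work around, or only implicit
// self-aliasing that cannot be avoided), or one template per applicable
// register randomization strategy.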
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : MCRegister();
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate the
  // backedge", and the template we produce will have only a single
  // instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm