1 | //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// |
11 | /// The AMDGPUAsmPrinter is used to print both assembly string and also binary |
12 | /// code. When passed an MCAsmStreamer it prints assembly and when passed |
13 | /// an MCObjectStreamer it outputs binary code. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | // |
17 | |
18 | #include "AMDGPUAsmPrinter.h" |
19 | #include "AMDGPU.h" |
20 | #include "AMDGPUHSAMetadataStreamer.h" |
21 | #include "AMDGPUResourceUsageAnalysis.h" |
22 | #include "GCNSubtarget.h" |
23 | #include "MCTargetDesc/AMDGPUInstPrinter.h" |
24 | #include "MCTargetDesc/AMDGPUMCExpr.h" |
25 | #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" |
26 | #include "MCTargetDesc/AMDGPUTargetStreamer.h" |
27 | #include "R600AsmPrinter.h" |
28 | #include "SIMachineFunctionInfo.h" |
29 | #include "TargetInfo/AMDGPUTargetInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "Utils/AMDKernelCodeTUtils.h" |
32 | #include "Utils/SIDefinesUtils.h" |
33 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
34 | #include "llvm/BinaryFormat/ELF.h" |
35 | #include "llvm/CodeGen/MachineFrameInfo.h" |
36 | #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" |
37 | #include "llvm/IR/DiagnosticInfo.h" |
38 | #include "llvm/MC/MCAssembler.h" |
39 | #include "llvm/MC/MCContext.h" |
40 | #include "llvm/MC/MCSectionELF.h" |
41 | #include "llvm/MC/MCStreamer.h" |
42 | #include "llvm/MC/TargetRegistry.h" |
43 | #include "llvm/Support/AMDHSAKernelDescriptor.h" |
44 | #include "llvm/Target/TargetLoweringObjectFile.h" |
45 | #include "llvm/Target/TargetMachine.h" |
46 | #include "llvm/TargetParser/TargetParser.h" |
47 | |
48 | using namespace llvm; |
49 | using namespace llvm::AMDGPU; |
50 | |
51 | // This should get the default rounding mode from the kernel. We just set the |
52 | // default here, but this could change if the OpenCL rounding mode pragmas are |
53 | // used. |
54 | // |
55 | // The denormal mode here should match what is reported by the OpenCL runtime |
56 | // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but |
57 | // can also be override to flush with the -cl-denorms-are-zero compiler flag. |
58 | // |
59 | // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double |
60 | // precision, and leaves single precision to flush all and does not report |
61 | // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports |
62 | // CL_FP_DENORM for both. |
63 | // |
64 | // FIXME: It seems some instructions do not support single precision denormals |
65 | // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, |
66 | // and sin_f32, cos_f32 on most parts). |
67 | |
68 | // We want to use these instructions, and using fp32 denormals also causes |
69 | // instructions to run at the double precision rate for the device so it's |
70 | // probably best to just report no single precision denormals. |
71 | static uint32_t getFPMode(SIModeRegisterDefaults Mode) { |
72 | return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | |
73 | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | |
74 | FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | |
75 | FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); |
76 | } |
77 | |
78 | static AsmPrinter * |
79 | createAMDGPUAsmPrinterPass(TargetMachine &tm, |
80 | std::unique_ptr<MCStreamer> &&Streamer) { |
81 | return new AMDGPUAsmPrinter(tm, std::move(Streamer)); |
82 | } |
83 | |
84 | extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { |
85 | TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(), |
86 | Fn: llvm::createR600AsmPrinterPass); |
87 | TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(), |
88 | Fn: createAMDGPUAsmPrinterPass); |
89 | } |
90 | |
91 | AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, |
92 | std::unique_ptr<MCStreamer> Streamer) |
93 | : AsmPrinter(TM, std::move(Streamer)) { |
94 | assert(OutStreamer && "AsmPrinter constructed without streamer" ); |
95 | } |
96 | |
97 | StringRef AMDGPUAsmPrinter::getPassName() const { |
98 | return "AMDGPU Assembly Printer" ; |
99 | } |
100 | |
101 | const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { |
102 | return TM.getMCSubtargetInfo(); |
103 | } |
104 | |
105 | AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { |
106 | if (!OutStreamer) |
107 | return nullptr; |
108 | return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); |
109 | } |
110 | |
111 | void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { |
112 | IsTargetStreamerInitialized = false; |
113 | } |
114 | |
115 | void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { |
116 | IsTargetStreamerInitialized = true; |
117 | |
118 | // TODO: Which one is called first, emitStartOfAsmFile or |
119 | // emitFunctionBodyStart? |
120 | if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) |
121 | initializeTargetID(M); |
122 | |
123 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA && |
124 | TM.getTargetTriple().getOS() != Triple::AMDPAL) |
125 | return; |
126 | |
127 | getTargetStreamer()->EmitDirectiveAMDGCNTarget(); |
128 | |
129 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
130 | getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( |
131 | COV: CodeObjectVersion); |
132 | HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID()); |
133 | } |
134 | |
135 | if (TM.getTargetTriple().getOS() == Triple::AMDPAL) |
136 | getTargetStreamer()->getPALMetadata()->readFromIR(M); |
137 | } |
138 | |
139 | void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { |
140 | // Init target streamer if it has not yet happened |
141 | if (!IsTargetStreamerInitialized) |
142 | initTargetStreamer(M); |
143 | |
144 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA) |
145 | getTargetStreamer()->EmitISAVersion(); |
146 | |
147 | // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). |
148 | // Emit HSA Metadata (NT_AMD_HSA_METADATA). |
149 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
150 | HSAMetadataStream->end(); |
151 | bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer()); |
152 | (void)Success; |
153 | assert(Success && "Malformed HSA Metadata" ); |
154 | } |
155 | } |
156 | |
157 | void AMDGPUAsmPrinter::emitFunctionBodyStart() { |
158 | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
159 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
160 | const Function &F = MF->getFunction(); |
161 | |
162 | // TODO: We're checking this late, would be nice to check it earlier. |
163 | if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) { |
164 | report_fatal_error( |
165 | reason: STM.getCPU() + " is only available on code object version 6 or better" , |
166 | /*gen_crash_diag*/ false); |
167 | } |
168 | |
169 | // TODO: Which one is called first, emitStartOfAsmFile or |
170 | // emitFunctionBodyStart? |
171 | if (!getTargetStreamer()->getTargetID()) |
172 | initializeTargetID(M: *F.getParent()); |
173 | |
174 | const auto &FunctionTargetID = STM.getTargetID(); |
175 | // Make sure function's xnack settings are compatible with module's |
176 | // xnack settings. |
177 | if (FunctionTargetID.isXnackSupported() && |
178 | FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && |
179 | FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { |
180 | OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) + |
181 | "' function does not match module xnack setting" ); |
182 | return; |
183 | } |
184 | // Make sure function's sramecc settings are compatible with module's |
185 | // sramecc settings. |
186 | if (FunctionTargetID.isSramEccSupported() && |
187 | FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && |
188 | FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { |
189 | OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) + |
190 | "' function does not match module sramecc setting" ); |
191 | return; |
192 | } |
193 | |
194 | if (!MFI.isEntryFunction()) |
195 | return; |
196 | |
197 | if (STM.isMesaKernel(F) && |
198 | (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
199 | F.getCallingConv() == CallingConv::SPIR_KERNEL)) { |
200 | AMDGPUMCKernelCodeT KernelCode; |
201 | getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF); |
202 | KernelCode.validate(STI: &STM, Ctx&: MF->getContext()); |
203 | getTargetStreamer()->EmitAMDKernelCodeT(Header&: KernelCode); |
204 | } |
205 | |
206 | if (STM.isAmdHsaOS()) |
207 | HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo); |
208 | |
209 | if (MFI.getNumKernargPreloadedSGPRs() > 0) { |
210 | assert(AMDGPU::hasKernargPreload(STM)); |
211 | getTargetStreamer()->EmitKernargPreloadHeader(STI: *getGlobalSTI(), |
212 | TrapEnabled: STM.isAmdHsaOS()); |
213 | } |
214 | } |
215 | |
216 | void AMDGPUAsmPrinter::emitFunctionBodyEnd() { |
217 | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
218 | if (!MFI.isEntryFunction()) |
219 | return; |
220 | |
221 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA) |
222 | return; |
223 | |
224 | auto &Streamer = getTargetStreamer()->getStreamer(); |
225 | auto &Context = Streamer.getContext(); |
226 | auto &ObjectFileInfo = *Context.getObjectFileInfo(); |
227 | auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); |
228 | |
229 | Streamer.pushSection(); |
230 | Streamer.switchSection(Section: &ReadOnlySection); |
231 | |
232 | // CP microcode requires the kernel descriptor to be allocated on 64 byte |
233 | // alignment. |
234 | Streamer.emitValueToAlignment(Alignment: Align(64), Value: 0, ValueSize: 1, MaxBytesToEmit: 0); |
235 | ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64)); |
236 | |
237 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
238 | |
239 | SmallString<128> KernelName; |
240 | getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction()); |
241 | getTargetStreamer()->EmitAmdhsaKernelDescriptor( |
242 | STI: STM, KernelName, KernelDescriptor: getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo), |
243 | NextVGPR: CurrentProgramInfo.NumVGPRsForWavesPerEU, |
244 | NextSGPR: MCBinaryExpr::createSub( |
245 | LHS: CurrentProgramInfo.NumSGPRsForWavesPerEU, |
246 | RHS: AMDGPUMCExpr::createExtraSGPRs( |
247 | VCCUsed: CurrentProgramInfo.VCCUsed, FlatScrUsed: CurrentProgramInfo.FlatUsed, |
248 | XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx&: Context), |
249 | Ctx&: Context), |
250 | ReserveVCC: CurrentProgramInfo.VCCUsed, ReserveFlatScr: CurrentProgramInfo.FlatUsed); |
251 | |
252 | Streamer.popSection(); |
253 | } |
254 | |
255 | void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { |
256 | Register RegNo = MI->getOperand(i: 0).getReg(); |
257 | |
258 | SmallString<128> Str; |
259 | raw_svector_ostream OS(Str); |
260 | OS << "implicit-def: " |
261 | << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo()); |
262 | |
263 | if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL) |
264 | OS << " : SGPR spill to VGPR lane" ; |
265 | |
266 | OutStreamer->AddComment(T: OS.str()); |
267 | OutStreamer->addBlankLine(); |
268 | } |
269 | |
270 | void AMDGPUAsmPrinter::emitFunctionEntryLabel() { |
271 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
272 | AsmPrinter::emitFunctionEntryLabel(); |
273 | return; |
274 | } |
275 | |
276 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
277 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
278 | if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(F: MF->getFunction())) { |
279 | SmallString<128> SymbolName; |
280 | getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()), |
281 | getTargetStreamer()->EmitAMDGPUSymbolType( |
282 | SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL); |
283 | } |
284 | if (DumpCodeInstEmitter) { |
285 | // Disassemble function name label to text. |
286 | DisasmLines.push_back(x: MF->getName().str() + ":" ); |
287 | DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size()); |
288 | HexLines.emplace_back(args: "" ); |
289 | } |
290 | |
291 | AsmPrinter::emitFunctionEntryLabel(); |
292 | } |
293 | |
294 | void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { |
295 | if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) { |
296 | // Write a line for the basic block label if it is not only fallthrough. |
297 | DisasmLines.push_back( |
298 | x: (Twine("BB" ) + Twine(getFunctionNumber()) |
299 | + "_" + Twine(MBB.getNumber()) + ":" ).str()); |
300 | DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size()); |
301 | HexLines.emplace_back(args: "" ); |
302 | } |
303 | AsmPrinter::emitBasicBlockStart(MBB); |
304 | } |
305 | |
306 | void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { |
307 | if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
308 | if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) { |
309 | OutContext.reportError(L: {}, |
310 | Msg: Twine(GV->getName()) + |
311 | ": unsupported initializer for address space" ); |
312 | return; |
313 | } |
314 | |
315 | // LDS variables aren't emitted in HSA or PAL yet. |
316 | const Triple::OSType OS = TM.getTargetTriple().getOS(); |
317 | if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) |
318 | return; |
319 | |
320 | MCSymbol *GVSym = getSymbol(GV); |
321 | |
322 | GVSym->redefineIfPossible(); |
323 | if (GVSym->isDefined() || GVSym->isVariable()) |
324 | report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) + |
325 | "' is already defined" ); |
326 | |
327 | const DataLayout &DL = GV->getDataLayout(); |
328 | uint64_t Size = DL.getTypeAllocSize(Ty: GV->getValueType()); |
329 | Align Alignment = GV->getAlign().value_or(u: Align(4)); |
330 | |
331 | emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration()); |
332 | emitLinkage(GV, GVSym); |
333 | auto TS = getTargetStreamer(); |
334 | TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment); |
335 | return; |
336 | } |
337 | |
338 | AsmPrinter::emitGlobalVariable(GV); |
339 | } |
340 | |
341 | bool AMDGPUAsmPrinter::doInitialization(Module &M) { |
342 | CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); |
343 | |
344 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
345 | switch (CodeObjectVersion) { |
346 | case AMDGPU::AMDHSA_COV4: |
347 | HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>(); |
348 | break; |
349 | case AMDGPU::AMDHSA_COV5: |
350 | HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>(); |
351 | break; |
352 | case AMDGPU::AMDHSA_COV6: |
353 | HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>(); |
354 | break; |
355 | default: |
356 | report_fatal_error(reason: "Unexpected code object version" ); |
357 | } |
358 | } |
359 | return AsmPrinter::doInitialization(M); |
360 | } |
361 | |
362 | bool AMDGPUAsmPrinter::doFinalization(Module &M) { |
363 | // Pad with s_code_end to help tools and guard against instruction prefetch |
364 | // causing stale data in caches. Arguably this should be done by the linker, |
365 | // which is why this isn't done for Mesa. |
366 | const MCSubtargetInfo &STI = *getGlobalSTI(); |
367 | if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && |
368 | (STI.getTargetTriple().getOS() == Triple::AMDHSA || |
369 | STI.getTargetTriple().getOS() == Triple::AMDPAL)) { |
370 | OutStreamer->switchSection(Section: getObjFileLowering().getTextSection()); |
371 | getTargetStreamer()->EmitCodeEnd(STI); |
372 | } |
373 | |
374 | return AsmPrinter::doFinalization(M); |
375 | } |
376 | |
377 | // Print comments that apply to both callable functions and entry points. |
378 | void AMDGPUAsmPrinter::( |
379 | uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, |
380 | uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, |
381 | const AMDGPUMachineFunction *MFI) { |
382 | OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false); |
383 | OutStreamer->emitRawComment(T: " NumSgprs: " + Twine(NumSGPR), TabPrefix: false); |
384 | OutStreamer->emitRawComment(T: " NumVgprs: " + Twine(NumVGPR), TabPrefix: false); |
385 | if (NumAGPR) { |
386 | OutStreamer->emitRawComment(T: " NumAgprs: " + Twine(*NumAGPR), TabPrefix: false); |
387 | OutStreamer->emitRawComment(T: " TotalNumVgprs: " + Twine(TotalNumVGPR), |
388 | TabPrefix: false); |
389 | } |
390 | OutStreamer->emitRawComment(T: " ScratchSize: " + Twine(ScratchSize), TabPrefix: false); |
391 | OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()), |
392 | TabPrefix: false); |
393 | } |
394 | |
395 | SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { |
396 | SmallString<128> Str; |
397 | raw_svector_ostream OSS(Str); |
398 | int64_t IVal; |
399 | if (Value->evaluateAsAbsolute(Res&: IVal)) { |
400 | OSS << static_cast<uint64_t>(IVal); |
401 | } else { |
402 | Value->print(OS&: OSS, MAI); |
403 | } |
404 | return Str; |
405 | } |
406 | |
407 | void AMDGPUAsmPrinter::( |
408 | const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, |
409 | const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, |
410 | const AMDGPUMachineFunction *MFI) { |
411 | OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false); |
412 | OutStreamer->emitRawComment(T: " NumSgprs: " + getMCExprStr(Value: NumSGPR), TabPrefix: false); |
413 | OutStreamer->emitRawComment(T: " NumVgprs: " + getMCExprStr(Value: NumVGPR), TabPrefix: false); |
414 | if (NumAGPR && TotalNumVGPR) { |
415 | OutStreamer->emitRawComment(T: " NumAgprs: " + getMCExprStr(Value: NumAGPR), TabPrefix: false); |
416 | OutStreamer->emitRawComment(T: " TotalNumVgprs: " + getMCExprStr(Value: TotalNumVGPR), |
417 | TabPrefix: false); |
418 | } |
419 | OutStreamer->emitRawComment(T: " ScratchSize: " + getMCExprStr(Value: ScratchSize), |
420 | TabPrefix: false); |
421 | OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()), |
422 | TabPrefix: false); |
423 | } |
424 | |
425 | const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( |
426 | const MachineFunction &MF) const { |
427 | const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
428 | MCContext &Ctx = MF.getContext(); |
429 | uint16_t KernelCodeProperties = 0; |
430 | const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); |
431 | |
432 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
433 | KernelCodeProperties |= |
434 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
435 | } |
436 | if (UserSGPRInfo.hasDispatchPtr()) { |
437 | KernelCodeProperties |= |
438 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
439 | } |
440 | if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { |
441 | KernelCodeProperties |= |
442 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
443 | } |
444 | if (UserSGPRInfo.hasKernargSegmentPtr()) { |
445 | KernelCodeProperties |= |
446 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
447 | } |
448 | if (UserSGPRInfo.hasDispatchID()) { |
449 | KernelCodeProperties |= |
450 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
451 | } |
452 | if (UserSGPRInfo.hasFlatScratchInit()) { |
453 | KernelCodeProperties |= |
454 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
455 | } |
456 | if (UserSGPRInfo.hasPrivateSegmentSize()) { |
457 | KernelCodeProperties |= |
458 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; |
459 | } |
460 | if (MF.getSubtarget<GCNSubtarget>().isWave32()) { |
461 | KernelCodeProperties |= |
462 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; |
463 | } |
464 | |
465 | // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be |
466 | // un-evaluatable at this point so it cannot be conditionally checked here. |
467 | // Instead, we'll directly shift the possibly unknown MCExpr into its place |
468 | // and bitwise-or it into KernelCodeProperties. |
469 | const MCExpr *KernelCodePropExpr = |
470 | MCConstantExpr::create(Value: KernelCodeProperties, Ctx); |
471 | const MCExpr *OrValue = MCConstantExpr::create( |
472 | Value: amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx); |
473 | OrValue = MCBinaryExpr::createShl(LHS: CurrentProgramInfo.DynamicCallStack, |
474 | RHS: OrValue, Ctx); |
475 | KernelCodePropExpr = MCBinaryExpr::createOr(LHS: KernelCodePropExpr, RHS: OrValue, Ctx); |
476 | |
477 | return KernelCodePropExpr; |
478 | } |
479 | |
480 | MCKernelDescriptor |
481 | AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, |
482 | const SIProgramInfo &PI) const { |
483 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
484 | const Function &F = MF.getFunction(); |
485 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
486 | MCContext &Ctx = MF.getContext(); |
487 | |
488 | MCKernelDescriptor KernelDescriptor; |
489 | |
490 | KernelDescriptor.group_segment_fixed_size = |
491 | MCConstantExpr::create(Value: PI.LDSSize, Ctx); |
492 | KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; |
493 | |
494 | Align MaxKernArgAlign; |
495 | KernelDescriptor.kernarg_size = MCConstantExpr::create( |
496 | Value: STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign), Ctx); |
497 | |
498 | KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(ST: STM, Ctx); |
499 | KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); |
500 | KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); |
501 | |
502 | int64_t PGRM_Rsrc3 = 1; |
503 | bool EvaluatableRsrc3 = |
504 | CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(Res&: PGRM_Rsrc3); |
505 | (void)PGRM_Rsrc3; |
506 | (void)EvaluatableRsrc3; |
507 | assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 || |
508 | static_cast<uint64_t>(PGRM_Rsrc3) == 0); |
509 | KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; |
510 | |
511 | KernelDescriptor.kernarg_preload = MCConstantExpr::create( |
512 | Value: AMDGPU::hasKernargPreload(STI: STM) ? Info->getNumKernargPreloadedSGPRs() : 0, |
513 | Ctx); |
514 | |
515 | return KernelDescriptor; |
516 | } |
517 | |
518 | bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { |
519 | // Init target streamer lazily on the first function so that previous passes |
520 | // can set metadata. |
521 | if (!IsTargetStreamerInitialized) |
522 | initTargetStreamer(M&: *MF.getFunction().getParent()); |
523 | |
524 | ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); |
525 | CurrentProgramInfo.reset(MF); |
526 | |
527 | const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); |
528 | MCContext &Ctx = MF.getContext(); |
529 | |
530 | // The starting address of all shader programs must be 256 bytes aligned. |
531 | // Regular functions just need the basic required instruction alignment. |
532 | MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); |
533 | |
534 | SetupMachineFunction(MF); |
535 | |
536 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
537 | MCContext &Context = getObjFileLowering().getContext(); |
538 | // FIXME: This should be an explicit check for Mesa. |
539 | if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { |
540 | MCSectionELF *ConfigSection = |
541 | Context.getELFSection(Section: ".AMDGPU.config" , Type: ELF::SHT_PROGBITS, Flags: 0); |
542 | OutStreamer->switchSection(Section: ConfigSection); |
543 | } |
544 | |
545 | if (MFI->isModuleEntryFunction()) { |
546 | getSIProgramInfo(Out&: CurrentProgramInfo, MF); |
547 | } |
548 | |
549 | if (STM.isAmdPalOS()) { |
550 | if (MFI->isEntryFunction()) |
551 | EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo); |
552 | else if (MFI->isModuleEntryFunction()) |
553 | emitPALFunctionMetadata(MF); |
554 | } else if (!STM.isAmdHsaOS()) { |
555 | EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo); |
556 | } |
557 | |
558 | DumpCodeInstEmitter = nullptr; |
559 | if (STM.dumpCode()) { |
560 | // For -dumpcode, get the assembler out of the streamer. This only works |
561 | // with -filetype=obj. |
562 | MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); |
563 | if (Assembler) |
564 | DumpCodeInstEmitter = Assembler->getEmitterPtr(); |
565 | } |
566 | |
567 | DisasmLines.clear(); |
568 | HexLines.clear(); |
569 | DisasmLineMaxLen = 0; |
570 | |
571 | emitFunctionBody(); |
572 | |
573 | emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(), |
574 | hasMAIInsts: STM.hasMAIInsts()); |
575 | |
576 | if (isVerbose()) { |
577 | MCSectionELF * = |
578 | Context.getELFSection(Section: ".AMDGPU.csdata" , Type: ELF::SHT_PROGBITS, Flags: 0); |
579 | OutStreamer->switchSection(Section: CommentSection); |
580 | |
581 | if (!MFI->isEntryFunction()) { |
582 | OutStreamer->emitRawComment(T: " Function info:" , TabPrefix: false); |
583 | const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = |
584 | ResourceUsage->getResourceInfo(F: &MF.getFunction()); |
585 | emitCommonFunctionComments( |
586 | NumVGPR: Info.NumVGPR, |
587 | NumAGPR: STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(), |
588 | TotalNumVGPR: Info.getTotalNumVGPRs(ST: STM), |
589 | NumSGPR: Info.getTotalNumSGPRs(ST: MF.getSubtarget<GCNSubtarget>()), |
590 | ScratchSize: Info.PrivateSegmentSize, CodeSize: getFunctionCodeSize(MF), MFI); |
591 | return false; |
592 | } |
593 | |
594 | OutStreamer->emitRawComment(T: " Kernel info:" , TabPrefix: false); |
595 | emitCommonFunctionComments( |
596 | NumVGPR: CurrentProgramInfo.NumArchVGPR, |
597 | NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr, |
598 | TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR, |
599 | ScratchSize: CurrentProgramInfo.ScratchSize, CodeSize: getFunctionCodeSize(MF), MFI); |
600 | |
601 | OutStreamer->emitRawComment( |
602 | T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false); |
603 | OutStreamer->emitRawComment( |
604 | T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false); |
605 | OutStreamer->emitRawComment( |
606 | T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + |
607 | " bytes/workgroup (compile time only)" , TabPrefix: false); |
608 | |
609 | OutStreamer->emitRawComment( |
610 | T: " SGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.SGPRBlocks), TabPrefix: false); |
611 | |
612 | OutStreamer->emitRawComment( |
613 | T: " VGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.VGPRBlocks), TabPrefix: false); |
614 | |
615 | OutStreamer->emitRawComment( |
616 | T: " NumSGPRsForWavesPerEU: " + |
617 | getMCExprStr(Value: CurrentProgramInfo.NumSGPRsForWavesPerEU), |
618 | TabPrefix: false); |
619 | OutStreamer->emitRawComment( |
620 | T: " NumVGPRsForWavesPerEU: " + |
621 | getMCExprStr(Value: CurrentProgramInfo.NumVGPRsForWavesPerEU), |
622 | TabPrefix: false); |
623 | |
624 | if (STM.hasGFX90AInsts()) { |
625 | const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd( |
626 | LHS: CurrentProgramInfo.AccumOffset, RHS: MCConstantExpr::create(Value: 1, Ctx), Ctx); |
627 | AdjustedAccum = MCBinaryExpr::createMul( |
628 | LHS: AdjustedAccum, RHS: MCConstantExpr::create(Value: 4, Ctx), Ctx); |
629 | OutStreamer->emitRawComment( |
630 | T: " AccumOffset: " + getMCExprStr(Value: AdjustedAccum), TabPrefix: false); |
631 | } |
632 | |
633 | OutStreamer->emitRawComment( |
634 | T: " Occupancy: " + getMCExprStr(Value: CurrentProgramInfo.Occupancy), TabPrefix: false); |
635 | |
636 | OutStreamer->emitRawComment( |
637 | T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false); |
638 | |
639 | OutStreamer->emitRawComment( |
640 | T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + |
641 | getMCExprStr(Value: CurrentProgramInfo.ScratchEnable), |
642 | TabPrefix: false); |
643 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " + |
644 | Twine(CurrentProgramInfo.UserSGPR), |
645 | TabPrefix: false); |
646 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + |
647 | Twine(CurrentProgramInfo.TrapHandlerEnable), |
648 | TabPrefix: false); |
649 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " + |
650 | Twine(CurrentProgramInfo.TGIdXEnable), |
651 | TabPrefix: false); |
652 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + |
653 | Twine(CurrentProgramInfo.TGIdYEnable), |
654 | TabPrefix: false); |
655 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + |
656 | Twine(CurrentProgramInfo.TGIdZEnable), |
657 | TabPrefix: false); |
658 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + |
659 | Twine(CurrentProgramInfo.TIdIGCompCount), |
660 | TabPrefix: false); |
661 | |
662 | [[maybe_unused]] int64_t PGMRSrc3; |
663 | assert(STM.hasGFX90AInsts() || |
664 | (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute( |
665 | PGMRSrc3) && |
666 | static_cast<uint64_t>(PGMRSrc3) == 0)); |
667 | if (STM.hasGFX90AInsts()) { |
668 | OutStreamer->emitRawComment( |
669 | T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + |
670 | getMCExprStr(Value: MCKernelDescriptor::bits_get( |
671 | Src: CurrentProgramInfo.ComputePGMRSrc3GFX90A, |
672 | Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, |
673 | Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)), |
674 | TabPrefix: false); |
675 | OutStreamer->emitRawComment( |
676 | T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + |
677 | getMCExprStr(Value: MCKernelDescriptor::bits_get( |
678 | Src: CurrentProgramInfo.ComputePGMRSrc3GFX90A, |
679 | Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, |
680 | Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)), |
681 | TabPrefix: false); |
682 | } |
683 | } |
684 | |
685 | if (DumpCodeInstEmitter) { |
686 | |
687 | OutStreamer->switchSection( |
688 | Section: Context.getELFSection(Section: ".AMDGPU.disasm" , Type: ELF::SHT_PROGBITS, Flags: 0)); |
689 | |
690 | for (size_t i = 0; i < DisasmLines.size(); ++i) { |
691 | std::string = "\n" ; |
692 | if (!HexLines[i].empty()) { |
693 | Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); |
694 | Comment += " ; " + HexLines[i] + "\n" ; |
695 | } |
696 | |
697 | OutStreamer->emitBytes(Data: StringRef(DisasmLines[i])); |
698 | OutStreamer->emitBytes(Data: StringRef(Comment)); |
699 | } |
700 | } |
701 | |
702 | return false; |
703 | } |
704 | |
705 | // TODO: Fold this into emitFunctionBodyStart. |
706 | void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { |
707 | // In the beginning all features are either 'Any' or 'NotSupported', |
708 | // depending on global target features. This will cover empty modules. |
709 | getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(), |
710 | FeatureString: getGlobalSTI()->getFeatureString()); |
711 | |
712 | // If module is empty, we are done. |
713 | if (M.empty()) |
714 | return; |
715 | |
716 | // If module is not empty, need to find first 'Off' or 'On' feature |
717 | // setting per feature from functions in module. |
718 | for (auto &F : M) { |
719 | auto &TSTargetID = getTargetStreamer()->getTargetID(); |
720 | if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && |
721 | (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) |
722 | break; |
723 | |
724 | const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); |
725 | const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); |
726 | if (TSTargetID->isXnackSupported()) |
727 | if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) |
728 | TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); |
729 | if (TSTargetID->isSramEccSupported()) |
730 | if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) |
731 | TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); |
732 | } |
733 | } |
734 | |
735 | uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { |
736 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
737 | const SIInstrInfo *TII = STM.getInstrInfo(); |
738 | |
739 | uint64_t CodeSize = 0; |
740 | |
741 | for (const MachineBasicBlock &MBB : MF) { |
742 | for (const MachineInstr &MI : MBB) { |
743 | // TODO: CodeSize should account for multiple functions. |
744 | |
745 | // TODO: Should we count size of debug info? |
746 | if (MI.isDebugInstr()) |
747 | continue; |
748 | |
749 | CodeSize += TII->getInstSizeInBytes(MI); |
750 | } |
751 | } |
752 | |
753 | return CodeSize; |
754 | } |
755 | |
756 | void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, |
757 | const MachineFunction &MF) { |
758 | const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = |
759 | ResourceUsage->getResourceInfo(F: &MF.getFunction()); |
760 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
761 | MCContext &Ctx = MF.getContext(); |
762 | |
763 | auto CreateExpr = [&Ctx](int64_t Value) { |
764 | return MCConstantExpr::create(Value, Ctx); |
765 | }; |
766 | |
767 | auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool { |
768 | int64_t Val; |
769 | if (Value->evaluateAsAbsolute(Res&: Val)) { |
770 | Res = Val; |
771 | return true; |
772 | } |
773 | return false; |
774 | }; |
775 | |
776 | ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR); |
777 | ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR); |
778 | ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(ST: STM)); |
779 | ProgInfo.AccumOffset = |
780 | CreateExpr(alignTo(Value: std::max(a: 1, b: Info.NumVGPR), Align: 4) / 4 - 1); |
781 | ProgInfo.TgSplit = STM.isTgSplitEnabled(); |
782 | ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR); |
783 | ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize); |
784 | ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC); |
785 | ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch); |
786 | ProgInfo.DynamicCallStack = |
787 | CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion); |
788 | |
789 | const uint64_t MaxScratchPerWorkitem = |
790 | STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); |
791 | uint64_t ScratchSize; |
792 | if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) && |
793 | ScratchSize > MaxScratchPerWorkitem) { |
794 | DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize, |
795 | MaxScratchPerWorkitem, DS_Error); |
796 | MF.getFunction().getContext().diagnose(DI: DiagStackSize); |
797 | } |
798 | |
799 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
800 | |
801 | // The calculations related to SGPR/VGPR blocks are |
802 | // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be |
803 | // unified. |
804 | const MCExpr * = AMDGPUMCExpr::createExtraSGPRs( |
805 | VCCUsed: ProgInfo.VCCUsed, FlatScrUsed: ProgInfo.FlatUsed, |
806 | XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx); |
807 | |
808 | // Check the addressable register limit before we add ExtraSGPRs. |
809 | if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
810 | !STM.hasSGPRInitBug()) { |
811 | unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); |
812 | uint64_t NumSgpr; |
813 | if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && |
814 | NumSgpr > MaxAddressableNumSGPRs) { |
815 | // This can happen due to a compiler bug or when using inline asm. |
816 | LLVMContext &Ctx = MF.getFunction().getContext(); |
817 | DiagnosticInfoResourceLimit Diag( |
818 | MF.getFunction(), "addressable scalar registers" , NumSgpr, |
819 | MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); |
820 | Ctx.diagnose(DI: Diag); |
821 | ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1); |
822 | } |
823 | } |
824 | |
825 | // Account for extra SGPRs and VGPRs reserved for debugger use. |
826 | ProgInfo.NumSGPR = MCBinaryExpr::createAdd(LHS: ProgInfo.NumSGPR, RHS: ExtraSGPRs, Ctx); |
827 | |
828 | const Function &F = MF.getFunction(); |
829 | |
830 | // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave |
831 | // dispatch registers are function args. |
832 | unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; |
833 | |
834 | if (isShader(CC: F.getCallingConv())) { |
835 | bool IsPixelShader = |
836 | F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); |
837 | |
838 | // Calculate the number of VGPR registers based on the SPI input registers |
839 | uint32_t InputEna = 0; |
840 | uint32_t InputAddr = 0; |
841 | unsigned LastEna = 0; |
842 | |
843 | if (IsPixelShader) { |
844 | // Note for IsPixelShader: |
845 | // By this stage, all enabled inputs are tagged in InputAddr as well. |
846 | // We will use InputAddr to determine whether the input counts against the |
847 | // vgpr total and only use the InputEnable to determine the last input |
848 | // that is relevant - if extra arguments are used, then we have to honour |
849 | // the InputAddr for any intermediate non-enabled inputs. |
850 | InputEna = MFI->getPSInputEnable(); |
851 | InputAddr = MFI->getPSInputAddr(); |
852 | |
853 | // We only need to consider input args up to the last used arg. |
854 | assert((InputEna || InputAddr) && |
855 | "PSInputAddr and PSInputEnable should " |
856 | "never both be 0 for AMDGPU_PS shaders" ); |
857 | // There are some rare circumstances where InputAddr is non-zero and |
858 | // InputEna can be set to 0. In this case we default to setting LastEna |
859 | // to 1. |
860 | LastEna = InputEna ? llvm::Log2_32(Value: InputEna) + 1 : 1; |
861 | } |
862 | |
863 | // FIXME: We should be using the number of registers determined during |
864 | // calling convention lowering to legalize the types. |
865 | const DataLayout &DL = F.getDataLayout(); |
866 | unsigned PSArgCount = 0; |
867 | unsigned IntermediateVGPR = 0; |
868 | for (auto &Arg : F.args()) { |
869 | unsigned NumRegs = (DL.getTypeSizeInBits(Ty: Arg.getType()) + 31) / 32; |
870 | if (Arg.hasAttribute(Kind: Attribute::InReg)) { |
871 | WaveDispatchNumSGPR += NumRegs; |
872 | } else { |
873 | // If this is a PS shader and we're processing the PS Input args (first |
874 | // 16 VGPR), use the InputEna and InputAddr bits to define how many |
875 | // VGPRs are actually used. |
876 | // Any extra VGPR arguments are handled as normal arguments (and |
877 | // contribute to the VGPR count whether they're used or not). |
878 | if (IsPixelShader && PSArgCount < 16) { |
879 | if ((1 << PSArgCount) & InputAddr) { |
880 | if (PSArgCount < LastEna) |
881 | WaveDispatchNumVGPR += NumRegs; |
882 | else |
883 | IntermediateVGPR += NumRegs; |
884 | } |
885 | PSArgCount++; |
886 | } else { |
887 | // If there are extra arguments we have to include the allocation for |
888 | // the non-used (but enabled with InputAddr) input arguments |
889 | if (IntermediateVGPR) { |
890 | WaveDispatchNumVGPR += IntermediateVGPR; |
891 | IntermediateVGPR = 0; |
892 | } |
893 | WaveDispatchNumVGPR += NumRegs; |
894 | } |
895 | } |
896 | } |
897 | ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( |
898 | Args: {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); |
899 | |
900 | ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( |
901 | Args: {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); |
902 | |
903 | ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( |
904 | NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx); |
905 | } |
906 | |
907 | // Adjust number of registers used to meet default/requested minimum/maximum |
908 | // number of waves per execution unit request. |
909 | unsigned MaxWaves = MFI->getMaxWavesPerEU(); |
910 | ProgInfo.NumSGPRsForWavesPerEU = |
911 | AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, CreateExpr(1ul), |
912 | CreateExpr(STM.getMinNumSGPRs(WavesPerEU: MaxWaves))}, |
913 | Ctx); |
914 | ProgInfo.NumVGPRsForWavesPerEU = |
915 | AMDGPUMCExpr::createMax(Args: {ProgInfo.NumVGPR, CreateExpr(1ul), |
916 | CreateExpr(STM.getMinNumVGPRs(WavesPerEU: MaxWaves))}, |
917 | Ctx); |
918 | |
919 | if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || |
920 | STM.hasSGPRInitBug()) { |
921 | unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); |
922 | uint64_t NumSgpr; |
923 | if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && |
924 | NumSgpr > MaxAddressableNumSGPRs) { |
925 | // This can happen due to a compiler bug or when using inline asm to use |
926 | // the registers which are usually reserved for vcc etc. |
927 | LLVMContext &Ctx = MF.getFunction().getContext(); |
928 | DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers" , |
929 | NumSgpr, MaxAddressableNumSGPRs, |
930 | DS_Error, DK_ResourceLimit); |
931 | Ctx.diagnose(DI: Diag); |
932 | ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs); |
933 | ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs); |
934 | } |
935 | } |
936 | |
937 | if (STM.hasSGPRInitBug()) { |
938 | ProgInfo.NumSGPR = |
939 | CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); |
940 | ProgInfo.NumSGPRsForWavesPerEU = |
941 | CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); |
942 | } |
943 | |
944 | if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { |
945 | LLVMContext &Ctx = MF.getFunction().getContext(); |
946 | DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs" , |
947 | MFI->getNumUserSGPRs(), |
948 | STM.getMaxNumUserSGPRs(), DS_Error); |
949 | Ctx.diagnose(DI: Diag); |
950 | } |
951 | |
952 | if (MFI->getLDSSize() > |
953 | static_cast<unsigned>(STM.getAddressableLocalMemorySize())) { |
954 | LLVMContext &Ctx = MF.getFunction().getContext(); |
955 | DiagnosticInfoResourceLimit Diag( |
956 | MF.getFunction(), "local memory" , MFI->getLDSSize(), |
957 | STM.getAddressableLocalMemorySize(), DS_Error); |
958 | Ctx.diagnose(DI: Diag); |
959 | } |
960 | // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks: |
961 | // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1 |
962 | auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR, |
963 | unsigned Granule) { |
964 | const MCExpr *OneConst = CreateExpr(1ul); |
965 | const MCExpr *GranuleConst = CreateExpr(Granule); |
966 | const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax(Args: {NumGPR, OneConst}, Ctx); |
967 | const MCExpr *AlignToGPR = |
968 | AMDGPUMCExpr::createAlignTo(Value: MaxNumGPR, Align: GranuleConst, Ctx); |
969 | const MCExpr *DivGPR = |
970 | MCBinaryExpr::createDiv(LHS: AlignToGPR, RHS: GranuleConst, Ctx); |
971 | const MCExpr *SubGPR = MCBinaryExpr::createSub(LHS: DivGPR, RHS: OneConst, Ctx); |
972 | return SubGPR; |
973 | }; |
974 | |
975 | ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU, |
976 | IsaInfo::getSGPREncodingGranule(STI: &STM)); |
977 | ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU, |
978 | IsaInfo::getVGPREncodingGranule(STI: &STM)); |
979 | |
980 | const SIModeRegisterDefaults Mode = MFI->getMode(); |
981 | |
982 | // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode |
983 | // register. |
984 | ProgInfo.FloatMode = getFPMode(Mode); |
985 | |
986 | ProgInfo.IEEEMode = Mode.IEEE; |
987 | |
988 | // Make clamp modifier on NaN input returns 0. |
989 | ProgInfo.DX10Clamp = Mode.DX10Clamp; |
990 | |
991 | unsigned LDSAlignShift; |
992 | if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { |
993 | // LDS is allocated in 64 dword blocks. |
994 | LDSAlignShift = 8; |
995 | } else { |
996 | // LDS is allocated in 128 dword blocks. |
997 | LDSAlignShift = 9; |
998 | } |
999 | |
1000 | ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); |
1001 | ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); |
1002 | |
1003 | ProgInfo.LDSSize = MFI->getLDSSize(); |
1004 | ProgInfo.LDSBlocks = |
1005 | alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift; |
1006 | |
1007 | // The MCExpr equivalent of divideCeil. |
1008 | auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) { |
1009 | const MCExpr *Ceil = |
1010 | AMDGPUMCExpr::createAlignTo(Value: Numerator, Align: Denominator, Ctx); |
1011 | return MCBinaryExpr::createDiv(LHS: Ceil, RHS: Denominator, Ctx); |
1012 | }; |
1013 | |
1014 | // Scratch is allocated in 64-dword or 256-dword blocks. |
1015 | unsigned ScratchAlignShift = |
1016 | STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; |
1017 | // We need to program the hardware with the amount of scratch memory that |
1018 | // is used by the entire wave. ProgInfo.ScratchSize is the amount of |
1019 | // scratch memory used per thread. |
1020 | ProgInfo.ScratchBlocks = DivideCeil( |
1021 | MCBinaryExpr::createMul(LHS: ProgInfo.ScratchSize, |
1022 | RHS: CreateExpr(STM.getWavefrontSize()), Ctx), |
1023 | CreateExpr(1ULL << ScratchAlignShift)); |
1024 | |
1025 | if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) { |
1026 | ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; |
1027 | ProgInfo.MemOrdered = 1; |
1028 | } |
1029 | |
1030 | // 0 = X, 1 = XY, 2 = XYZ |
1031 | unsigned TIDIGCompCnt = 0; |
1032 | if (MFI->hasWorkItemIDZ()) |
1033 | TIDIGCompCnt = 2; |
1034 | else if (MFI->hasWorkItemIDY()) |
1035 | TIDIGCompCnt = 1; |
1036 | |
1037 | // The private segment wave byte offset is the last of the system SGPRs. We |
1038 | // initially assumed it was allocated, and may have used it. It shouldn't harm |
1039 | // anything to disable it if we know the stack isn't used here. We may still |
1040 | // have emitted code reading it to initialize scratch, but if that's unused |
1041 | // reading garbage should be OK. |
1042 | ProgInfo.ScratchEnable = MCBinaryExpr::createLOr( |
1043 | LHS: MCBinaryExpr::createGT(LHS: ProgInfo.ScratchBlocks, |
1044 | RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx), |
1045 | RHS: ProgInfo.DynamicCallStack, Ctx); |
1046 | |
1047 | ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); |
1048 | // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. |
1049 | ProgInfo.TrapHandlerEnable = |
1050 | STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); |
1051 | ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); |
1052 | ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); |
1053 | ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); |
1054 | ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo(); |
1055 | ProgInfo.TIdIGCompCount = TIDIGCompCnt; |
1056 | ProgInfo.EXCPEnMSB = 0; |
1057 | // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. |
1058 | ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks; |
1059 | ProgInfo.EXCPEnable = 0; |
1060 | |
1061 | if (STM.hasGFX90AInsts()) { |
1062 | // return ((Dst & ~Mask) | (Value << Shift)) |
1063 | auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, |
1064 | uint32_t Shift) { |
1065 | auto Shft = MCConstantExpr::create(Value: Shift, Ctx); |
1066 | auto Msk = MCConstantExpr::create(Value: Mask, Ctx); |
1067 | Dst = MCBinaryExpr::createAnd(LHS: Dst, RHS: MCUnaryExpr::createNot(Expr: Msk, Ctx), Ctx); |
1068 | Dst = MCBinaryExpr::createOr( |
1069 | LHS: Dst, RHS: MCBinaryExpr::createShl(LHS: Value, RHS: Shft, Ctx), Ctx); |
1070 | return Dst; |
1071 | }; |
1072 | |
1073 | ProgInfo.ComputePGMRSrc3GFX90A = |
1074 | SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset, |
1075 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, |
1076 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); |
1077 | ProgInfo.ComputePGMRSrc3GFX90A = |
1078 | SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit), |
1079 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, |
1080 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); |
1081 | } |
1082 | |
1083 | ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( |
1084 | InitOcc: STM.computeOccupancy(F, LDSSize: ProgInfo.LDSSize), NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, |
1085 | NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); |
1086 | |
1087 | const auto [MinWEU, MaxWEU] = |
1088 | AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu" , Default: {0, 0}, OnlyFirstRequired: true); |
1089 | uint64_t Occupancy; |
1090 | if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) { |
1091 | DiagnosticInfoOptimizationFailure Diag( |
1092 | F, F.getSubprogram(), |
1093 | "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " |
1094 | "'" + |
1095 | F.getName() + "': desired occupancy was " + Twine(MinWEU) + |
1096 | ", final occupancy is " + Twine(Occupancy)); |
1097 | F.getContext().diagnose(DI: Diag); |
1098 | } |
1099 | } |
1100 | |
1101 | static unsigned getRsrcReg(CallingConv::ID CallConv) { |
1102 | switch (CallConv) { |
1103 | default: [[fallthrough]]; |
1104 | case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; |
1105 | case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; |
1106 | case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; |
1107 | case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; |
1108 | case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; |
1109 | case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; |
1110 | case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; |
1111 | } |
1112 | } |
1113 | |
1114 | void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, |
1115 | const SIProgramInfo &CurrentProgramInfo) { |
1116 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1117 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1118 | unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv()); |
1119 | MCContext &Ctx = MF.getContext(); |
1120 | |
1121 | // (((Value) & Mask) << Shift) |
1122 | auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) { |
1123 | const MCExpr *msk = MCConstantExpr::create(Value: Mask, Ctx); |
1124 | const MCExpr *shft = MCConstantExpr::create(Value: Shift, Ctx); |
1125 | return MCBinaryExpr::createShl(LHS: MCBinaryExpr::createAnd(LHS: Value, RHS: msk, Ctx), |
1126 | RHS: shft, Ctx); |
1127 | }; |
1128 | |
1129 | auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) { |
1130 | int64_t Val; |
1131 | if (Value->evaluateAsAbsolute(Res&: Val)) |
1132 | OutStreamer->emitIntValue(Value: static_cast<uint64_t>(Val), Size); |
1133 | else |
1134 | OutStreamer->emitValue(Value, Size); |
1135 | }; |
1136 | |
1137 | if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) { |
1138 | OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); |
1139 | |
1140 | EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx), |
1141 | /*Size=*/4); |
1142 | |
1143 | OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); |
1144 | EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4); |
1145 | |
1146 | OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); |
1147 | |
1148 | // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the |
1149 | // appropriate generation. |
1150 | if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { |
1151 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1152 | /*Mask=*/0x3FFFF, /*Shift=*/12), |
1153 | /*Size=*/4); |
1154 | } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { |
1155 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1156 | /*Mask=*/0x7FFF, /*Shift=*/12), |
1157 | /*Size=*/4); |
1158 | } else { |
1159 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1160 | /*Mask=*/0x1FFF, /*Shift=*/12), |
1161 | /*Size=*/4); |
1162 | } |
1163 | |
1164 | // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = |
1165 | // 0" comment but I don't see a corresponding field in the register spec. |
1166 | } else { |
1167 | OutStreamer->emitInt32(Value: RsrcReg); |
1168 | |
1169 | const MCExpr *GPRBlocks = MCBinaryExpr::createOr( |
1170 | LHS: SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0), |
1171 | RHS: SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6), |
1172 | Ctx&: MF.getContext()); |
1173 | EmitResolvedOrExpr(GPRBlocks, /*Size=*/4); |
1174 | OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); |
1175 | |
1176 | // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the |
1177 | // appropriate generation. |
1178 | if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { |
1179 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1180 | /*Mask=*/0x3FFFF, /*Shift=*/12), |
1181 | /*Size=*/4); |
1182 | } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { |
1183 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1184 | /*Mask=*/0x7FFF, /*Shift=*/12), |
1185 | /*Size=*/4); |
1186 | } else { |
1187 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1188 | /*Mask=*/0x1FFF, /*Shift=*/12), |
1189 | /*Size=*/4); |
1190 | } |
1191 | } |
1192 | |
1193 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1194 | OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); |
1195 | unsigned = STM.getGeneration() >= AMDGPUSubtarget::GFX11 |
1196 | ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2) |
1197 | : CurrentProgramInfo.LDSBlocks; |
1198 | OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); |
1199 | OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); |
1200 | OutStreamer->emitInt32(Value: MFI->getPSInputEnable()); |
1201 | OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); |
1202 | OutStreamer->emitInt32(Value: MFI->getPSInputAddr()); |
1203 | } |
1204 | |
1205 | OutStreamer->emitInt32(R_SPILLED_SGPRS); |
1206 | OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs()); |
1207 | OutStreamer->emitInt32(R_SPILLED_VGPRS); |
1208 | OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs()); |
1209 | } |
1210 | |
1211 | // Helper function to add common PAL Metadata 3.0+ |
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST) {
  // IEEE mode is only reported on targets that actually implement it.
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, field: ".ieee_mode" , Val: (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, field: ".wgp_mode" , Val: (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, field: ".mem_ordered" , Val: (bool)CurrentProgramInfo.MemOrdered);

  // Trap/exception-enable fields only apply to compute stages.
  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, field: ".trap_present" ,
                   Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, field: ".excp_en" , Val: CurrentProgramInfo.EXCPEnable);
  }

  // LdsSize is in hardware allocation granules; convert to bytes using the
  // target's dword granularity.
  MD->setHwStage(CC, field: ".lds_size" ,
                 Val: (unsigned)(CurrentProgramInfo.LdsSize *
                                getLdsDwGranularity(ST) * sizeof(uint32_t)));
}
1231 | |
1232 | // This is the equivalent of EmitProgramInfoSI above, but for when the OS type |
1233 | // is AMDPAL. It stores each compute/SPI register setting and other PAL |
1234 | // metadata items into the PALMD::Metadata, combining with any provided by the |
1235 | // frontend as LLVM metadata. Once all functions are written, the PAL metadata |
1236 | // is then written as a single block in the .note section. |
1237 | void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, |
1238 | const SIProgramInfo &CurrentProgramInfo) { |
1239 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1240 | auto CC = MF.getFunction().getCallingConv(); |
1241 | auto MD = getTargetStreamer()->getPALMetadata(); |
1242 | auto &Ctx = MF.getContext(); |
1243 | |
1244 | MD->setEntryPoint(CC, Name: MF.getFunction().getName()); |
1245 | MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx); |
1246 | |
1247 | // Only set AGPRs for supported devices |
1248 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1249 | if (STM.hasMAIInsts()) { |
1250 | MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR); |
1251 | } |
1252 | |
1253 | MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx); |
1254 | if (MD->getPALMajorVersion() < 3) { |
1255 | MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM, Ctx), Ctx); |
1256 | if (AMDGPU::isCompute(CC)) { |
1257 | MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); |
1258 | } else { |
1259 | const MCExpr *HasScratchBlocks = |
1260 | MCBinaryExpr::createGT(LHS: CurrentProgramInfo.ScratchBlocks, |
1261 | RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx); |
1262 | auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN); |
1263 | MD->setRsrc2(CC, Val: maskShiftSet(Val: HasScratchBlocks, Mask, Shift, Ctx), Ctx); |
1264 | } |
1265 | } else { |
1266 | MD->setHwStage(CC, field: ".debug_mode" , Val: (bool)CurrentProgramInfo.DebugMode); |
1267 | MD->setHwStage(CC, field: ".scratch_en" , Type: msgpack::Type::Boolean, |
1268 | Val: CurrentProgramInfo.ScratchEnable); |
1269 | EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM); |
1270 | } |
1271 | |
1272 | // ScratchSize is in bytes, 16 aligned. |
1273 | MD->setScratchSize( |
1274 | CC, |
1275 | Val: AMDGPUMCExpr::createAlignTo(Value: CurrentProgramInfo.ScratchSize, |
1276 | Align: MCConstantExpr::create(Value: 16, Ctx), Ctx), |
1277 | Ctx); |
1278 | |
1279 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1280 | unsigned = STM.getGeneration() >= AMDGPUSubtarget::GFX11 |
1281 | ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2) |
1282 | : CurrentProgramInfo.LDSBlocks; |
1283 | if (MD->getPALMajorVersion() < 3) { |
1284 | MD->setRsrc2( |
1285 | CC, |
1286 | Val: MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx), |
1287 | Ctx); |
1288 | MD->setSpiPsInputEna(MFI->getPSInputEnable()); |
1289 | MD->setSpiPsInputAddr(MFI->getPSInputAddr()); |
1290 | } else { |
1291 | // Graphics registers |
1292 | const unsigned = |
1293 | STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128; |
1294 | MD->setGraphicsRegisters( |
1295 | field: ".ps_extra_lds_size" , |
1296 | Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t))); |
1297 | |
1298 | // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr |
1299 | static StringLiteral const PsInputFields[] = { |
1300 | ".persp_sample_ena" , ".persp_center_ena" , |
1301 | ".persp_centroid_ena" , ".persp_pull_model_ena" , |
1302 | ".linear_sample_ena" , ".linear_center_ena" , |
1303 | ".linear_centroid_ena" , ".line_stipple_tex_ena" , |
1304 | ".pos_x_float_ena" , ".pos_y_float_ena" , |
1305 | ".pos_z_float_ena" , ".pos_w_float_ena" , |
1306 | ".front_face_ena" , ".ancillary_ena" , |
1307 | ".sample_coverage_ena" , ".pos_fixed_pt_ena" }; |
1308 | unsigned PSInputEna = MFI->getPSInputEnable(); |
1309 | unsigned PSInputAddr = MFI->getPSInputAddr(); |
1310 | for (auto [Idx, Field] : enumerate(First: PsInputFields)) { |
1311 | MD->setGraphicsRegisters(field1: ".spi_ps_input_ena" , field2: Field, |
1312 | Val: (bool)((PSInputEna >> Idx) & 1)); |
1313 | MD->setGraphicsRegisters(field1: ".spi_ps_input_addr" , field2: Field, |
1314 | Val: (bool)((PSInputAddr >> Idx) & 1)); |
1315 | } |
1316 | } |
1317 | } |
1318 | |
1319 | // For version 3 and above the wave front size is already set in the metadata |
1320 | if (MD->getPALMajorVersion() < 3 && STM.isWave32()) |
1321 | MD->setWave32(MF.getFunction().getCallingConv()); |
1322 | } |
1323 | |
// Record PAL metadata for a non-entry (callable) function, keyed by the
// function's name: stack size, compute RSRC state, LDS size and register
// counts.
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CC: CallingConv::AMDGPU_CS,
        Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
                 Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    // PAL metadata 3.0+ uses named hardware-stage fields instead of raw
    // RSRC register values.
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST);
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
1348 | |
1349 | // This is supposed to be log2(Size) |
1350 | static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { |
1351 | switch (Size) { |
1352 | case 4: |
1353 | return AMD_ELEMENT_4_BYTES; |
1354 | case 8: |
1355 | return AMD_ELEMENT_8_BYTES; |
1356 | case 16: |
1357 | return AMD_ELEMENT_16_BYTES; |
1358 | default: |
1359 | llvm_unreachable("invalid private_element_size" ); |
1360 | } |
1361 | } |
1362 | |
1363 | void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, |
1364 | const SIProgramInfo &CurrentProgramInfo, |
1365 | const MachineFunction &MF) const { |
1366 | const Function &F = MF.getFunction(); |
1367 | assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
1368 | F.getCallingConv() == CallingConv::SPIR_KERNEL); |
1369 | |
1370 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1371 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1372 | MCContext &Ctx = MF.getContext(); |
1373 | |
1374 | Out.initDefault(STI: &STM, Ctx, /*InitMCExpr=*/false); |
1375 | |
1376 | Out.compute_pgm_resource1_registers = |
1377 | CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx); |
1378 | Out.compute_pgm_resource2_registers = |
1379 | CurrentProgramInfo.getComputePGMRSrc2(Ctx); |
1380 | Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; |
1381 | |
1382 | Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack; |
1383 | |
1384 | AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, |
1385 | getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); |
1386 | |
1387 | const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); |
1388 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
1389 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
1390 | } |
1391 | |
1392 | if (UserSGPRInfo.hasDispatchPtr()) |
1393 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1394 | |
1395 | if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) |
1396 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
1397 | |
1398 | if (UserSGPRInfo.hasKernargSegmentPtr()) |
1399 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
1400 | |
1401 | if (UserSGPRInfo.hasDispatchID()) |
1402 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
1403 | |
1404 | if (UserSGPRInfo.hasFlatScratchInit()) |
1405 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
1406 | |
1407 | if (UserSGPRInfo.hasPrivateSegmentSize()) |
1408 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; |
1409 | |
1410 | if (UserSGPRInfo.hasDispatchPtr()) |
1411 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1412 | |
1413 | if (STM.isXNACKEnabled()) |
1414 | Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; |
1415 | |
1416 | Align MaxKernArgAlign; |
1417 | Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign); |
1418 | Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; |
1419 | Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; |
1420 | Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; |
1421 | Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; |
1422 | |
1423 | // kernarg_segment_alignment is specified as log of the alignment. |
1424 | // The minimum alignment is 16. |
1425 | // FIXME: The metadata treats the minimum as 4? |
1426 | Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign)); |
1427 | } |
1428 | |
1429 | bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, |
1430 | const char *, raw_ostream &O) { |
1431 | // First try the generic code, which knows about modifiers like 'c' and 'n'. |
1432 | if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O)) |
1433 | return false; |
1434 | |
1435 | if (ExtraCode && ExtraCode[0]) { |
1436 | if (ExtraCode[1] != 0) |
1437 | return true; // Unknown modifier. |
1438 | |
1439 | switch (ExtraCode[0]) { |
1440 | case 'r': |
1441 | break; |
1442 | default: |
1443 | return true; |
1444 | } |
1445 | } |
1446 | |
1447 | // TODO: Should be able to support other operand types like globals. |
1448 | const MachineOperand &MO = MI->getOperand(i: OpNo); |
1449 | if (MO.isReg()) { |
1450 | AMDGPUInstPrinter::printRegOperand(RegNo: MO.getReg(), O, |
1451 | MRI: *MF->getSubtarget().getRegisterInfo()); |
1452 | return false; |
1453 | } |
1454 | if (MO.isImm()) { |
1455 | int64_t Val = MO.getImm(); |
1456 | if (AMDGPU::isInlinableIntLiteral(Literal: Val)) { |
1457 | O << Val; |
1458 | } else if (isUInt<16>(x: Val)) { |
1459 | O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val)); |
1460 | } else if (isUInt<32>(x: Val)) { |
1461 | O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val)); |
1462 | } else { |
1463 | O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val)); |
1464 | } |
1465 | return false; |
1466 | } |
1467 | return true; |
1468 | } |
1469 | |
/// Declare this printer's pass dependencies: resource usage (register counts,
/// scratch size, etc.) is computed by AMDGPUResourceUsageAnalysis and must be
/// available before assembly emission.
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  // This pass does not invalidate the analysis results.
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  // Chain to the base class so common AsmPrinter dependencies are declared.
  AsmPrinter::getAnalysisUsage(AU);
}
1475 | |
1476 | void AMDGPUAsmPrinter::( |
1477 | const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, |
1478 | bool isModuleEntryFunction, bool hasMAIInsts) { |
1479 | if (!ORE) |
1480 | return; |
1481 | |
1482 | const char *Name = "kernel-resource-usage" ; |
1483 | const char *Indent = " " ; |
1484 | |
1485 | // If the remark is not specifically enabled, do not output to yaml |
1486 | LLVMContext &Ctx = MF.getFunction().getContext(); |
1487 | if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name)) |
1488 | return; |
1489 | |
1490 | // Currently non-kernel functions have no resources to emit. |
1491 | if (!isEntryFunctionCC(CC: MF.getFunction().getCallingConv())) |
1492 | return; |
1493 | |
1494 | auto = [&](StringRef , |
1495 | StringRef , auto Argument) { |
1496 | // Add an indent for every line besides the line with the kernel name. This |
1497 | // makes it easier to tell which resource usage go with which kernel since |
1498 | // the kernel name will always be displayed first. |
1499 | std::string LabelStr = RemarkLabel.str() + ": " ; |
1500 | if (RemarkName != "FunctionName" ) |
1501 | LabelStr = Indent + LabelStr; |
1502 | |
1503 | ORE->emit([&]() { |
1504 | return MachineOptimizationRemarkAnalysis(Name, RemarkName, |
1505 | MF.getFunction().getSubprogram(), |
1506 | &MF.front()) |
1507 | << LabelStr << ore::NV(RemarkName, Argument); |
1508 | }); |
1509 | }; |
1510 | |
1511 | // FIXME: Formatting here is pretty nasty because clang does not accept |
1512 | // newlines from diagnostics. This forces us to emit multiple diagnostic |
1513 | // remarks to simulate newlines. If and when clang does accept newlines, this |
1514 | // formatting should be aggregated into one remark with newlines to avoid |
1515 | // printing multiple diagnostic location and diag opts. |
1516 | EmitResourceUsageRemark("FunctionName" , "Function Name" , |
1517 | MF.getFunction().getName()); |
1518 | EmitResourceUsageRemark("NumSGPR" , "SGPRs" , |
1519 | getMCExprStr(Value: CurrentProgramInfo.NumSGPR)); |
1520 | EmitResourceUsageRemark("NumVGPR" , "VGPRs" , |
1521 | getMCExprStr(Value: CurrentProgramInfo.NumArchVGPR)); |
1522 | if (hasMAIInsts) { |
1523 | EmitResourceUsageRemark("NumAGPR" , "AGPRs" , |
1524 | getMCExprStr(Value: CurrentProgramInfo.NumAccVGPR)); |
1525 | } |
1526 | EmitResourceUsageRemark("ScratchSize" , "ScratchSize [bytes/lane]" , |
1527 | getMCExprStr(Value: CurrentProgramInfo.ScratchSize)); |
1528 | int64_t DynStack; |
1529 | bool DynStackEvaluatable = |
1530 | CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(Res&: DynStack); |
1531 | StringRef DynamicStackStr = |
1532 | DynStackEvaluatable && DynStack ? "True" : "False" ; |
1533 | EmitResourceUsageRemark("DynamicStack" , "Dynamic Stack" , DynamicStackStr); |
1534 | EmitResourceUsageRemark("Occupancy" , "Occupancy [waves/SIMD]" , |
1535 | getMCExprStr(Value: CurrentProgramInfo.Occupancy)); |
1536 | EmitResourceUsageRemark("SGPRSpill" , "SGPRs Spill" , |
1537 | CurrentProgramInfo.SGPRSpill); |
1538 | EmitResourceUsageRemark("VGPRSpill" , "VGPRs Spill" , |
1539 | CurrentProgramInfo.VGPRSpill); |
1540 | if (isModuleEntryFunction) |
1541 | EmitResourceUsageRemark("BytesLDS" , "LDS Size [bytes/block]" , |
1542 | CurrentProgramInfo.LDSSize); |
1543 | } |
1544 | |