1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
20#include "AMDGPUHSAMetadataStreamer.h"
21#include "AMDGPUMCResourceInfo.h"
22#include "AMDGPUResourceUsageAnalysis.h"
23#include "GCNSubtarget.h"
24#include "MCTargetDesc/AMDGPUInstPrinter.h"
25#include "MCTargetDesc/AMDGPUMCExpr.h"
26#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
27#include "MCTargetDesc/AMDGPUTargetStreamer.h"
28#include "R600AsmPrinter.h"
29#include "SIMachineFunctionInfo.h"
30#include "TargetInfo/AMDGPUTargetInfo.h"
31#include "Utils/AMDGPUBaseInfo.h"
32#include "Utils/AMDKernelCodeTUtils.h"
33#include "Utils/SIDefinesUtils.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/BinaryFormat/ELF.h"
36#include "llvm/CodeGen/MachineFrameInfo.h"
37#include "llvm/CodeGen/MachineModuleInfo.h"
38#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
39#include "llvm/IR/DiagnosticInfo.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
42#include "llvm/MC/MCSectionELF.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
45#include "llvm/MC/TargetRegistry.h"
46#include "llvm/Support/AMDHSAKernelDescriptor.h"
47#include "llvm/Support/Compiler.h"
48#include "llvm/Target/TargetLoweringObjectFile.h"
49#include "llvm/Target/TargetMachine.h"
50#include "llvm/TargetParser/TargetParser.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
75static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
76 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
77 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
78 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
79 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
80}
81
82static AsmPrinter *
83createAMDGPUAsmPrinterPass(TargetMachine &tm,
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
88extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
89LLVMInitializeAMDGPUAsmPrinter() {
90 TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(),
91 Fn: llvm::createR600AsmPrinterPass);
92 TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(),
93 Fn: createAMDGPUAsmPrinterPass);
94}
95
96AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
102StringRef AMDGPUAsmPrinter::getPassName() const {
103 return "AMDGPU Assembly Printer";
104}
105
106const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
107 return TM.getMCSubtargetInfo();
108}
109
110AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
116void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
117 IsTargetStreamerInitialized = false;
118}
119
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
121 IsTargetStreamerInitialized = true;
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
128 if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
129 TM.getTargetTriple().getOS() != Triple::AMDPAL)
130 return;
131
132 getTargetStreamer()->EmitDirectiveAMDGCNTarget();
133
134 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
135 getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
136 COV: CodeObjectVersion);
137 HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID());
138 }
139
140 if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
141 getTargetStreamer()->getPALMetadata()->readFromIR(M);
142}
143
144void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
145 // Init target streamer if it has not yet happened
146 if (!IsTargetStreamerInitialized)
147 initTargetStreamer(M);
148
149 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
150 getTargetStreamer()->EmitISAVersion();
151
152 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
154 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
162void AMDGPUAsmPrinter::emitFunctionBodyStart() {
163 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
169 reportFatalUsageError(
170 reason: STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(M: *F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF);
206 KernelCode.validate(STI: &STM, Ctx&: MF->getContext());
207 getTargetStreamer()->EmitAMDKernelCodeT(Header&: KernelCode);
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo);
212}
213
214void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
215 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216 if (!MFI.isEntryFunction())
217 return;
218
219 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(Section: &ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Alignment: Align(64), Fill: 0, FillLen: 1, MaxBytesToEmit: 0);
233 ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction());
239 getTargetStreamer()->EmitAmdhsaKernelDescriptor(
240 STI: STM, KernelName, KernelDescriptor: getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo),
241 NextVGPR: CurrentProgramInfo.NumVGPRsForWavesPerEU,
242 NextSGPR: MCBinaryExpr::createSub(
243 LHS: CurrentProgramInfo.NumSGPRsForWavesPerEU,
244 RHS: AMDGPUMCExpr::createExtraSGPRs(
245 VCCUsed: CurrentProgramInfo.VCCUsed, FlatScrUsed: CurrentProgramInfo.FlatUsed,
246 XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx&: Context),
247 Ctx&: Context),
248 ReserveVCC: CurrentProgramInfo.VCCUsed, ReserveFlatScr: CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
253void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
254 Register RegNo = MI->getOperand(i: 0).getReg();
255
256 SmallString<128> Str;
257 raw_svector_ostream OS(Str);
258 OS << "implicit-def: "
259 << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(T: OS.str());
265 OutStreamer->addBlankLine();
266}
267
268void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
269 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
270 AsmPrinter::emitFunctionEntryLabel();
271 return;
272 }
273
274 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(F: MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()),
279 getTargetStreamer()->EmitAMDGPUSymbolType(
280 SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(x: MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
286 HexLines.emplace_back(args: "");
287 }
288
289 AsmPrinter::emitFunctionEntryLabel();
290}
291
292void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 x: (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
299 HexLines.emplace_back(args: "");
300 }
301 AsmPrinter::emitBasicBlockStart(MBB);
302}
303
304void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
305 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
306 if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) {
307 OutContext.reportError(L: {},
308 Msg: Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
314 const Triple::OSType OS = TM.getTargetTriple().getOS();
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
326 uint64_t Size = GV->getGlobalSize(DL);
327 Align Alignment = GV->getAlign().value_or(u: Align(4));
328
329 emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment);
333 return;
334 }
335
336 AsmPrinter::emitGlobalVariable(GV);
337}
338
339bool AMDGPUAsmPrinter::doInitialization(Module &M) {
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343 switch (CodeObjectVersion) {
344 case AMDGPU::AMDHSA_COV4:
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
347 case AMDGPU::AMDHSA_COV5:
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
350 case AMDGPU::AMDHSA_COV6:
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError(reason: "unsupported code object version");
355 }
356 }
357
358 return AsmPrinter::doInitialization(M);
359}
360
361/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
362///
363/// Remove dependency on GCNSubtarget and depend only only the necessary values
364/// for said occupancy computation. Should match computeOccupancy implementation
365/// without passing \p STM on.
366const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
367 const MCExpr *NumVGPRs,
368 unsigned DynamicVGPRBlockSize,
369 const GCNSubtarget &STM, MCContext &Ctx) {
370 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(STI: &STM);
371 unsigned Granule = IsaInfo::getVGPRAllocGranule(STI: &STM, DynamicVGPRBlockSize);
372 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(STI: &STM);
373 unsigned Generation = STM.getGeneration();
374
375 auto CreateExpr = [&Ctx](unsigned Value) {
376 return MCConstantExpr::create(Value, Ctx);
377 };
378
379 return AMDGPUMCExpr::create(Kind: AMDGPUMCExpr::AGVK_Occupancy,
380 Args: {CreateExpr(MaxWaves), CreateExpr(Granule),
381 CreateExpr(TargetTotalNumVGPRs),
382 CreateExpr(Generation), CreateExpr(InitOcc),
383 NumSGPRs, NumVGPRs},
384 Ctx);
385}
386
387void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
388 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(CC: F.getCallingConv()))
389 return;
390
391 using RIK = MCResourceInfo::ResourceInfoKind;
392 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
393 MCSymbol *FnSym = TM.getSymbol(GV: &F);
394 bool IsLocal = F.hasLocalLinkage();
395
396 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
397 int64_t Val;
398 if (Value->evaluateAsAbsolute(Res&: Val)) {
399 Res = Val;
400 return true;
401 }
402 return false;
403 };
404
405 const uint64_t MaxScratchPerWorkitem =
406 STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
407 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
408 FuncName: FnSym->getName(), RIK: RIK::RIK_PrivateSegSize, OutContext, IsLocal);
409 uint64_t ScratchSize;
410 if (ScratchSizeSymbol->isVariable() &&
411 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
412 ScratchSize > MaxScratchPerWorkitem) {
413 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
414 DS_Error);
415 F.getContext().diagnose(DI: DiagStackSize);
416 }
417
418 // Validate addressable scalar registers (i.e., prior to added implicit
419 // SGPRs).
420 MCSymbol *NumSGPRSymbol =
421 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext, IsLocal);
422 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
423 !STM.hasSGPRInitBug()) {
424 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
425 uint64_t NumSgpr;
426 if (NumSGPRSymbol->isVariable() &&
427 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
428 NumSgpr > MaxAddressableNumSGPRs) {
429 F.getContext().diagnose(DI: DiagnosticInfoResourceLimit(
430 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
431 DS_Error, DK_ResourceLimit));
432 return;
433 }
434 }
435
436 MCSymbol *VCCUsedSymbol =
437 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext, IsLocal);
438 MCSymbol *FlatUsedSymbol = RI.getSymbol(
439 FuncName: FnSym->getName(), RIK: RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
440 uint64_t VCCUsed, FlatUsed, NumSgpr;
441
442 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
443 FlatUsedSymbol->isVariable() &&
444 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
445 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
446 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
447
448 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
449 // resolvable.
450 NumSgpr += IsaInfo::getNumExtraSGPRs(
451 STI: &STM, VCCUsed, FlatScrUsed: FlatUsed,
452 XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny());
453 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
454 STM.hasSGPRInitBug()) {
455 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
456 if (NumSgpr > MaxAddressableNumSGPRs) {
457 F.getContext().diagnose(DI: DiagnosticInfoResourceLimit(
458 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
459 DK_ResourceLimit));
460 return;
461 }
462 }
463
464 MCSymbol *NumVgprSymbol =
465 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext, IsLocal);
466 MCSymbol *NumAgprSymbol =
467 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext, IsLocal);
468 uint64_t NumVgpr, NumAgpr;
469
470 MachineModuleInfo &MMI =
471 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
472 MachineFunction *MF = MMI.getMachineFunction(F);
473 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
474 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
475 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
476 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
477 unsigned MaxWaves = MFI.getMaxWavesPerEU();
478 uint64_t TotalNumVgpr =
479 getTotalNumVGPRs(has90AInsts: STM.hasGFX90AInsts(), ArgNumAGPR: NumAgpr, ArgNumVGPR: NumVgpr);
480 uint64_t NumVGPRsForWavesPerEU =
481 std::max(l: {TotalNumVgpr, (uint64_t)1,
482 (uint64_t)STM.getMinNumVGPRs(
483 WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize())});
484 uint64_t NumSGPRsForWavesPerEU = std::max(
485 l: {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(WavesPerEU: MaxWaves)});
486 const MCExpr *OccupancyExpr = createOccupancy(
487 InitOcc: STM.getOccupancyWithWorkGroupSizes(MF: *MF).second,
488 NumSGPRs: MCConstantExpr::create(Value: NumSGPRsForWavesPerEU, Ctx&: OutContext),
489 NumVGPRs: MCConstantExpr::create(Value: NumVGPRsForWavesPerEU, Ctx&: OutContext),
490 DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize(), STM, Ctx&: OutContext);
491 uint64_t Occupancy;
492
493 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
494 F, Name: "amdgpu-waves-per-eu", Default: {0, 0}, OnlyFirstRequired: true);
495
496 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
497 DiagnosticInfoOptimizationFailure Diag(
498 F, F.getSubprogram(),
499 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
500 "'" +
501 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
502 ", final occupancy is " + Twine(Occupancy));
503 F.getContext().diagnose(DI: Diag);
504 return;
505 }
506 }
507 }
508}
509
510bool AMDGPUAsmPrinter::doFinalization(Module &M) {
511 // Pad with s_code_end to help tools and guard against instruction prefetch
512 // causing stale data in caches. Arguably this should be done by the linker,
513 // which is why this isn't done for Mesa.
514 // Don't do it if there is no code.
515 const MCSubtargetInfo &STI = *getGlobalSTI();
516 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
517 (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
518 STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
519 MCSection *TextSect = getObjFileLowering().getTextSection();
520 if (TextSect->hasInstructions()) {
521 OutStreamer->switchSection(Section: TextSect);
522 getTargetStreamer()->EmitCodeEnd(STI);
523 }
524 }
525
526 // Assign expressions which can only be resolved when all other functions are
527 // known.
528 RI.finalize(OutContext);
529
530 // Switch section and emit all GPR maximums within the processed module.
531 OutStreamer->pushSection();
532 MCSectionELF *MaxGPRSection =
533 OutContext.getELFSection(Section: ".AMDGPU.gpr_maximums", Type: ELF::SHT_PROGBITS, Flags: 0);
534 OutStreamer->switchSection(Section: MaxGPRSection);
535 getTargetStreamer()->EmitMCResourceMaximums(
536 MaxVGPR: RI.getMaxVGPRSymbol(OutContext), MaxAGPR: RI.getMaxAGPRSymbol(OutContext),
537 MaxSGPR: RI.getMaxSGPRSymbol(OutContext), MaxNamedBarrier: RI.getMaxNamedBarrierSymbol(OutContext));
538 OutStreamer->popSection();
539
540 for (Function &F : M.functions())
541 validateMCResourceInfo(F);
542
543 RI.reset();
544
545 return AsmPrinter::doFinalization(M);
546}
547
548SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
549 SmallString<128> Str;
550 raw_svector_ostream OSS(Str);
551 auto &Streamer = getTargetStreamer()->getStreamer();
552 auto &Context = Streamer.getContext();
553 const MCExpr *New = foldAMDGPUMCExpr(Expr: Value, Ctx&: Context);
554 printAMDGPUMCExpr(Expr: New, OS&: OSS, MAI);
555 return Str;
556}
557
558// Print comments that apply to both callable functions and entry points.
559void AMDGPUAsmPrinter::emitCommonFunctionComments(
560 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
561 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
562 const AMDGPUMachineFunctionInfo *MFI) {
563 OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false);
564 OutStreamer->emitRawComment(T: " TotalNumSgprs: " + getMCExprStr(Value: NumSGPR),
565 TabPrefix: false);
566 OutStreamer->emitRawComment(T: " NumVgprs: " + getMCExprStr(Value: NumVGPR), TabPrefix: false);
567 if (NumAGPR && TotalNumVGPR) {
568 OutStreamer->emitRawComment(T: " NumAgprs: " + getMCExprStr(Value: NumAGPR), TabPrefix: false);
569 OutStreamer->emitRawComment(T: " TotalNumVgprs: " + getMCExprStr(Value: TotalNumVGPR),
570 TabPrefix: false);
571 }
572 OutStreamer->emitRawComment(T: " ScratchSize: " + getMCExprStr(Value: ScratchSize),
573 TabPrefix: false);
574 OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()),
575 TabPrefix: false);
576}
577
578const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
579 const MachineFunction &MF) const {
580 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
581 MCContext &Ctx = MF.getContext();
582 uint16_t KernelCodeProperties = 0;
583 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
584
585 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
586 KernelCodeProperties |=
587 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
588 }
589 if (UserSGPRInfo.hasDispatchPtr()) {
590 KernelCodeProperties |=
591 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
592 }
593 if (UserSGPRInfo.hasQueuePtr()) {
594 KernelCodeProperties |=
595 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
596 }
597 if (UserSGPRInfo.hasKernargSegmentPtr()) {
598 KernelCodeProperties |=
599 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
600 }
601 if (UserSGPRInfo.hasDispatchID()) {
602 KernelCodeProperties |=
603 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
604 }
605 if (UserSGPRInfo.hasFlatScratchInit()) {
606 KernelCodeProperties |=
607 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
608 }
609 if (UserSGPRInfo.hasPrivateSegmentSize()) {
610 KernelCodeProperties |=
611 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
612 }
613 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
614 KernelCodeProperties |=
615 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
616 }
617
618 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
619 // un-evaluatable at this point so it cannot be conditionally checked here.
620 // Instead, we'll directly shift the possibly unknown MCExpr into its place
621 // and bitwise-or it into KernelCodeProperties.
622 const MCExpr *KernelCodePropExpr =
623 MCConstantExpr::create(Value: KernelCodeProperties, Ctx);
624 const MCExpr *OrValue = MCConstantExpr::create(
625 Value: amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
626 OrValue = MCBinaryExpr::createShl(LHS: CurrentProgramInfo.DynamicCallStack,
627 RHS: OrValue, Ctx);
628 KernelCodePropExpr = MCBinaryExpr::createOr(LHS: KernelCodePropExpr, RHS: OrValue, Ctx);
629
630 return KernelCodePropExpr;
631}
632
633MCKernelDescriptor
634AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
635 const SIProgramInfo &PI) const {
636 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
637 const Function &F = MF.getFunction();
638 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
639 MCContext &Ctx = MF.getContext();
640
641 MCKernelDescriptor KernelDescriptor;
642
643 KernelDescriptor.group_segment_fixed_size =
644 MCConstantExpr::create(Value: PI.LDSSize, Ctx);
645 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
646
647 Align MaxKernArgAlign;
648 KernelDescriptor.kernarg_size = MCConstantExpr::create(
649 Value: STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign), Ctx);
650
651 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(ST: STM, Ctx);
652 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
653 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
654
655 int64_t PGM_Rsrc3 = 1;
656 bool EvaluatableRsrc3 =
657 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(Res&: PGM_Rsrc3);
658 (void)PGM_Rsrc3;
659 (void)EvaluatableRsrc3;
660 assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
661 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
662 static_cast<uint64_t>(PGM_Rsrc3) == 0);
663 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
664
665 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
666 Value: AMDGPU::hasKernargPreload(STI: STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
667 Ctx);
668
669 return KernelDescriptor;
670}
671
672bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
673 // Init target streamer lazily on the first function so that previous passes
674 // can set metadata.
675 if (!IsTargetStreamerInitialized)
676 initTargetStreamer(M&: *MF.getFunction().getParent());
677
678 ResourceUsage =
679 &getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo();
680 CurrentProgramInfo.reset(MF);
681
682 const AMDGPUMachineFunctionInfo *MFI =
683 MF.getInfo<AMDGPUMachineFunctionInfo>();
684 MCContext &Ctx = MF.getContext();
685
686 // The starting address of all shader programs must be 256 bytes aligned.
687 // Regular functions just need the basic required instruction alignment.
688 MF.ensureAlignment(A: MFI->isEntryFunction() ? Align(256) : Align(4));
689
690 SetupMachineFunction(MF);
691
692 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
693 MCContext &Context = getObjFileLowering().getContext();
694 bool IsLocal = MF.getFunction().hasLocalLinkage();
695 // FIXME: This should be an explicit check for Mesa.
696 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
697 MCSectionELF *ConfigSection =
698 Context.getELFSection(Section: ".AMDGPU.config", Type: ELF::SHT_PROGBITS, Flags: 0);
699 OutStreamer->switchSection(Section: ConfigSection);
700 }
701
702 RI.gatherResourceInfo(MF, FRI: *ResourceUsage, OutContext);
703
704 if (MFI->isModuleEntryFunction()) {
705 getSIProgramInfo(Out&: CurrentProgramInfo, MF);
706 }
707
708 if (STM.isAmdPalOS()) {
709 if (MFI->isEntryFunction())
710 EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo);
711 else if (MFI->isModuleEntryFunction())
712 emitPALFunctionMetadata(MF);
713 } else if (!STM.isAmdHsaOS()) {
714 EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo);
715 }
716
717 DumpCodeInstEmitter = nullptr;
718 if (STM.dumpCode()) {
719 // For -dumpcode, get the assembler out of the streamer. This only works
720 // with -filetype=obj.
721 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
722 if (Assembler)
723 DumpCodeInstEmitter = Assembler->getEmitterPtr();
724 }
725
726 DisasmLines.clear();
727 HexLines.clear();
728 DisasmLineMaxLen = 0;
729
730 emitFunctionBody();
731
732 emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(),
733 hasMAIInsts: STM.hasMAIInsts());
734
735 {
736 using RIK = MCResourceInfo::ResourceInfoKind;
737 getTargetStreamer()->EmitMCResourceInfo(
738 NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext,
739 IsLocal),
740 NumAGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext,
741 IsLocal),
742 NumExplicitSGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext,
743 IsLocal),
744 NumNamedBarrier: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumNamedBarrier,
745 OutContext, IsLocal),
746 PrivateSegmentSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
747 OutContext, IsLocal),
748 UsesVCC: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext,
749 IsLocal),
750 UsesFlatScratch: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesFlatScratch,
751 OutContext, IsLocal),
752 HasDynamicallySizedStack: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasDynSizedStack,
753 OutContext, IsLocal),
754 HasRecursion: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasRecursion, OutContext,
755 IsLocal),
756 HasIndirectCall: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasIndirectCall,
757 OutContext, IsLocal));
758 }
759
760 // Emit _dvgpr$ symbol when appropriate.
761 emitDVgprSymbol(MF);
762
763 if (isVerbose()) {
764 MCSectionELF *CommentSection =
765 Context.getELFSection(Section: ".AMDGPU.csdata", Type: ELF::SHT_PROGBITS, Flags: 0);
766 OutStreamer->switchSection(Section: CommentSection);
767
768 if (!MFI->isEntryFunction()) {
769 using RIK = MCResourceInfo::ResourceInfoKind;
770 OutStreamer->emitRawComment(T: " Function info:", TabPrefix: false);
771
772 emitCommonFunctionComments(
773 NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext,
774 IsLocal)
775 ->getVariableValue(),
776 NumAGPR: STM.hasMAIInsts()
777 ? RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR,
778 OutContext, IsLocal)
779 ->getVariableValue()
780 : nullptr,
781 TotalNumVGPR: RI.createTotalNumVGPRs(MF, Ctx),
782 NumSGPR: RI.createTotalNumSGPRs(
783 MF,
784 hasXnack: MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
785 Ctx),
786 ScratchSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
787 OutContext, IsLocal)
788 ->getVariableValue(),
789 CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
790 return false;
791 }
792
793 OutStreamer->emitRawComment(T: " Kernel info:", TabPrefix: false);
794 emitCommonFunctionComments(
795 NumVGPR: CurrentProgramInfo.NumArchVGPR,
796 NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
797 TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR,
798 ScratchSize: CurrentProgramInfo.ScratchSize,
799 CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
800
801 OutStreamer->emitRawComment(
802 T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false);
803 OutStreamer->emitRawComment(
804 T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false);
805 OutStreamer->emitRawComment(
806 T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
807 " bytes/workgroup (compile time only)", TabPrefix: false);
808
809 OutStreamer->emitRawComment(
810 T: " SGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.SGPRBlocks), TabPrefix: false);
811
812 OutStreamer->emitRawComment(
813 T: " VGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.VGPRBlocks), TabPrefix: false);
814
815 OutStreamer->emitRawComment(
816 T: " NumSGPRsForWavesPerEU: " +
817 getMCExprStr(Value: CurrentProgramInfo.NumSGPRsForWavesPerEU),
818 TabPrefix: false);
819 OutStreamer->emitRawComment(
820 T: " NumVGPRsForWavesPerEU: " +
821 getMCExprStr(Value: CurrentProgramInfo.NumVGPRsForWavesPerEU),
822 TabPrefix: false);
823
824 if (STM.hasGFX90AInsts()) {
825 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
826 LHS: CurrentProgramInfo.AccumOffset, RHS: MCConstantExpr::create(Value: 1, Ctx), Ctx);
827 AdjustedAccum = MCBinaryExpr::createMul(
828 LHS: AdjustedAccum, RHS: MCConstantExpr::create(Value: 4, Ctx), Ctx);
829 OutStreamer->emitRawComment(
830 T: " AccumOffset: " + getMCExprStr(Value: AdjustedAccum), TabPrefix: false);
831 }
832
833 if (STM.hasGFX1250Insts())
834 OutStreamer->emitRawComment(
835 T: " NamedBarCnt: " + getMCExprStr(Value: CurrentProgramInfo.NamedBarCnt),
836 TabPrefix: false);
837
838 OutStreamer->emitRawComment(
839 T: " Occupancy: " + getMCExprStr(Value: CurrentProgramInfo.Occupancy), TabPrefix: false);
840
841 OutStreamer->emitRawComment(
842 T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false);
843
844 OutStreamer->emitRawComment(
845 T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
846 getMCExprStr(Value: CurrentProgramInfo.ScratchEnable),
847 TabPrefix: false);
848 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " +
849 Twine(CurrentProgramInfo.UserSGPR),
850 TabPrefix: false);
851 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
852 Twine(CurrentProgramInfo.TrapHandlerEnable),
853 TabPrefix: false);
854 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
855 Twine(CurrentProgramInfo.TGIdXEnable),
856 TabPrefix: false);
857 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
858 Twine(CurrentProgramInfo.TGIdYEnable),
859 TabPrefix: false);
860 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
861 Twine(CurrentProgramInfo.TGIdZEnable),
862 TabPrefix: false);
863 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
864 Twine(CurrentProgramInfo.TIdIGCompCount),
865 TabPrefix: false);
866
867 [[maybe_unused]] int64_t PGMRSrc3;
868 assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
869 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
870 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
871 static_cast<uint64_t>(PGMRSrc3) == 0));
872 if (STM.hasGFX90AInsts()) {
873 OutStreamer->emitRawComment(
874 T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
875 getMCExprStr(Value: MCKernelDescriptor::bits_get(
876 Src: CurrentProgramInfo.ComputePGMRSrc3,
877 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
878 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
879 TabPrefix: false);
880 OutStreamer->emitRawComment(
881 T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
882 getMCExprStr(Value: MCKernelDescriptor::bits_get(
883 Src: CurrentProgramInfo.ComputePGMRSrc3,
884 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
885 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
886 TabPrefix: false);
887 }
888 }
889
890 if (DumpCodeInstEmitter) {
891
892 OutStreamer->switchSection(
893 Section: Context.getELFSection(Section: ".AMDGPU.disasm", Type: ELF::SHT_PROGBITS, Flags: 0));
894
895 for (size_t i = 0; i < DisasmLines.size(); ++i) {
896 std::string Comment = "\n";
897 if (!HexLines[i].empty()) {
898 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
899 Comment += " ; " + HexLines[i] + "\n";
900 }
901
902 OutStreamer->emitBytes(Data: StringRef(DisasmLines[i]));
903 OutStreamer->emitBytes(Data: StringRef(Comment));
904 }
905 }
906
907 return false;
908}
909
910// When appropriate, add a _dvgpr$ symbol, with the value of the function
911// symbol, plus an offset encoding one less than the number of VGPR blocks used
912// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
913// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
914// used by a front-end to have functions that are chained rather than called,
915// and a dispatcher that dynamically resizes the VGPR count before dispatching
916// to a function.
917void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
918 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
919 if (MFI.isDynamicVGPREnabled() &&
920 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
921 MCContext &Ctx = MF.getContext();
922 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
923 MCValue NumVGPRs;
924 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
925 Res&: NumVGPRs, Asm: nullptr) ||
926 !NumVGPRs.isAbsolute()) {
927 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
928 }
929 // Calculate number of VGPR blocks.
930 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
931 unsigned NumBlocks =
932 divideCeil(Numerator: std::max(a: unsigned(NumVGPRs.getConstant()), b: 1U), Denominator: BlockSize);
933
934 if (NumBlocks > 8) {
935 OutContext.reportError(L: {},
936 Msg: "too many DVGPR blocks for _dvgpr$ symbol for '" +
937 Twine(CurrentFnSym->getName()) + "'");
938 return;
939 }
940 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
941 // Add to function symbol to create _dvgpr$ symbol.
942 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
943 LHS: MCSymbolRefExpr::create(Symbol: CurrentFnSym, Ctx),
944 RHS: MCConstantExpr::create(Value: EncodedNumBlocks, Ctx), Ctx);
945 MCSymbol *DVgprFuncSym =
946 Ctx.getOrCreateSymbol(Name: Twine("_dvgpr$") + CurrentFnSym->getName());
947 OutStreamer->emitAssignment(Symbol: DVgprFuncSym, Value: DVgprFuncVal);
948 emitVisibility(Sym: DVgprFuncSym, Visibility: MF.getFunction().getVisibility());
949 emitLinkage(GV: &MF.getFunction(), GVSym: DVgprFuncSym);
950 }
951}
952
953// TODO: Fold this into emitFunctionBodyStart.
954void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
955 // In the beginning all features are either 'Any' or 'NotSupported',
956 // depending on global target features. This will cover empty modules.
957 getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(),
958 FeatureString: getGlobalSTI()->getFeatureString());
959
960 // If module is empty, we are done.
961 if (M.empty())
962 return;
963
964 // If module is not empty, need to find first 'Off' or 'On' feature
965 // setting per feature from functions in module.
966 for (auto &F : M) {
967 auto &TSTargetID = getTargetStreamer()->getTargetID();
968 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
969 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
970 break;
971
972 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
973 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
974 if (TSTargetID->isXnackSupported())
975 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
976 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
977 if (TSTargetID->isSramEccSupported())
978 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
979 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
980 }
981}
982
983// AccumOffset computed for the MCExpr equivalent of:
984// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
985static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
986 const MCExpr *ConstFour = MCConstantExpr::create(Value: 4, Ctx);
987 const MCExpr *ConstOne = MCConstantExpr::create(Value: 1, Ctx);
988
989 // Can't be lower than 1 for subsequent alignTo.
990 const MCExpr *MaximumTaken =
991 AMDGPUMCExpr::createMax(Args: {ConstOne, NumVGPR}, Ctx);
992
993 // Practically, it's computing divideCeil(MaximumTaken, 4).
994 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
995 LHS: AMDGPUMCExpr::createAlignTo(Value: MaximumTaken, Align: ConstFour, Ctx), RHS: ConstFour,
996 Ctx);
997
998 return MCBinaryExpr::createSub(LHS: DivCeil, RHS: ConstOne, Ctx);
999}
1000
// Populate \p ProgInfo with the program resource usage for \p MF. Most
// register counts are built as MCExprs over per-function resource symbols so
// the final values can be resolved late (at assembly/MC-layer time) rather
// than here; diagnostics below only fire when an expression already folds to
// a constant.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  MCContext &Ctx = MF.getContext();

  // Wrap a plain integer as an MCConstantExpr in this function's context.
  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  // Try to fold \p Value to a compile-time constant; returns false when it
  // still depends on unresolved symbols.
  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Res&: Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  // Reference the per-function resource-usage symbol of the given kind.
  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym =
        RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK, OutContext, IsLocal);
    return MCSymbolRefExpr::create(Symbol: Sym, Ctx);
  };

  using RIK = MCResourceInfo::ResourceInfoKind;
  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
      NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);

  ProgInfo.AccumOffset = computeAccumOffset(NumVGPR: ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  // A dynamically-sized stack or recursion both imply a dynamic call stack.
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(LHS: GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             RHS: GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

  // Named barriers are counted in blocks of 4 (align up, then divide).
  const MCExpr *BarBlkConst = MCConstantExpr::create(Value: 4, Ctx);
  const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
      Value: GetSymRefExpr(RIK::RIK_NumNamedBarrier), Align: BarBlkConst, Ctx);
  ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(LHS: AlignToBlk, RHS: BarBlkConst, Ctx);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      VCCUsed: ProgInfo.VCCUsed, FlatScrUsed: ProgInfo.FlatUsed,
      XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
      // Clamp to the limit so downstream encoding stays in range.
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(LHS: ProgInfo.NumSGPR, RHS: ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers as function args.
  unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
           WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();

  if (WaveDispatchNumSGPR) {
    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        Args: {ProgInfo.NumSGPR,
         MCBinaryExpr::createAdd(LHS: CreateExpr(WaveDispatchNumSGPR), RHS: ExtraSGPRs,
                                 Ctx)},
        Ctx);
  }

  if (WaveDispatchNumVGPR) {
    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        Args: {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

    // Recompute the combined total now that the arch count changed.
    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(WavesPerEU: MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax(Args: {ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(
                                   WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()))},
                              Ctx);

  // Pre-VI (and init-bug) targets check against the limit *after* ExtraSGPRs
  // were added, unlike the earlier check above.
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
          MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
          DS_Error, DK_ResourceLimit));
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  // With the SGPR init bug, a fixed SGPR count is always reported.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
        MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
        STM.getMaxNumUserSGPRs(), DS_Error));
  }

  if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error));
  }
  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax(Args: {NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(Value: MaxNumGPR, Align: GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(LHS: AlignToGPR, RHS: GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(LHS: DivGPR, RHS: OneConst, Ctx);
    return SubGPR;
  };
  // GFX10+ will always allocate 128 SGPRs and this field must be 0
  if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
    ProgInfo.SGPRBlocks = CreateExpr(0ul);
  } else {
    ProgInfo.SGPRBlocks = GetNumGPRBlocks(
        ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(STI: &STM));
  }
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(STI: &STM));

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  // LDS is counted in blocks whose size depends on the dword granularity of
  // the target; LDSAlignShift is log2 of the block size in bytes.
  unsigned LDSAlignShift = 8;
  switch (getLdsDwGranularity(ST: STM)) {
  case 512:
  case 320:
    LDSAlignShift = 11;
    break;
  case 128:
    LDSAlignShift = 9;
    break;
  case 64:
    LDSAlignShift = 8;
    break;
  default:
    llvm_unreachable("invald LDS block size");
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift;

  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Value: Numerator, Align: Denominator, Ctx);
    return MCBinaryExpr::createDiv(LHS: Ceil, RHS: Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(LHS: ProgInfo.ScratchSize,
                              RHS: CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));

  if (STM.supportsWGP()) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
  }

  if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.MemOrdered = 1;
    ProgInfo.FwdProgress = !F.hasFnAttribute(Kind: "amdgpu-no-fwd-progress");
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      LHS: MCBinaryExpr::createGT(LHS: ProgInfo.ScratchBlocks,
                              RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx),
      RHS: ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  // return ((Dst & ~Mask) | (Value << Shift))
  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                        uint32_t Shift) {
    const auto *Shft = MCConstantExpr::create(Value: Shift, Ctx);
    const auto *Msk = MCConstantExpr::create(Value: Mask, Ctx);
    Dst = MCBinaryExpr::createAnd(LHS: Dst, RHS: MCUnaryExpr::createNot(Expr: Msk, Ctx), Ctx);
    Dst = MCBinaryExpr::createOr(LHS: Dst, RHS: MCBinaryExpr::createShl(LHS: Value, RHS: Shft, Ctx),
                                 Ctx);
    return Dst;
  };

  // Pack target-specific fields into COMPUTE_PGM_RSRC3.
  if (STM.hasGFX90AInsts()) {
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  if (STM.hasGFX1250Insts())
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);

  ProgInfo.Occupancy = createOccupancy(
      InitOcc: STM.computeOccupancy(F, LDSSize: ProgInfo.LDSSize).second,
      NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU,
      DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize(), STM, Ctx);

  // Warn when the resolved occupancy misses a user-requested minimum.
  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default: {0, 0}, OnlyFirstRequired: true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(DI: Diag);
  }

  // GFX11+: record the instruction prefetch size (in 128-byte lines, clamped
  // to the field width) in COMPUTE_PGM_RSRC3.
  if (isGFX11Plus(STI: STM)) {
    uint32_t CodeSizeInBytes = (uint32_t)std::min(
        a: ProgInfo.getFunctionCodeSize(MF, IsLowerBound: true /* IsLowerBound */),
        b: (uint64_t)std::numeric_limits<uint32_t>::max());
    uint32_t CodeSizeInLines = divideCeil(Numerator: CodeSizeInBytes, Denominator: 128);
    uint32_t Field, Shift, Width;
    if (isGFX11(STI: STM)) {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
    } else {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
    }
    uint64_t InstPrefSize = std::min(a: CodeSizeInLines, b: (1u << Width) - 1);
    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
                                       CreateExpr(InstPrefSize), Field, Shift);
  }
}
1330
1331static unsigned getRsrcReg(CallingConv::ID CallConv) {
1332 switch (CallConv) {
1333 default: [[fallthrough]];
1334 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1335 case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1336 case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1337 case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1338 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1339 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1340 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1341 }
1342}
1343
1344void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1345 const SIProgramInfo &CurrentProgramInfo) {
1346 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1347 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1348 unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv());
1349 MCContext &Ctx = MF.getContext();
1350
1351 // (((Value) & Mask) << Shift)
1352 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1353 const MCExpr *msk = MCConstantExpr::create(Value: Mask, Ctx);
1354 const MCExpr *shft = MCConstantExpr::create(Value: Shift, Ctx);
1355 return MCBinaryExpr::createShl(LHS: MCBinaryExpr::createAnd(LHS: Value, RHS: msk, Ctx),
1356 RHS: shft, Ctx);
1357 };
1358
1359 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1360 int64_t Val;
1361 if (Value->evaluateAsAbsolute(Res&: Val))
1362 OutStreamer->emitIntValue(Value: static_cast<uint64_t>(Val), Size);
1363 else
1364 OutStreamer->emitValue(Value, Size);
1365 };
1366
1367 if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
1368 OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
1369
1370 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx),
1371 /*Size=*/4);
1372
1373 OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
1374 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1375
1376 OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
1377
1378 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1379 // appropriate generation.
1380 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1381 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1382 /*Mask=*/0x3FFFF, /*Shift=*/12),
1383 /*Size=*/4);
1384 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1385 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1386 /*Mask=*/0x7FFF, /*Shift=*/12),
1387 /*Size=*/4);
1388 } else {
1389 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1390 /*Mask=*/0x1FFF, /*Shift=*/12),
1391 /*Size=*/4);
1392 }
1393
1394 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1395 // 0" comment but I don't see a corresponding field in the register spec.
1396 } else {
1397 OutStreamer->emitInt32(Value: RsrcReg);
1398
1399 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1400 LHS: SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1401 RHS: SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1402 Ctx&: MF.getContext());
1403 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1404 OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
1405
1406 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1407 // appropriate generation.
1408 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1409 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1410 /*Mask=*/0x3FFFF, /*Shift=*/12),
1411 /*Size=*/4);
1412 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1413 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1414 /*Mask=*/0x7FFF, /*Shift=*/12),
1415 /*Size=*/4);
1416 } else {
1417 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1418 /*Mask=*/0x1FFF, /*Shift=*/12),
1419 /*Size=*/4);
1420 }
1421 }
1422
1423 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1424 OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1425 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1426 ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
1427 : CurrentProgramInfo.LDSBlocks;
1428 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1429 OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1430 OutStreamer->emitInt32(Value: MFI->getPSInputEnable());
1431 OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1432 OutStreamer->emitInt32(Value: MFI->getPSInputAddr());
1433 }
1434
1435 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1436 OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs());
1437 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1438 OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs());
1439}
1440
1441// Helper function to add common PAL Metadata 3.0+
1442static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1443 const SIProgramInfo &CurrentProgramInfo,
1444 CallingConv::ID CC, const GCNSubtarget &ST,
1445 unsigned DynamicVGPRBlockSize) {
1446 if (ST.hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1447 MD->setHwStage(CC, field: ".ieee_mode", Val: (bool)CurrentProgramInfo.IEEEMode);
1448
1449 MD->setHwStage(CC, field: ".wgp_mode", Val: (bool)CurrentProgramInfo.WgpMode);
1450 MD->setHwStage(CC, field: ".mem_ordered", Val: (bool)CurrentProgramInfo.MemOrdered);
1451 MD->setHwStage(CC, field: ".forward_progress", Val: (bool)CurrentProgramInfo.FwdProgress);
1452
1453 if (AMDGPU::isCompute(CC)) {
1454 MD->setHwStage(CC, field: ".trap_present",
1455 Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
1456 MD->setHwStage(CC, field: ".excp_en", Val: CurrentProgramInfo.EXCPEnable);
1457
1458 if (DynamicVGPRBlockSize != 0)
1459 MD->setComputeRegisters(field: ".dynamic_vgpr_en", Val: true);
1460 }
1461
1462 MD->updateHwStageMaximum(
1463 CC, field: ".lds_size",
1464 Val: (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1465 sizeof(uint32_t)));
1466}
1467
1468// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1469// is AMDPAL. It stores each compute/SPI register setting and other PAL
1470// metadata items into the PALMD::Metadata, combining with any provided by the
1471// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1472// is then written as a single block in the .note section.
1473void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1474 const SIProgramInfo &CurrentProgramInfo) {
1475 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1476 auto CC = MF.getFunction().getCallingConv();
1477 auto *MD = getTargetStreamer()->getPALMetadata();
1478 auto &Ctx = MF.getContext();
1479
1480 MD->setEntryPoint(CC, Name: MF.getFunction().getName());
1481 MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1482
1483 // For targets that support dynamic VGPRs, set the number of saved dynamic
1484 // VGPRs (if any) in the PAL metadata.
1485 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1486 if (MFI->isDynamicVGPREnabled() &&
1487 MFI->getScratchReservedForDynamicVGPRs() > 0)
1488 MD->setHwStage(CC, field: ".dynamic_vgpr_saved_count",
1489 Val: MFI->getScratchReservedForDynamicVGPRs() / 4);
1490
1491 // Only set AGPRs for supported devices
1492 if (STM.hasMAIInsts()) {
1493 MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR);
1494 }
1495
1496 MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1497 if (MD->getPALMajorVersion() < 3) {
1498 MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM, Ctx), Ctx);
1499 if (AMDGPU::isCompute(CC)) {
1500 MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1501 } else {
1502 const MCExpr *HasScratchBlocks =
1503 MCBinaryExpr::createGT(LHS: CurrentProgramInfo.ScratchBlocks,
1504 RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx);
1505 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1506 MD->setRsrc2(CC, Val: maskShiftSet(Val: HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1507 }
1508 } else {
1509 MD->setHwStage(CC, field: ".debug_mode", Val: (bool)CurrentProgramInfo.DebugMode);
1510 MD->setHwStage(CC, field: ".scratch_en", Type: msgpack::Type::Boolean,
1511 Val: CurrentProgramInfo.ScratchEnable);
1512 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM,
1513 DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize());
1514 }
1515
1516 // ScratchSize is in bytes, 16 aligned.
1517 MD->setScratchSize(
1518 CC,
1519 Val: AMDGPUMCExpr::createAlignTo(Value: CurrentProgramInfo.ScratchSize,
1520 Align: MCConstantExpr::create(Value: 16, Ctx), Ctx),
1521 Ctx);
1522
1523 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1524 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1525 ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
1526 : CurrentProgramInfo.LDSBlocks;
1527 if (MD->getPALMajorVersion() < 3) {
1528 MD->setRsrc2(
1529 CC,
1530 Val: MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
1531 Ctx);
1532 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1533 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1534 } else {
1535 // Graphics registers
1536 const unsigned ExtraLdsDwGranularity =
1537 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1538 MD->setGraphicsRegisters(
1539 field: ".ps_extra_lds_size",
1540 Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1541
1542 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1543 static StringLiteral const PsInputFields[] = {
1544 ".persp_sample_ena", ".persp_center_ena",
1545 ".persp_centroid_ena", ".persp_pull_model_ena",
1546 ".linear_sample_ena", ".linear_center_ena",
1547 ".linear_centroid_ena", ".line_stipple_tex_ena",
1548 ".pos_x_float_ena", ".pos_y_float_ena",
1549 ".pos_z_float_ena", ".pos_w_float_ena",
1550 ".front_face_ena", ".ancillary_ena",
1551 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1552 unsigned PSInputEna = MFI->getPSInputEnable();
1553 unsigned PSInputAddr = MFI->getPSInputAddr();
1554 for (auto [Idx, Field] : enumerate(First: PsInputFields)) {
1555 MD->setGraphicsRegisters(field1: ".spi_ps_input_ena", field2: Field,
1556 Val: (bool)((PSInputEna >> Idx) & 1));
1557 MD->setGraphicsRegisters(field1: ".spi_ps_input_addr", field2: Field,
1558 Val: (bool)((PSInputAddr >> Idx) & 1));
1559 }
1560 }
1561 }
1562
1563 // For version 3 and above the wave front size is already set in the metadata
1564 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1565 MD->setWave32(MF.getFunction().getCallingConv());
1566}
1567
1568void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1569 auto *MD = getTargetStreamer()->getPALMetadata();
1570 const MachineFrameInfo &MFI = MF.getFrameInfo();
1571 StringRef FnName = MF.getFunction().getName();
1572 MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
1573 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1574 MCContext &Ctx = MF.getContext();
1575
1576 if (MD->getPALMajorVersion() < 3) {
1577 // Set compute registers
1578 MD->setRsrc1(
1579 CC: CallingConv::AMDGPU_CS,
1580 Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1581 MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
1582 Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1583 } else {
1584 EmitPALMetadataCommon(
1585 MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST,
1586 DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1587 }
1588
1589 // Set optional info
1590 MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
1591 MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
1592 MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
1593}
1594
1595// This is supposed to be log2(Size)
1596static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1597 switch (Size) {
1598 case 4:
1599 return AMD_ELEMENT_4_BYTES;
1600 case 8:
1601 return AMD_ELEMENT_8_BYTES;
1602 case 16:
1603 return AMD_ELEMENT_16_BYTES;
1604 default:
1605 llvm_unreachable("invalid private_element_size");
1606 }
1607}
1608
1609void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1610 const SIProgramInfo &CurrentProgramInfo,
1611 const MachineFunction &MF) const {
1612 const Function &F = MF.getFunction();
1613 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1614 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1615
1616 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1617 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1618 MCContext &Ctx = MF.getContext();
1619
1620 Out.initDefault(STI: &STM, Ctx, /*InitMCExpr=*/false);
1621
1622 Out.compute_pgm_resource1_registers =
1623 CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx);
1624 Out.compute_pgm_resource2_registers =
1625 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1626 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1627
1628 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1629
1630 AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1631 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1632
1633 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1634 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1635 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1636 }
1637
1638 if (UserSGPRInfo.hasDispatchPtr())
1639 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1640
1641 if (UserSGPRInfo.hasQueuePtr())
1642 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1643
1644 if (UserSGPRInfo.hasKernargSegmentPtr())
1645 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1646
1647 if (UserSGPRInfo.hasDispatchID())
1648 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1649
1650 if (UserSGPRInfo.hasFlatScratchInit())
1651 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1652
1653 if (UserSGPRInfo.hasPrivateSegmentSize())
1654 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
1655
1656 if (STM.isXNACKEnabled())
1657 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1658
1659 Align MaxKernArgAlign;
1660 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign);
1661 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1662 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1663 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1664 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1665
1666 // kernarg_segment_alignment is specified as log of the alignment.
1667 // The minimum alignment is 16.
1668 // FIXME: The metadata treats the minimum as 4?
1669 Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign));
1670}
1671
1672bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1673 const char *ExtraCode, raw_ostream &O) {
1674 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1675 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O))
1676 return false;
1677
1678 if (ExtraCode && ExtraCode[0]) {
1679 if (ExtraCode[1] != 0)
1680 return true; // Unknown modifier.
1681
1682 switch (ExtraCode[0]) {
1683 case 'r':
1684 break;
1685 default:
1686 return true;
1687 }
1688 }
1689
1690 // TODO: Should be able to support other operand types like globals.
1691 const MachineOperand &MO = MI->getOperand(i: OpNo);
1692 if (MO.isReg()) {
1693 AMDGPUInstPrinter::printRegOperand(Reg: MO.getReg(), O,
1694 MRI: *MF->getSubtarget().getRegisterInfo());
1695 return false;
1696 }
1697 if (MO.isImm()) {
1698 int64_t Val = MO.getImm();
1699 if (AMDGPU::isInlinableIntLiteral(Literal: Val)) {
1700 O << Val;
1701 } else if (isUInt<16>(x: Val)) {
1702 O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val));
1703 } else if (isUInt<32>(x: Val)) {
1704 O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val));
1705 } else {
1706 O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val));
1707 }
1708 return false;
1709 }
1710 return true;
1711}
1712
// Declare the analyses this pass depends on: the AMDGPU resource usage
// analysis and MachineModuleInfo, both required and preserved, on top of the
// base AsmPrinter's own requirements.
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addPreserved<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addRequired<MachineModuleInfoWrapperPass>();
  AU.addPreserved<MachineModuleInfoWrapperPass>();
  AsmPrinter::getAnalysisUsage(AU);
}
1720
1721void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1722 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1723 bool isModuleEntryFunction, bool hasMAIInsts) {
1724 if (!ORE)
1725 return;
1726
1727 const char *Name = "kernel-resource-usage";
1728 const char *Indent = " ";
1729
1730 // If the remark is not specifically enabled, do not output to yaml
1731 LLVMContext &Ctx = MF.getFunction().getContext();
1732 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name))
1733 return;
1734
1735 // Currently non-kernel functions have no resources to emit.
1736 if (!isEntryFunctionCC(CC: MF.getFunction().getCallingConv()))
1737 return;
1738
1739 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1740 StringRef RemarkLabel, auto Argument) {
1741 // Add an indent for every line besides the line with the kernel name. This
1742 // makes it easier to tell which resource usage go with which kernel since
1743 // the kernel name will always be displayed first.
1744 std::string LabelStr = RemarkLabel.str() + ": ";
1745 if (RemarkName != "FunctionName")
1746 LabelStr = Indent + LabelStr;
1747
1748 ORE->emit([&]() {
1749 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1750 MF.getFunction().getSubprogram(),
1751 &MF.front())
1752 << LabelStr << ore::NV(RemarkName, Argument);
1753 });
1754 };
1755
1756 // FIXME: Formatting here is pretty nasty because clang does not accept
1757 // newlines from diagnostics. This forces us to emit multiple diagnostic
1758 // remarks to simulate newlines. If and when clang does accept newlines, this
1759 // formatting should be aggregated into one remark with newlines to avoid
1760 // printing multiple diagnostic location and diag opts.
1761 EmitResourceUsageRemark("FunctionName", "Function Name",
1762 MF.getFunction().getName());
1763 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1764 getMCExprStr(Value: CurrentProgramInfo.NumSGPR));
1765 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1766 getMCExprStr(Value: CurrentProgramInfo.NumArchVGPR));
1767 if (hasMAIInsts) {
1768 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1769 getMCExprStr(Value: CurrentProgramInfo.NumAccVGPR));
1770 }
1771 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1772 getMCExprStr(Value: CurrentProgramInfo.ScratchSize));
1773 int64_t DynStack;
1774 bool DynStackEvaluatable =
1775 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(Res&: DynStack);
1776 StringRef DynamicStackStr =
1777 DynStackEvaluatable && DynStack ? "True" : "False";
1778 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1779 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1780 getMCExprStr(Value: CurrentProgramInfo.Occupancy));
1781 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1782 CurrentProgramInfo.SGPRSpill);
1783 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1784 CurrentProgramInfo.VGPRSpill);
1785 if (isModuleEntryFunction)
1786 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1787 CurrentProgramInfo.LDSSize);
1788}
1789
// Unique address used by LLVM's pass infrastructure to identify this pass.
char AMDGPUAsmPrinter::ID = 0;

// Register the pass under the command-line name "amdgpu-asm-printer".
INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
                "AMDGPU Assembly Printer", false, false)
1794