//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both an assembly string and binary
/// code. When passed an MCAsmStreamer it prints assembly, and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUMCResourceInfo.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "Utils/SIDefinesUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device, so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
         FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}
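
// Example (assuming the SIDefines.h encodings: round modes in bits 3:0, denorm
// modes in bits 7:4): with round-to-nearest (FP_ROUND_ROUND_TO_NEAREST == 0)
// and IEEE denormals enabled in both single and double precision (denorm mode
// value 3), getFPMode packs to (3 << 4) | (3 << 6) == 0xf0.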

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer *AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  IsTargetStreamerInitialized = false;
}

void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  getTargetStreamer()->EmitDirectiveAMDGCNTarget();

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
        CodeObjectVersion);
    HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
  }

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);
}

void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Init the target streamer if it has not yet happened.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    getTargetStreamer()->EmitISAVersion();

  // Emit HSA Metadata (NT_AMD_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
    reportFatalUsageError(
        STM.getCPU() + " is only available on code object version 6 or better");
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(*F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure the function's xnack settings are compatible with the module's
  // xnack settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() !=
          getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
                                   "' function does not match module xnack setting");
    return;
  }
  // Make sure the function's sramecc settings are compatible with the module's
  // sramecc settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() !=
          getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
                                   "' function does not match module sramecc setting");
    return;
  }

  if (!MFI.isEntryFunction())
    return;

  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    AMDGPUMCKernelCodeT KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    KernelCode.validate(&STM, MF->getContext());
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}

void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.pushSection();
  Streamer.switchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
  ReadOnlySection.ensureMinAlignment(Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      MCBinaryExpr::createSub(
          CurrentProgramInfo.NumSGPRsForWavesPerEU,
          AMDGPUMCExpr::createExtraSGPRs(
              CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
              getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
          Context),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}

void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
  Register RegNo = MI->getOperand(0).getReg();

  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "implicit-def: "
     << printReg(RegNo, MF->getSubtarget().getRegisterInfo());

  if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
    OS << " : SGPR spill to VGPR lane";

  OutStreamer->AddComment(OS.str());
  OutStreamer->addBlankLine();
}

void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(SymbolName,
                                              ELF::STT_AMDGPU_HSA_KERNEL);
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber()) + "_" +
         Twine(MBB.getNumber()) + ":")
            .str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}

void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
      OutContext.reportError({}, Twine(GV->getName()) +
                                     ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getDataLayout();
    uint64_t Size = GV->getGlobalSize(DL);
    Align Alignment = GV->getAlign().value_or(Align(4));

    emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    auto *TS = getTargetStreamer();
    TS->emitAMDGPULDS(GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doInitialization(Module &M) {
  CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    switch (CodeObjectVersion) {
    case AMDGPU::AMDHSA_COV4:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
      break;
    case AMDGPU::AMDHSA_COV5:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
      break;
    case AMDGPU::AMDHSA_COV6:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
      break;
    default:
      reportFatalUsageError("unsupported code object version");
    }
  }

  return AsmPrinter::doInitialization(M);
}

/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
///
/// Removes the dependency on GCNSubtarget and depends only on the values
/// necessary for the occupancy computation. Should match the computeOccupancy
/// implementation without passing \p STM on.
const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
                                    const MCExpr *NumVGPRs,
                                    unsigned DynamicVGPRBlockSize,
                                    const GCNSubtarget &STM, MCContext &Ctx) {
  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
  unsigned Generation = STM.getGeneration();

  auto CreateExpr = [&Ctx](unsigned Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy,
                              {CreateExpr(MaxWaves), CreateExpr(Granule),
                               CreateExpr(TargetTotalNumVGPRs),
                               CreateExpr(Generation), CreateExpr(InitOcc),
                               NumSGPRs, NumVGPRs},
                              Ctx);
}

void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
  if (F.isDeclaration() ||
      !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
    return;

  using RIK = MCResourceInfo::ResourceInfoKind;
  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
  MCSymbol *FnSym = TM.getSymbol(&F);
  bool IsLocal = F.hasLocalLinkage();

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  const uint64_t MaxScratchPerWorkitem =
      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  MCSymbol *ScratchSizeSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
  uint64_t ScratchSize;
  if (ScratchSizeSymbol->isVariable() &&
      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
      ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
                                          DS_Error);
    F.getContext().diagnose(DiagStackSize);
  }

  // Validate addressable scalar registers (i.e., prior to added implicit
  // SGPRs).
  MCSymbol *NumSGPRSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (NumSGPRSymbol->isVariable() &&
        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      F.getContext().diagnose(Diag);
      return;
    }
  }

  MCSymbol *VCCUsedSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
  MCSymbol *FlatUsedSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
  uint64_t VCCUsed, FlatUsed, NumSgpr;

  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
      FlatUsedSymbol->isVariable() &&
      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {

    // Recomputes NumSgprs + implicit SGPRs, but all symbols should now be
    // resolvable.
    NumSgpr += IsaInfo::getNumExtraSGPRs(
        &STM, VCCUsed, FlatUsed,
        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
    if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
        STM.hasSGPRInitBug()) {
      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
      if (NumSgpr > MaxAddressableNumSGPRs) {
        DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
                                         MaxAddressableNumSGPRs, DS_Error,
                                         DK_ResourceLimit);
        F.getContext().diagnose(Diag);
        return;
      }
    }

    MCSymbol *NumVgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
    MCSymbol *NumAgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
    uint64_t NumVgpr, NumAgpr;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction *MF = MMI.getMachineFunction(F);
    if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
        TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
        TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
      const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
      unsigned MaxWaves = MFI.getMaxWavesPerEU();
      uint64_t TotalNumVgpr =
          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
      uint64_t NumVGPRsForWavesPerEU =
          std::max({TotalNumVgpr, (uint64_t)1,
                    (uint64_t)STM.getMinNumVGPRs(
                        MaxWaves, MFI.getDynamicVGPRBlockSize())});
      uint64_t NumSGPRsForWavesPerEU = std::max(
          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
      const MCExpr *OccupancyExpr = createOccupancy(
          STM.getOccupancyWithWorkGroupSizes(*MF).second,
          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
          MFI.getDynamicVGPRBlockSize(), STM, OutContext);
      uint64_t Occupancy;

      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
          F, "amdgpu-waves-per-eu", {0, 0}, true);

      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
        DiagnosticInfoOptimizationFailure Diag(
            F, F.getSubprogram(),
            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
            "'" +
                F.getName() + "': desired occupancy was " + Twine(MinWEU) +
                ", final occupancy is " + Twine(Occupancy));
        F.getContext().diagnose(Diag);
        return;
      }
    }
  }
}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  // Don't do it if there is no code.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
    MCSection *TextSect = getObjFileLowering().getTextSection();
    if (TextSect->hasInstructions()) {
      OutStreamer->switchSection(TextSect);
      getTargetStreamer()->EmitCodeEnd(STI);
    }
  }

  // Assign expressions which can only be resolved when all other functions are
  // known.
  RI.finalize(OutContext);

  // Switch section and emit all GPR maximums within the processed module.
  OutStreamer->pushSection();
  MCSectionELF *MaxGPRSection =
      OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
  OutStreamer->switchSection(MaxGPRSection);
  getTargetStreamer()->EmitMCResourceMaximums(
      RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
      RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
  OutStreamer->popSection();

  for (Function &F : M.functions())
    validateMCResourceInfo(F);

  RI.reset();

  return AsmPrinter::doFinalization(M);
}

SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
  SmallString<128> Str;
  raw_svector_ostream OSS(Str);
  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
  printAMDGPUMCExpr(New, OSS, MAI);
  return Str;
}

// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
    const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
                              false);
  OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
  if (NumAGPR && TotalNumVGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
                              false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}

const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();
  uint16_t KernelCodeProperties = 0;
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (UserSGPRInfo.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (UserSGPRInfo.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (UserSGPRInfo.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (UserSGPRInfo.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  // CurrentProgramInfo.DynamicCallStack is an MCExpr and could be
  // un-evaluatable at this point, so it cannot be conditionally checked here.
  // Instead, we'll directly shift the possibly unknown MCExpr into its place
  // and bitwise-or it into KernelCodeProperties.
  const MCExpr *KernelCodePropExpr =
      MCConstantExpr::create(KernelCodeProperties, Ctx);
  const MCExpr *OrValue = MCConstantExpr::create(
      amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
  OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
                                    OrValue, Ctx);
  KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
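  // The net effect is the MCExpr equivalent of:
  //   KernelCodeProperties | (DynamicCallStack << USES_DYNAMIC_STACK_SHIFT).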

  return KernelCodePropExpr;
}

MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  int64_t PGRM_Rsrc3 = 1;
  bool EvaluatableRsrc3 =
      CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
  (void)PGRM_Rsrc3;
  (void)EvaluatableRsrc3;
  assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
         STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
         static_cast<uint64_t>(PGRM_Rsrc3) == 0);
  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(*MF.getFunction().getParent());

  ResourceUsage =
      &getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo();
  CurrentProgramInfo.reset(MF);

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  MCContext &Ctx = MF.getContext();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(ConfigSection);
  }

  RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  }

  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer. This only works
    // with -filetype=obj.
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                           STM.hasMAIInsts());

  {
    using RIK = MCResourceInfo::ResourceInfoKind;
    getTargetStreamer()->EmitMCResourceInfo(
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
                     OutContext, IsLocal));
  }

  // Emit the _dvgpr$ symbol when appropriate.
  emitDVgprSymbol(MF);

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      using RIK = MCResourceInfo::ResourceInfoKind;
      OutStreamer->emitRawComment(" Function info:", false);

      emitCommonFunctionComments(
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
                       IsLocal)
              ->getVariableValue(),
          STM.hasMAIInsts()
              ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
                             OutContext, IsLocal)
                    ->getVariableValue()
              : nullptr,
          RI.createTotalNumVGPRs(MF, Ctx),
          RI.createTotalNumSGPRs(
              MF,
              MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
              Ctx),
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                       OutContext, IsLocal)
              ->getVariableValue(),
          CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize,
        CurrentProgramInfo.getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
            " bytes/workgroup (compile time only)",
        false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);

    OutStreamer->emitRawComment(
        " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
        false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
        false);

    if (STM.hasGFX90AInsts()) {
      const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
          CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
      AdjustedAccum = MCBinaryExpr::createMul(
          AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
      OutStreamer->emitRawComment(
          " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
    }

    if (STM.hasGFX1250Insts())
      OutStreamer->emitRawComment(
          " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
          false);

    OutStreamer->emitRawComment(
        " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
            getMCExprStr(CurrentProgramInfo.ScratchEnable),
        false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                    Twine(CurrentProgramInfo.UserSGPR),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                    Twine(CurrentProgramInfo.TGIdXEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                    Twine(CurrentProgramInfo.TGIdYEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                    Twine(CurrentProgramInfo.TGIdZEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                    Twine(CurrentProgramInfo.TIdIGCompCount),
                                false);

    [[maybe_unused]] int64_t PGMRSrc3;
    assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
           STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
           (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
            static_cast<uint64_t>(PGMRSrc3) == 0));
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
          false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
          false);
    }
  }

  if (DumpCodeInstEmitter) {
    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}

// When appropriate, add a _dvgpr$ symbol, with the value of the function
// symbol, plus an offset encoding one less than the number of VGPR blocks used
// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
// used by a front-end to have functions that are chained rather than called,
// and a dispatcher that dynamically resizes the VGPR count before dispatching
// to a function.
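//
// Worked example (hypothetical numbers): a chain function using 45 VGPRs with
// a 16-VGPR block size occupies divideCeil(45, 16) = 3 blocks, so the _dvgpr$
// symbol's value is the function address plus ((3 - 1) << 3) == 0x10.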
void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.isDynamicVGPREnabled() &&
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
    MCContext &Ctx = MF.getContext();
    unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
    MCValue NumVGPRs;
    if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
            NumVGPRs, nullptr) ||
        !NumVGPRs.isAbsolute()) {
      llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
    }
    // Calculate the number of VGPR blocks.
    // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
    unsigned NumBlocks =
        divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);

    if (NumBlocks > 8) {
      OutContext.reportError({},
                             "too many DVGPR blocks for _dvgpr$ symbol for '" +
                                 Twine(CurrentFnSym->getName()) + "'");
      return;
    }
    unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
    // Add to the function symbol to create the _dvgpr$ symbol.
    const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
        MCSymbolRefExpr::create(CurrentFnSym, Ctx),
        MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
    MCSymbol *DVgprFuncSym =
        Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
    OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
    emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
    emitLinkage(&MF.getFunction(), DVgprFuncSym);
  }
}

// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
                                          getGlobalSTI()->getFeatureString());

  // If the module is empty, we are done.
  if (M.empty())
    return;

  // If the module is not empty, we need to find the first 'Off' or 'On'
  // setting per feature from the functions in the module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}

// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
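// E.g. NumVGPR = 0 -> max(1, 0) = 1 -> alignTo(1, 4) / 4 - 1 = 0, and
// NumVGPR = 13 -> alignTo(13, 4) / 4 - 1 = 16 / 4 - 1 = 3.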
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
  const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
  const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);

  // Can't be lower than 1 for the subsequent alignTo.
  const MCExpr *MaximumTaken =
      AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);

  // Practically, it's computing divideCeil(MaximumTaken, 4).
  const MCExpr *DivCeil = MCBinaryExpr::createDiv(
      AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
      Ctx);

  return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  MCContext &Ctx = MF.getContext();

  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym =
        RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
    return MCSymbolRefExpr::create(Sym, Ctx);
  };

  using RIK = MCResourceInfo::ResourceInfoKind;
  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);

  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

  const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
  const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
      GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
  ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
           WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();

  if (WaveDispatchNumSGPR) {
    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumSGPR,
         MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
                                 Ctx)},
        Ctx);
  }

  if (WaveDispatchNumVGPR) {
    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  }

  // Adjust the number of registers used to meet the default/requested
  // minimum/maximum number of waves per execution unit.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(
                                   MaxWaves, MFI->getDynamicVGPRBlockSize()))},
                              Ctx);

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
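  // E.g. with NumGPR = 37 and an encoding granule of 8:
  // alignTo(37, 8) / 8 - 1 = 40 / 8 - 1 = 4.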
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };

  // GFX10+ will always allocate 128 SGPRs, and this field must be 0.
  if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
    ProgInfo.SGPRBlocks = CreateExpr(0ul);
  } else {
    ProgInfo.SGPRBlocks = GetNumGPRBlocks(
        ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(&STM));
  }
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(&STM));

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize the FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make the clamp modifier return 0 on NaN inputs.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift = 8;
  switch (getLdsDwGranularity(STM)) {
  case 512:
  case 320:
    LDSAlignShift = 11;
    break;
  case 128:
    LDSAlignShift = 9;
    break;
  case 64:
    LDSAlignShift = 8;
    break;
  default:
    llvm_unreachable("invalid LDS block size");
  }
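  // The shift appears to be log2 of the LDS allocation granularity in bytes
  // (dword granularity * 4), rounded up to a power of two: 64 dwords -> 256 B
  // -> shift 8, 128 dwords -> 512 B -> 9, 320 and 512 dwords -> 2048 B -> 11.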

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));
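  // E.g. 1024 bytes of scratch per thread in a wave64 on GFX11+
  // (ScratchAlignShift == 8) gives divideCeil(1024 * 64, 256) = 256 blocks.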

  if (STM.supportsWGP()) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
  }

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.MemOrdered = 1;
    ProgInfo.FwdProgress = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't
  // harm anything to disable it if we know the stack isn't used here. We may
  // still have emitted code reading it to initialize scratch, but if that's
  // unused reading garbage should be OK.
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
                             MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  // return ((Dst & ~Mask) | (Value << Shift))
  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                        uint32_t Shift) {
    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
    Dst = MCBinaryExpr::createOr(
        Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
    return Dst;
  };
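  // E.g. for a one-bit field at bit 16 (Mask == 1u << 16, Shift == 16),
  // SetBits(Dst, CreateExpr(1), Mask, Shift) yields (Dst & ~0x10000) | 0x10000.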

  if (STM.hasGFX90AInsts()) {
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  if (STM.hasGFX1250Insts())
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);

  ProgInfo.Occupancy = createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
      MFI->getDynamicVGPRBlockSize(), STM, Ctx);

  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }

  if (isGFX11Plus(STM)) {
    uint32_t CodeSizeInBytes = (uint32_t)std::min(
        ProgInfo.getFunctionCodeSize(MF, /*IsLowerBound=*/true),
        (uint64_t)std::numeric_limits<uint32_t>::max());
    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
    uint32_t Field, Shift, Width;
    if (isGFX11(STM)) {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
    } else {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
    }
    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
                                       CreateExpr(InstPrefSize), Field, Shift);
  }
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: [[fallthrough]];
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
  MCContext &Ctx = MF.getContext();

  // (((Value) & Mask) << Shift)
  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
                                   shft, Ctx);
  };
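  // E.g. SetBits(ScratchBlocks, /*Mask=*/0x3FFFF, /*Shift=*/12) masks the
  // block count to 18 bits and places it at bits 29:12 of the register value.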
1363
  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val))
      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
    else
      OutStreamer->emitValue(Value, Size);
  };

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
                       /*Size=*/4);

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);

    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
        MF.getContext());
    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
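    // The EXTRA_LDS_SIZE field is coarser on GFX11+ (256-dword granularity
    // versus 128 dwords before), so halve the 128-dword block count there.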
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// Helper function to add the fields common to PAL metadata 3.0+.
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST,
                                  unsigned DynamicVGPRBlockSize) {
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
  MD->setHwStage(CC, ".forward_progress",
                 (bool)CurrentProgramInfo.FwdProgress);

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);

    if (DynamicVGPRBlockSize != 0)
      MD->setComputeRegisters(".dynamic_vgpr_en", true);
  }

  MD->updateHwStageMaximum(
      CC, ".lds_size",
      (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
                 sizeof(uint32_t)));
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto *MD = getTargetStreamer()->getPALMetadata();
  auto &Ctx = MF.getContext();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);

  // For targets that support dynamic VGPRs, set the number of saved dynamic
  // VGPRs (if any) in the PAL metadata.
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (MFI->isDynamicVGPREnabled() &&
      MFI->getScratchReservedForDynamicVGPRs() > 0)
    MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
                   MFI->getScratchReservedForDynamicVGPRs() / 4);

  // Only set AGPRs for supported devices.
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
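  // PAL metadata before version 3 encodes raw RSRC1/RSRC2 register values;
  // version 3.0+ uses named per-hardware-stage fields instead (see
  // EmitPALMetadataCommon).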
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
    } else {
      const MCExpr *HasScratchBlocks =
          MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
                                 MCConstantExpr::create(0, Ctx), Ctx);
      auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
      MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
    }
  } else {
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
                          MFI->getDynamicVGPRBlockSize());
  }

  // The scratch size is in bytes and must be 16-byte aligned.
  MD->setScratchSize(
      CC,
      AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
                                  MCConstantExpr::create(16, Ctx), Ctx),
      Ctx);

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(
          CC,
          MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
          Ctx);
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set the .spi_ps_input_ena and .spi_ps_input_addr fields from
      // PSInputEna and PSInputAddr, one boolean per input.
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena",    ".persp_center_ena",
          ".persp_centroid_ena",  ".persp_pull_model_ena",
          ".linear_sample_ena",   ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena",     ".pos_y_float_ena",
          ".pos_z_float_ena",     ".pos_w_float_ena",
          ".front_face_ena",      ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above, the wavefront size is already set in the
  // metadata.
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers.
    MD->setRsrc1(
        CallingConv::AMDGPU_CS,
        CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    EmitPALMetadataCommon(
        MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
  }

  // Set optional info.
  MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}

// This is supposed to be log2(Size).
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

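// Populate the legacy amd_kernel_code_t header for this kernel from the
// computed program info.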
void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);

  Out.compute_pgm_resource1_registers =
      CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
  Out.compute_pgm_resource2_registers =
      CurrentProgramInfo.getComputePGMRSrc2(Ctx);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;

  AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (UserSGPRInfo.hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (UserSGPRInfo.hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (UserSGPRInfo.hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (UserSGPRInfo.hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (UserSGPRInfo.hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (UserSGPRInfo.hasPrivateSegmentSize())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}

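// Note the AsmPrinter convention: returning false means the operand was
// printed successfully; returning true signals an error or unknown modifier.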
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }
  if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}

void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addPreserved<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addRequired<MachineModuleInfoWrapperPass>();
  AU.addPreserved<MachineModuleInfoWrapperPass>();
  AsmPrinter::getAnalysisUsage(AU);
}

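// Emit the per-kernel resource usage as optimization-analysis remarks, one
// remark per line of output (see the FIXME below), gated on the
// "kernel-resource-usage" remark name.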
void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = " ";

  // If the remark is not specifically enabled, do not output to YAML.
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
    return;

  // Currently non-kernel functions have no resources to emit.
  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
    return;

  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage goes with which kernel,
    // since the kernel name is always displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (RemarkName != "FunctionName")
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 Name, RemarkName, MF.getFunction().getSubprogram(),
                 &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
                          getMCExprStr(CurrentProgramInfo.NumSGPR));
  EmitResourceUsageRemark("NumVGPR", "VGPRs",
                          getMCExprStr(CurrentProgramInfo.NumArchVGPR));
  if (hasMAIInsts) {
    EmitResourceUsageRemark("NumAGPR", "AGPRs",
                            getMCExprStr(CurrentProgramInfo.NumAccVGPR));
  }
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          getMCExprStr(CurrentProgramInfo.ScratchSize));
  int64_t DynStack;
  bool DynStackEvaluatable =
      CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
  StringRef DynamicStackStr =
      DynStackEvaluatable && DynStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          getMCExprStr(CurrentProgramInfo.Occupancy));
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}

char AMDGPUAsmPrinter::ID = 0;

INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
                "AMDGPU Assembly Printer", false, false)