1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
20#include "AMDGPUHSAMetadataStreamer.h"
21#include "AMDGPUMCResourceInfo.h"
22#include "AMDGPUResourceUsageAnalysis.h"
23#include "AMDGPUTargetMachine.h"
24#include "GCNSubtarget.h"
25#include "MCTargetDesc/AMDGPUInstPrinter.h"
26#include "MCTargetDesc/AMDGPUMCExpr.h"
27#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
28#include "MCTargetDesc/AMDGPUTargetStreamer.h"
29#include "R600AsmPrinter.h"
30#include "SIMachineFunctionInfo.h"
31#include "TargetInfo/AMDGPUTargetInfo.h"
32#include "Utils/AMDGPUBaseInfo.h"
33#include "Utils/AMDKernelCodeTUtils.h"
34#include "Utils/SIDefinesUtils.h"
35#include "llvm/ADT/StringSet.h"
36#include "llvm/Analysis/OptimizationRemarkEmitter.h"
37#include "llvm/BinaryFormat/ELF.h"
38#include "llvm/CodeGen/AsmPrinterHandler.h"
39#include "llvm/CodeGen/MachineFrameInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
42#include "llvm/IR/DiagnosticInfo.h"
43#include "llvm/MC/MCAssembler.h"
44#include "llvm/MC/MCContext.h"
45#include "llvm/MC/MCSectionELF.h"
46#include "llvm/MC/MCStreamer.h"
47#include "llvm/MC/MCValue.h"
48#include "llvm/MC/TargetRegistry.h"
49#include "llvm/Support/AMDHSAKernelDescriptor.h"
50#include "llvm/Support/Compiler.h"
51#include "llvm/Target/TargetLoweringObjectFile.h"
52#include "llvm/Target/TargetMachine.h"
53#include "llvm/TargetParser/AMDGPUTargetParser.h"
54
55using namespace llvm;
56using namespace llvm::AMDGPU;
57
58// This should get the default rounding mode from the kernel. We just set the
59// default here, but this could change if the OpenCL rounding mode pragmas are
60// used.
61//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
65//
66// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
67// precision, and leaves single precision to flush all and does not report
68// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
69// CL_FP_DENORM for both.
70//
71// FIXME: It seems some instructions do not support single precision denormals
72// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
73// and sin_f32, cos_f32 on most parts).
74
75// We want to use these instructions, and using fp32 denormals also causes
76// instructions to run at the double precision rate for the device so it's
77// probably best to just report no single precision denormals.
78static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
79 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
80 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
81 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
82 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
83}
84
85static AsmPrinter *
86createAMDGPUAsmPrinterPass(TargetMachine &tm,
87 std::unique_ptr<MCStreamer> &&Streamer) {
88 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
89}
90
91extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
92LLVMInitializeAMDGPUAsmPrinter() {
93 TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(),
94 Fn: llvm::createR600AsmPrinterPass);
95 TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(),
96 Fn: createAMDGPUAsmPrinterPass);
97}
98
99namespace {
100class AMDGPUAsmPrinterHandler : public AsmPrinterHandler {
101protected:
102 AMDGPUAsmPrinter *Asm;
103
104public:
105 AMDGPUAsmPrinterHandler(AMDGPUAsmPrinter *A) : Asm(A) {}
106
107 void beginFunction(const MachineFunction *MF) override {}
108
109 void endFunction(const MachineFunction *MF) override { Asm->endFunction(MF); }
110
111 void endModule() override {}
112};
113} // End anonymous namespace
114
115AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
116 std::unique_ptr<MCStreamer> Streamer)
117 : AsmPrinter(TM, std::move(Streamer)) {
118 assert(OutStreamer && "AsmPrinter constructed without streamer");
119}
120
121StringRef AMDGPUAsmPrinter::getPassName() const {
122 return "AMDGPU Assembly Printer";
123}
124
125const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
126 return &TM.getMCSubtargetInfo();
127}
128
129AMDGPUTargetStreamer *AMDGPUAsmPrinter::getTargetStreamer() const {
130 if (!OutStreamer)
131 return nullptr;
132 return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
133}
134
135void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
136 IsTargetStreamerInitialized = false;
137}
138
139void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
140 IsTargetStreamerInitialized = true;
141
142 // TODO: Which one is called first, emitStartOfAsmFile or
143 // emitFunctionBodyStart?
144 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
145 initializeTargetID(M);
146
147 const Triple &TT = M.getTargetTriple();
148 if (TT.getOS() != Triple::AMDHSA && TT.getOS() != Triple::AMDPAL)
149 return;
150
151 getTargetStreamer()->EmitDirectiveAMDGCNTarget();
152
153 if (TT.getOS() == Triple::AMDHSA) {
154 getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
155 COV: CodeObjectVersion);
156 HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID());
157 }
158
159 if (TT.getOS() == Triple::AMDPAL)
160 getTargetStreamer()->getPALMetadata()->readFromIR(M);
161}
162
163void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
164 // Init target streamer if it has not yet happened
165 if (!IsTargetStreamerInitialized)
166 initTargetStreamer(M);
167
168 const Triple &TT = M.getTargetTriple();
169 if (TT.getOS() != Triple::AMDHSA)
170 getTargetStreamer()->EmitISAVersion();
171
172 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
173 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
174 if (TT.getOS() == Triple::AMDHSA) {
175 HSAMetadataStream->end();
176 bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer());
177 (void)Success;
178 assert(Success && "Malformed HSA Metadata");
179 }
180}
181
182void AMDGPUAsmPrinter::emitFunctionBodyStart() {
183 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
184 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
185 const Function &F = MF->getFunction();
186
187 // TODO: We're checking this late, would be nice to check it earlier.
188 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
189 reportFatalUsageError(
190 reason: STM.getCPU() + " is only available on code object version 6 or better");
191 }
192
193 // TODO: Which one is called first, emitStartOfAsmFile or
194 // emitFunctionBodyStart?
195 if (!getTargetStreamer()->getTargetID())
196 initializeTargetID(M: *F.getParent());
197
198 const auto &FunctionTargetID = STM.getTargetID();
199 // Make sure function's xnack settings are compatible with module's
200 // xnack settings.
201 if (FunctionTargetID.isXnackSupported() &&
202 FunctionTargetID.getXnackSetting() != AMDGPU::TargetIDSetting::Any &&
203 FunctionTargetID.getXnackSetting() !=
204 getTargetStreamer()->getTargetID()->getXnackSetting()) {
205 OutContext.reportError(
206 L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) +
207 "' function does not match module xnack setting");
208 return;
209 }
210 // Make sure function's sramecc settings are compatible with module's
211 // sramecc settings.
212 if (FunctionTargetID.isSramEccSupported() &&
213 FunctionTargetID.getSramEccSetting() != AMDGPU::TargetIDSetting::Any &&
214 FunctionTargetID.getSramEccSetting() !=
215 getTargetStreamer()->getTargetID()->getSramEccSetting()) {
216 OutContext.reportError(
217 L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) +
218 "' function does not match module sramecc setting");
219 return;
220 }
221
222 if (!MFI.isEntryFunction())
223 return;
224
225 if (STM.isMesaKernel(F) &&
226 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
227 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
228 AMDGPUMCKernelCodeT KernelCode;
229 getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF);
230 KernelCode.validate(STI: &STM, Ctx&: MF->getContext());
231 getTargetStreamer()->EmitAMDKernelCodeT(Header&: KernelCode);
232 }
233
234 if (STM.isAmdHsaOS())
235 HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo);
236}
237
238/// Set bits in a kernel descriptor MCExpr field:
239/// return ((Dst & ~Mask) | (Value << Shift))
240static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value,
241 uint32_t Mask, uint32_t Shift, MCContext &Ctx) {
242 const auto *Shft = MCConstantExpr::create(Value: Shift, Ctx);
243 const auto *Msk = MCConstantExpr::create(Value: Mask, Ctx);
244 Dst = MCBinaryExpr::createAnd(LHS: Dst, RHS: MCUnaryExpr::createNot(Expr: Msk, Ctx), Ctx);
245 Dst = MCBinaryExpr::createOr(LHS: Dst, RHS: MCBinaryExpr::createShl(LHS: Value, RHS: Shft, Ctx),
246 Ctx);
247 return Dst;
248}
249
250void AMDGPUAsmPrinter::endFunction(const MachineFunction *MF) {
251 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
252 if (!MFI.isEntryFunction())
253 return;
254
255 assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);
256
257 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
258 MCContext &Ctx = MF->getContext();
259
260 AMDGPU::MCKernelDescriptor KD =
261 getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo);
262
263 // Compute inst_pref_size using MCExpr label subtraction for exact code
264 // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter)
265 // right after the function code, so (Lfunc_end - func_sym) gives the
266 // exact function code size in bytes.
267 if (STM.hasInstPrefSize()) {
268 const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub(
269 LHS: MCSymbolRefExpr::create(Symbol: getFunctionEnd(), Ctx&: OutContext),
270 RHS: MCSymbolRefExpr::create(Symbol: CurrentFnSym, Ctx&: OutContext), Ctx&: OutContext);
271
272 uint32_t Mask, Shift, Width, CacheLineSize;
273 STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize);
274 const MCExpr *InstPrefSize =
275 AMDGPUMCExpr::createInstPrefSize(CodeSizeBytes: CodeSizeExpr, Ctx);
276 KD.compute_pgm_rsrc3 =
277 setBits(Dst: KD.compute_pgm_rsrc3, Value: InstPrefSize, Mask, Shift, Ctx);
278 }
279
280 auto &Streamer = getTargetStreamer()->getStreamer();
281 auto &Context = Streamer.getContext();
282 auto &ObjectFileInfo = *Context.getObjectFileInfo();
283 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
284
285 Streamer.pushSection();
286 Streamer.switchSection(Section: &ReadOnlySection);
287
288 // CP microcode requires the kernel descriptor to be allocated on 64 byte
289 // alignment.
290 Streamer.emitValueToAlignment(Alignment: Align(64), Fill: 0, FillLen: 1, MaxBytesToEmit: 0);
291 ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64));
292
293 SmallString<128> KernelName;
294 getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction());
295 getTargetStreamer()->EmitAmdhsaKernelDescriptor(
296 STI: STM, KernelName, KernelDescriptor: KD, NextVGPR: CurrentProgramInfo.NumVGPRsForWavesPerEU,
297 NextSGPR: MCBinaryExpr::createSub(
298 LHS: CurrentProgramInfo.NumSGPRsForWavesPerEU,
299 RHS: AMDGPUMCExpr::createExtraSGPRs(
300 VCCUsed: CurrentProgramInfo.VCCUsed, FlatScrUsed: CurrentProgramInfo.FlatUsed,
301 XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx&: Context),
302 Ctx&: Context),
303 ReserveVCC: CurrentProgramInfo.VCCUsed, ReserveFlatScr: CurrentProgramInfo.FlatUsed);
304
305 Streamer.popSection();
306}
307
308void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
309 Register RegNo = MI->getOperand(i: 0).getReg();
310
311 SmallString<128> Str;
312 raw_svector_ostream OS(Str);
313 OS << "implicit-def: "
314 << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo());
315
316 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
317 OS << " : SGPR spill to VGPR lane";
318
319 OutStreamer->AddComment(T: OS.str());
320 OutStreamer->addBlankLine();
321}
322
323void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
324 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
325 AsmPrinter::emitFunctionEntryLabel();
326 return;
327 }
328
329 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
330 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
331 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(F: MF->getFunction())) {
332 SmallString<128> SymbolName;
333 getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()),
334 getTargetStreamer()->EmitAMDGPUSymbolType(SymbolName,
335 Type: ELF::STT_AMDGPU_HSA_KERNEL);
336 }
337 if (DumpCodeInstEmitter) {
338 // Disassemble function name label to text.
339 DisasmLines.push_back(x: MF->getName().str() + ":");
340 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
341 HexLines.emplace_back(args: "");
342 }
343
344 AsmPrinter::emitFunctionEntryLabel();
345}
346
347void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
348 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) {
349 // Write a line for the basic block label if it is not only fallthrough.
350 DisasmLines.push_back(x: (Twine("BB") + Twine(getFunctionNumber()) + "_" +
351 Twine(MBB.getNumber()) + ":")
352 .str());
353 DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
354 HexLines.emplace_back(args: "");
355 }
356 AsmPrinter::emitBasicBlockStart(MBB);
357}
358
359void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
360 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
361 if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) {
362 OutContext.reportError(L: {},
363 Msg: Twine(GV->getName()) +
364 ": unsupported initializer for address space");
365 return;
366 }
367
368 const Triple::OSType OS = TM.getTargetTriple().getOS();
369 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
370 if (!AMDGPUTargetMachine::EnableObjectLinking)
371 return;
372 // With object linking, LDS definitions should have been externalized
373 // by earlier passes (e.g. LDS lowering, named barrier lowering).
374 // Only declarations reach here, emitted as SHN_AMDGPU_LDS symbols
375 // so the linker can assign their offsets.
376 assert(GV->isDeclaration() &&
377 "LDS definitions should have been externalized when object "
378 "linking is enabled");
379 }
380
381 MCSymbol *GVSym = getSymbol(GV);
382
383 GVSym->redefineIfPossible();
384 if (GVSym->isDefined() || GVSym->isVariable())
385 report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) +
386 "' is already defined");
387
388 const DataLayout &DL = GV->getDataLayout();
389 uint64_t Size = GV->getGlobalSize(DL);
390 Align Alignment = GV->getAlign().value_or(u: Align(4));
391
392 emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration());
393 emitLinkage(GV, GVSym);
394 auto *TS = getTargetStreamer();
395 TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment);
396 return;
397 }
398
399 AsmPrinter::emitGlobalVariable(GV);
400}
401
402bool AMDGPUAsmPrinter::doInitialization(Module &M) {
403 const llvm::Triple &TT = M.getTargetTriple();
404 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
405
406 if (TT.getOS() == Triple::AMDHSA) {
407 switch (CodeObjectVersion) {
408 case AMDGPU::AMDHSA_COV4:
409 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
410 break;
411 case AMDGPU::AMDHSA_COV5:
412 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
413 break;
414 case AMDGPU::AMDHSA_COV6:
415 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
416 break;
417 default:
418 reportFatalUsageError(reason: "unsupported code object version");
419 }
420
421 addAsmPrinterHandler(Handler: std::make_unique<AMDGPUAsmPrinterHandler>(args: this));
422 }
423
424 return AsmPrinter::doInitialization(M);
425}
426
427/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
428///
429/// Remove dependency on GCNSubtarget and depend only only the necessary values
430/// for said occupancy computation. Should match computeOccupancy implementation
431/// without passing \p STM on.
432const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
433 const MCExpr *NumVGPRs,
434 unsigned DynamicVGPRBlockSize,
435 const GCNSubtarget &STM, MCContext &Ctx) {
436 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(STI: STM);
437 unsigned Granule = IsaInfo::getVGPRAllocGranule(STI: STM, DynamicVGPRBlockSize);
438 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(STI: STM);
439
440 // Bake the per-function SGPR budget into the operands so the late-evaluated
441 // MCExpr stays arithmetic. The trap reservation in particular is implicit on
442 // amdhsa and lives on STM, not on the assembler's MCSubtargetInfo.
443 unsigned SGPRTotal = IsaInfo::getTotalNumSGPRs(STI: STM);
444 unsigned SGPRGranule = IsaInfo::getSGPRAllocGranule(STI: STM);
445 unsigned SGPRTrapReserve = STM.hasTrapHandler() ? IsaInfo::TRAP_NUM_SGPRS : 0;
446
447 auto CreateExpr = [&Ctx](unsigned Value) {
448 return MCConstantExpr::create(Value, Ctx);
449 };
450
451 // Zero SGPR count when SGPRs don't limit occupancy, so the MCExpr skips the
452 // SGPR term without having to test the generation itself.
453 const MCExpr *SGPRArg =
454 IsaInfo::isSGPROccupancyLimited(STI: STM) ? NumSGPRs : CreateExpr(0);
455
456 return AMDGPUMCExpr::create(Kind: AMDGPUMCExpr::AGVK_Occupancy,
457 Args: {CreateExpr(MaxWaves), CreateExpr(Granule),
458 CreateExpr(TargetTotalNumVGPRs),
459 CreateExpr(InitOcc), CreateExpr(SGPRTotal),
460 CreateExpr(SGPRGranule),
461 CreateExpr(SGPRTrapReserve), SGPRArg, NumVGPRs},
462 Ctx);
463}
464
465void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
466 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(CC: F.getCallingConv()))
467 return;
468
469 using RIK = MCResourceInfo::ResourceInfoKind;
470 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
471 MCSymbol *FnSym = TM.getSymbol(GV: &F);
472
473 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
474 int64_t Val;
475 if (Value->evaluateAsAbsolute(Res&: Val)) {
476 Res = Val;
477 return true;
478 }
479 return false;
480 };
481
482 const uint64_t MaxScratchPerWorkitem =
483 STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
484 MCSymbol *ScratchSizeSymbol =
485 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_PrivateSegSize, OutContext);
486 uint64_t ScratchSize;
487 if (ScratchSizeSymbol->isVariable() &&
488 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
489 ScratchSize > MaxScratchPerWorkitem) {
490 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
491 DS_Error);
492 F.getContext().diagnose(DI: DiagStackSize);
493 }
494
495 // Validate addressable scalar registers (i.e., prior to added implicit
496 // SGPRs).
497 MCSymbol *NumSGPRSymbol =
498 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext);
499 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
500 !STM.hasSGPRInitBug()) {
501 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
502 uint64_t NumSgpr;
503 if (NumSGPRSymbol->isVariable() &&
504 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
505 NumSgpr > MaxAddressableNumSGPRs) {
506 F.getContext().diagnose(DI: DiagnosticInfoResourceLimit(
507 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
508 DS_Error, DK_ResourceLimit));
509 return;
510 }
511 }
512
513 MCSymbol *VCCUsedSymbol =
514 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext);
515 MCSymbol *FlatUsedSymbol =
516 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_UsesFlatScratch, OutContext);
517 uint64_t VCCUsed, FlatUsed, NumSgpr;
518
519 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
520 FlatUsedSymbol->isVariable() &&
521 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
522 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
523 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
524
525 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
526 // resolvable.
527 NumSgpr += IsaInfo::getNumExtraSGPRs(
528 STI: STM, VCCUsed, FlatScrUsed: FlatUsed,
529 XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny());
530 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
531 STM.hasSGPRInitBug()) {
532 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
533 if (NumSgpr > MaxAddressableNumSGPRs) {
534 F.getContext().diagnose(DI: DiagnosticInfoResourceLimit(
535 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
536 DK_ResourceLimit));
537 return;
538 }
539 }
540
541 MCSymbol *NumVgprSymbol =
542 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext);
543 MCSymbol *NumAgprSymbol =
544 RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext);
545 uint64_t NumVgpr, NumAgpr;
546
547 MachineModuleInfo &MMI =
548 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
549 MachineFunction *MF = MMI.getMachineFunction(F);
550 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
551 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
552 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
553 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
554 unsigned MaxWaves = MFI.getMaxWavesPerEU();
555 uint64_t TotalNumVgpr =
556 getTotalNumVGPRs(has90AInsts: STM.hasGFX90AInsts(), ArgNumAGPR: NumAgpr, ArgNumVGPR: NumVgpr);
557 uint64_t NumVGPRsForWavesPerEU =
558 std::max(l: {TotalNumVgpr, (uint64_t)1,
559 (uint64_t)STM.getMinNumVGPRs(
560 WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize())});
561 uint64_t NumSGPRsForWavesPerEU = std::max(
562 l: {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(WavesPerEU: MaxWaves)});
563 const MCExpr *OccupancyExpr = createOccupancy(
564 InitOcc: STM.getOccupancyWithWorkGroupSizes(MF: *MF).second,
565 NumSGPRs: MCConstantExpr::create(Value: NumSGPRsForWavesPerEU, Ctx&: OutContext),
566 NumVGPRs: MCConstantExpr::create(Value: NumVGPRsForWavesPerEU, Ctx&: OutContext),
567 DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize(), STM, Ctx&: OutContext);
568 uint64_t Occupancy;
569
570 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
571 F, Name: "amdgpu-waves-per-eu", Default: {0, 0}, OnlyFirstRequired: true);
572
573 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
574 DiagnosticInfoOptimizationFailure Diag(
575 F, F.getSubprogram(),
576 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
577 "'" +
578 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
579 ", final occupancy is " + Twine(Occupancy));
580 F.getContext().diagnose(DI: Diag);
581 return;
582 }
583 }
584 }
585}
586
587static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL,
588 bool IsReturnType) {
589 if (Ty->isVoidTy()) {
590 Enc += 'v';
591 return;
592 }
593 unsigned Bits = DL.getTypeSizeInBits(Ty);
594 // Zero-sized non-void types (e.g. `{}` or `[0 x i8]`) consume no ABI
595 // registers. For returns, emit the same no-result marker as void so the
596 // parameter encoding still has an explicit return-type prefix.
597 if (Bits == 0) {
598 if (IsReturnType)
599 Enc += 'v';
600 return;
601 }
602 if (Bits <= 32)
603 Enc += 'i';
604 else if (Bits <= 64)
605 Enc += 'l';
606 else
607 Enc.append(n: divideCeil(Numerator: Bits, Denominator: 32), c: 'i');
608}
609
610static std::string computeTypeId(const FunctionType *FTy,
611 const DataLayout &DL) {
612 std::string Enc;
613 appendTypeEncoding(Enc, Ty: FTy->getReturnType(), DL, /*IsReturnType=*/true);
614 for (Type *ParamTy : FTy->params())
615 appendTypeEncoding(Enc, Ty: ParamTy, DL, /*IsReturnType=*/false);
616 return Enc;
617}
618
619void AMDGPUAsmPrinter::collectCallEdge(const MachineInstr &MI) {
620 if (!AMDGPUTargetMachine::EnableObjectLinking)
621 return;
622 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
623 const MachineOperand *Callee =
624 TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::callee);
625 if (!Callee || !Callee->isGlobal())
626 return;
627 DirectCallEdges.insert(
628 X: {getSymbol(GV: &MF->getFunction()), getSymbol(GV: Callee->getGlobal())});
629}
630
631void AMDGPUAsmPrinter::emitAMDGPUInfo(Module &M) {
632 if (!AMDGPUTargetMachine::EnableObjectLinking)
633 return;
634
635 const NamedMDNode *LDSMD = M.getNamedMetadata(Name: "amdgpu.lds.uses");
636 bool HasLDSUses = LDSMD && LDSMD->getNumOperands() > 0;
637
638 const NamedMDNode *BarMD = M.getNamedMetadata(Name: "amdgpu.named_barrier.uses");
639 bool HasNamedBarriers = BarMD && BarMD->getNumOperands() > 0;
640
641 // Collect address-taken functions (with type IDs) and indirect call sites.
642 DenseMap<const Function *, std::string> AddrTakenTypeIds;
643 using IndirectCallInfo = std::pair<const Function *, std::string>;
644 SmallVector<IndirectCallInfo, 8> IndirectCalls;
645
646 for (const Function &F : M) {
647 bool IsKernel = AMDGPU::isKernel(CC: F.getCallingConv());
648
649 if (!IsKernel && F.hasAddressTaken(/*PutOffender=*/nullptr,
650 /*IgnoreCallbackUses=*/false,
651 /*IgnoreAssumeLikeCalls=*/true,
652 /*IgnoreLLVMUsed=*/IngoreLLVMUsed: true)) {
653 AddrTakenTypeIds[&F] =
654 computeTypeId(FTy: F.getFunctionType(), DL: M.getDataLayout());
655 }
656
657 if (F.isDeclaration())
658 continue;
659
660 StringSet<> SeenTypeIds;
661 for (const BasicBlock &BB : F) {
662 for (const Instruction &I : BB) {
663 const auto *CB = dyn_cast<CallBase>(Val: &I);
664 if (!CB || !CB->isIndirectCall())
665 continue;
666 std::string TId =
667 computeTypeId(FTy: CB->getFunctionType(), DL: M.getDataLayout());
668 if (SeenTypeIds.insert(key: TId).second)
669 IndirectCalls.push_back(Elt: {&F, std::move(TId)});
670 }
671 }
672 }
673
674 if (FunctionInfos.empty() && DirectCallEdges.empty() && !HasLDSUses &&
675 !HasNamedBarriers && AddrTakenTypeIds.empty() && IndirectCalls.empty())
676 return;
677
678 AMDGPU::InfoSectionData Data;
679 Data.Funcs = std::move(FunctionInfos);
680
681 for (auto &[F, TypeId] : AddrTakenTypeIds) {
682 MCSymbol *Sym = getSymbol(GV: F);
683 Data.TypeIds.push_back(Elt: {Sym, TypeId});
684 }
685
686 for (auto &[CallerSym, CalleeSym] : DirectCallEdges)
687 Data.Calls.push_back(Elt: {CallerSym, CalleeSym});
688 DirectCallEdges.clear();
689
690 if (HasLDSUses) {
691 for (const MDNode *N : LDSMD->operands()) {
692 auto *Func = mdconst::extract<Function>(MD: N->getOperand(I: 0));
693 auto *LdsVar = mdconst::extract<GlobalVariable>(MD: N->getOperand(I: 1));
694 Data.Uses.push_back(Elt: {getSymbol(GV: Func), getSymbol(GV: LdsVar)});
695 }
696 }
697
698 if (HasNamedBarriers) {
699 for (const MDNode *N : BarMD->operands()) {
700 auto *BarVar = mdconst::extract<GlobalVariable>(MD: N->getOperand(I: 0));
701 MCSymbol *BarSym = getSymbol(GV: BarVar);
702 for (unsigned I = 1, E = N->getNumOperands(); I < E; ++I) {
703 auto *Func = mdconst::extract<Function>(MD: N->getOperand(I));
704 Data.Uses.push_back(Elt: {getSymbol(GV: Func), BarSym});
705 }
706 }
707 }
708
709 for (auto &[Caller, Enc] : IndirectCalls) {
710 MCSymbol *CallerSym = getSymbol(GV: Caller);
711 Data.IndirectCalls.push_back(Elt: {CallerSym, Enc});
712 }
713
714 getTargetStreamer()->emitAMDGPUInfo(Data);
715}
716
717bool AMDGPUAsmPrinter::doFinalization(Module &M) {
718 const Triple &TT = M.getTargetTriple();
719
720 // Pad with s_code_end to help tools and guard against instruction prefetch
721 // causing stale data in caches. Arguably this should be done by the linker,
722 // which is why this isn't done for Mesa.
723 // Don't do it if there is no code.
724 const MCSubtargetInfo &STI = *getGlobalSTI();
725 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
726 (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::AMDPAL)) {
727 MCSection *TextSect = getObjFileLowering().getTextSection();
728 if (TextSect->hasInstructions()) {
729 OutStreamer->switchSection(Section: TextSect);
730 getTargetStreamer()->EmitCodeEnd(STI);
731 }
732 }
733
734 // Emit the unified .amdgpu.info section (per-function resources, call graph,
735 // LDS/named-barrier use edges, indirect calls, and address-taken type IDs).
736 emitAMDGPUInfo(M);
737
738 // Assign expressions which can only be resolved when all other functions are
739 // known.
740 RI.finalize(OutContext);
741
742 // Switch section and emit all GPR maximums within the processed module.
743 OutStreamer->pushSection();
744 MCSectionELF *MaxGPRSection =
745 OutContext.getELFSection(Section: ".AMDGPU.gpr_maximums", Type: ELF::SHT_PROGBITS, Flags: 0);
746 OutStreamer->switchSection(Section: MaxGPRSection);
747 getTargetStreamer()->EmitMCResourceMaximums(
748 MaxVGPR: RI.getMaxVGPRSymbol(OutContext), MaxAGPR: RI.getMaxAGPRSymbol(OutContext),
749 MaxSGPR: RI.getMaxSGPRSymbol(OutContext), MaxNamedBarrier: RI.getMaxNamedBarrierSymbol(OutContext));
750 OutStreamer->popSection();
751
752 // In the object-linking pipeline per-function resource MCExprs reference
753 // external callee symbols that cannot be evaluated here, so cross-TU limit
754 // checks would silently no-op for every non-leaf function. Defer resource
755 // sanity checking to the linker, which re-validates against the aggregated
756 // call graph in the combined .amdgpu.info metadata.
757 if (!AMDGPUTargetMachine::EnableObjectLinking) {
758 for (Function &F : M.functions())
759 validateMCResourceInfo(F);
760 }
761
762 RI.reset();
763
764 return AsmPrinter::doFinalization(M);
765}
766
767SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
768 SmallString<128> Str;
769 raw_svector_ostream OSS(Str);
770 auto &Streamer = getTargetStreamer()->getStreamer();
771 auto &Context = Streamer.getContext();
772 const MCExpr *New = foldAMDGPUMCExpr(Expr: Value, Ctx&: Context);
773 printAMDGPUMCExpr(Expr: New, OS&: OSS, MAI: &MAI);
774 return Str;
775}
776
777// Print comments that apply to both callable functions and entry points.
778void AMDGPUAsmPrinter::emitCommonFunctionComments(
779 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
780 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
781 const AMDGPUMachineFunctionInfo *MFI) {
782 OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false);
783 OutStreamer->emitRawComment(T: " TotalNumSgprs: " + getMCExprStr(Value: NumSGPR),
784 TabPrefix: false);
785 OutStreamer->emitRawComment(T: " NumVgprs: " + getMCExprStr(Value: NumVGPR), TabPrefix: false);
786 if (NumAGPR && TotalNumVGPR) {
787 OutStreamer->emitRawComment(T: " NumAgprs: " + getMCExprStr(Value: NumAGPR), TabPrefix: false);
788 OutStreamer->emitRawComment(T: " TotalNumVgprs: " + getMCExprStr(Value: TotalNumVGPR),
789 TabPrefix: false);
790 }
791 OutStreamer->emitRawComment(T: " ScratchSize: " + getMCExprStr(Value: ScratchSize),
792 TabPrefix: false);
793 OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()),
794 TabPrefix: false);
795}
796
797const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
798 const MachineFunction &MF) const {
799 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
800 MCContext &Ctx = MF.getContext();
801 uint16_t KernelCodeProperties = 0;
802 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
803
804 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
805 KernelCodeProperties |=
806 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
807 }
808 if (UserSGPRInfo.hasDispatchPtr()) {
809 KernelCodeProperties |=
810 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
811 }
812 if (UserSGPRInfo.hasQueuePtr()) {
813 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
814 }
815 if (UserSGPRInfo.hasKernargSegmentPtr()) {
816 KernelCodeProperties |=
817 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
818 }
819 if (UserSGPRInfo.hasDispatchID()) {
820 KernelCodeProperties |=
821 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
822 }
823 if (UserSGPRInfo.hasFlatScratchInit()) {
824 KernelCodeProperties |=
825 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
826 }
827 if (UserSGPRInfo.hasPrivateSegmentSize()) {
828 KernelCodeProperties |=
829 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
830 }
831 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
832 KernelCodeProperties |=
833 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
834 }
835
836 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
837 // un-evaluatable at this point so it cannot be conditionally checked here.
838 // Instead, we'll directly shift the possibly unknown MCExpr into its place
839 // and bitwise-or it into KernelCodeProperties.
840 const MCExpr *KernelCodePropExpr =
841 MCConstantExpr::create(Value: KernelCodeProperties, Ctx);
842 const MCExpr *OrValue = MCConstantExpr::create(
843 Value: amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
844 OrValue = MCBinaryExpr::createShl(LHS: CurrentProgramInfo.DynamicCallStack,
845 RHS: OrValue, Ctx);
846 KernelCodePropExpr = MCBinaryExpr::createOr(LHS: KernelCodePropExpr, RHS: OrValue, Ctx);
847
848 return KernelCodePropExpr;
849}
850
851MCKernelDescriptor
852AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
853 const SIProgramInfo &PI) const {
854 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
855 const Function &F = MF.getFunction();
856 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
857 MCContext &Ctx = MF.getContext();
858
859 MCKernelDescriptor KernelDescriptor;
860
861 KernelDescriptor.group_segment_fixed_size =
862 MCConstantExpr::create(Value: PI.LDSSize, Ctx);
863 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
864
865 Align MaxKernArgAlign;
866 KernelDescriptor.kernarg_size = MCConstantExpr::create(
867 Value: STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign), Ctx);
868
869 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(ST: STM, Ctx);
870 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(ST: STM, Ctx);
871 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
872
873 int64_t PGM_Rsrc3 = 1;
874 bool EvaluatableRsrc3 =
875 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(Res&: PGM_Rsrc3);
876 (void)PGM_Rsrc3;
877 (void)EvaluatableRsrc3;
878 assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
879 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
880 static_cast<uint64_t>(PGM_Rsrc3) == 0);
881 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
882
883 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
884 Value: AMDGPU::hasKernargPreload(STI: STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
885 Ctx);
886
887 return KernelDescriptor;
888}
889
890bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
891 // Init target streamer lazily on the first function so that previous passes
892 // can set metadata.
893 if (!IsTargetStreamerInitialized)
894 initTargetStreamer(M&: *MF.getFunction().getParent());
895
896 ResourceUsage =
897 &getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo();
898 CurrentProgramInfo.reset(MF);
899
900 const AMDGPUMachineFunctionInfo *MFI =
901 MF.getInfo<AMDGPUMachineFunctionInfo>();
902 MCContext &Ctx = MF.getContext();
903
904 // The starting address of all shader programs must be 256 bytes aligned.
905 // Regular functions just need the basic required instruction alignment.
906 MF.ensureAlignment(A: MFI->isEntryFunction() ? Align(256) : Align(4));
907
908 SetupMachineFunction(MF);
909
910 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
911 MCContext &Context = getObjFileLowering().getContext();
912 // FIXME: This should be an explicit check for Mesa.
913 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
914 MCSectionELF *ConfigSection =
915 Context.getELFSection(Section: ".AMDGPU.config", Type: ELF::SHT_PROGBITS, Flags: 0);
916 OutStreamer->switchSection(Section: ConfigSection);
917 }
918
919 RI.gatherResourceInfo(MF, FRI: *ResourceUsage, OutContext);
920
921 if (AMDGPUTargetMachine::EnableObjectLinking) {
922 const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &RU =
923 *ResourceUsage;
924 FunctionInfos.push_back(
925 Elt: {/*NumSGPR=*/static_cast<uint32_t>(RU.NumExplicitSGPR),
926 /*NumArchVGPR=*/static_cast<uint32_t>(RU.NumVGPR),
927 /*NumAccVGPR=*/static_cast<uint32_t>(RU.NumAGPR),
928 /*PrivateSegmentSize=*/static_cast<uint32_t>(RU.PrivateSegmentSize),
929 /*UsesVCC=*/RU.UsesVCC,
930 /*UsesFlatScratch=*/RU.UsesFlatScratch,
931 /*HasDynStack=*/RU.HasDynamicallySizedStack,
932 /*Sym=*/getSymbol(GV: &MF.getFunction())});
933 }
934
935 if (MFI->isModuleEntryFunction()) {
936 getSIProgramInfo(Out&: CurrentProgramInfo, MF);
937 }
938
939 if (STM.isAmdPalOS()) {
940 if (MFI->isEntryFunction())
941 EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo);
942 else if (MFI->isModuleEntryFunction())
943 emitPALFunctionMetadata(MF);
944 } else if (!STM.isAmdHsaOS()) {
945 EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo);
946 }
947
948 DumpCodeInstEmitter = nullptr;
949 if (STM.dumpCode()) {
950 // For -dumpcode, get the assembler out of the streamer. This only works
951 // with -filetype=obj.
952 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
953 if (Assembler)
954 DumpCodeInstEmitter = Assembler->getEmitterPtr();
955 }
956
957 DisasmLines.clear();
958 HexLines.clear();
959 DisasmLineMaxLen = 0;
960
961 emitFunctionBody();
962
963 emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(),
964 hasMAIInsts: STM.hasMAIInsts());
965
966 {
967 using RIK = MCResourceInfo::ResourceInfoKind;
968 getTargetStreamer()->EmitMCResourceInfo(
969 NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext),
970 NumAGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext),
971 NumExplicitSGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext),
972 NumNamedBarrier: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumNamedBarrier,
973 OutContext),
974 PrivateSegmentSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
975 OutContext),
976 UsesVCC: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext),
977 UsesFlatScratch: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesFlatScratch,
978 OutContext),
979 HasDynamicallySizedStack: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasDynSizedStack,
980 OutContext),
981 HasRecursion: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasRecursion,
982 OutContext),
983 HasIndirectCall: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasIndirectCall,
984 OutContext));
985 }
986
987 // Emit _dvgpr$ symbol when appropriate.
988 emitDVgprSymbol(MF);
989
990 if (isVerbose()) {
991 MCSectionELF *CommentSection =
992 Context.getELFSection(Section: ".AMDGPU.csdata", Type: ELF::SHT_PROGBITS, Flags: 0);
993 OutStreamer->switchSection(Section: CommentSection);
994
995 if (!MFI->isEntryFunction()) {
996 using RIK = MCResourceInfo::ResourceInfoKind;
997 OutStreamer->emitRawComment(T: " Function info:", TabPrefix: false);
998
999 emitCommonFunctionComments(
1000 NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext)
1001 ->getVariableValue(),
1002 NumAGPR: STM.hasMAIInsts() ? RI.getSymbol(FuncName: CurrentFnSym->getName(),
1003 RIK: RIK::RIK_NumAGPR, OutContext)
1004 ->getVariableValue()
1005 : nullptr,
1006 TotalNumVGPR: RI.createTotalNumVGPRs(MF, Ctx),
1007 NumSGPR: RI.createTotalNumSGPRs(
1008 MF,
1009 hasXnack: MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
1010 Ctx),
1011 ScratchSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
1012 OutContext)
1013 ->getVariableValue(),
1014 CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1015 return false;
1016 }
1017
1018 OutStreamer->emitRawComment(T: " Kernel info:", TabPrefix: false);
1019 emitCommonFunctionComments(
1020 NumVGPR: CurrentProgramInfo.NumArchVGPR,
1021 NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
1022 TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR,
1023 ScratchSize: CurrentProgramInfo.ScratchSize,
1024 CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1025
1026 OutStreamer->emitRawComment(
1027 T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false);
1028 OutStreamer->emitRawComment(
1029 T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false);
1030 OutStreamer->emitRawComment(
1031 T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
1032 " bytes/workgroup (compile time only)",
1033 TabPrefix: false);
1034
1035 OutStreamer->emitRawComment(
1036 T: " SGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.SGPRBlocks), TabPrefix: false);
1037
1038 OutStreamer->emitRawComment(
1039 T: " VGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.VGPRBlocks), TabPrefix: false);
1040
1041 OutStreamer->emitRawComment(
1042 T: " NumSGPRsForWavesPerEU: " +
1043 getMCExprStr(Value: CurrentProgramInfo.NumSGPRsForWavesPerEU),
1044 TabPrefix: false);
1045 OutStreamer->emitRawComment(
1046 T: " NumVGPRsForWavesPerEU: " +
1047 getMCExprStr(Value: CurrentProgramInfo.NumVGPRsForWavesPerEU),
1048 TabPrefix: false);
1049
1050 if (STM.hasGFX90AInsts()) {
1051 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
1052 LHS: CurrentProgramInfo.AccumOffset, RHS: MCConstantExpr::create(Value: 1, Ctx), Ctx);
1053 AdjustedAccum = MCBinaryExpr::createMul(
1054 LHS: AdjustedAccum, RHS: MCConstantExpr::create(Value: 4, Ctx), Ctx);
1055 OutStreamer->emitRawComment(
1056 T: " AccumOffset: " + getMCExprStr(Value: AdjustedAccum), TabPrefix: false);
1057 }
1058
1059 if (STM.hasGFX1250Insts())
1060 OutStreamer->emitRawComment(
1061 T: " NamedBarCnt: " + getMCExprStr(Value: CurrentProgramInfo.NamedBarCnt),
1062 TabPrefix: false);
1063
1064 OutStreamer->emitRawComment(
1065 T: " Occupancy: " + getMCExprStr(Value: CurrentProgramInfo.Occupancy), TabPrefix: false);
1066
1067 OutStreamer->emitRawComment(
1068 T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false);
1069
1070 OutStreamer->emitRawComment(
1071 T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
1072 getMCExprStr(Value: CurrentProgramInfo.ScratchEnable),
1073 TabPrefix: false);
1074 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " +
1075 Twine(CurrentProgramInfo.UserSGPR),
1076 TabPrefix: false);
1077 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
1078 Twine(CurrentProgramInfo.TrapHandlerEnable),
1079 TabPrefix: false);
1080 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
1081 Twine(CurrentProgramInfo.TGIdXEnable),
1082 TabPrefix: false);
1083 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
1084 Twine(CurrentProgramInfo.TGIdYEnable),
1085 TabPrefix: false);
1086 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
1087 Twine(CurrentProgramInfo.TGIdZEnable),
1088 TabPrefix: false);
1089 OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
1090 Twine(CurrentProgramInfo.TIdIGCompCount),
1091 TabPrefix: false);
1092
1093 [[maybe_unused]] int64_t PGMRSrc3;
1094 assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
1095 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
1096 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
1097 static_cast<uint64_t>(PGMRSrc3) == 0));
1098 if (STM.hasGFX90AInsts()) {
1099 OutStreamer->emitRawComment(
1100 T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
1101 getMCExprStr(Value: MCKernelDescriptor::bits_get(
1102 Src: CurrentProgramInfo.ComputePGMRSrc3,
1103 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
1104 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
1105 TabPrefix: false);
1106 OutStreamer->emitRawComment(
1107 T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
1108 getMCExprStr(Value: MCKernelDescriptor::bits_get(
1109 Src: CurrentProgramInfo.ComputePGMRSrc3,
1110 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
1111 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
1112 TabPrefix: false);
1113 }
1114 }
1115
1116 if (DumpCodeInstEmitter) {
1117
1118 OutStreamer->switchSection(
1119 Section: Context.getELFSection(Section: ".AMDGPU.disasm", Type: ELF::SHT_PROGBITS, Flags: 0));
1120
1121 for (size_t i = 0; i < DisasmLines.size(); ++i) {
1122 std::string Comment = "\n";
1123 if (!HexLines[i].empty()) {
1124 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
1125 Comment += " ; " + HexLines[i] + "\n";
1126 }
1127
1128 OutStreamer->emitBytes(Data: StringRef(DisasmLines[i]));
1129 OutStreamer->emitBytes(Data: StringRef(Comment));
1130 }
1131 }
1132
1133 return false;
1134}
1135
1136// When appropriate, add a _dvgpr$ symbol, with the value of the function
1137// symbol, plus an offset encoding one less than the number of VGPR blocks used
1138// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
1139// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
1140// used by a front-end to have functions that are chained rather than called,
1141// and a dispatcher that dynamically resizes the VGPR count before dispatching
1142// to a function.
1143void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
1144 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
1145 if (MFI.isDynamicVGPREnabled() &&
1146 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
1147 MCContext &Ctx = MF.getContext();
1148 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
1149
1150 const MCExpr *EncodedBlocks;
1151 MCValue NumVGPRs;
1152 if (CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
1153 Res&: NumVGPRs, Asm: nullptr) &&
1154 NumVGPRs.isAbsolute()) {
1155
1156 // Calculate number of VGPR blocks.
1157 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
1158 unsigned NumBlocks =
1159 divideCeil(Numerator: std::max(a: unsigned(NumVGPRs.getConstant()), b: 1U), Denominator: BlockSize);
1160
1161 if (NumBlocks > AMDGPU::IsaInfo::MaxDynamicVGPRBlocks) {
1162 OutContext.reportError(
1163 L: {}, Msg: "DVGPR block count " + Twine(NumBlocks) +
1164 " exceeds maximum of " +
1165 Twine(AMDGPU::IsaInfo::MaxDynamicVGPRBlocks) +
1166 " for __dvgpr$ symbol for '" +
1167 Twine(CurrentFnSym->getName()) + "'");
1168 return;
1169 }
1170 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
1171 EncodedBlocks = MCConstantExpr::create(Value: EncodedNumBlocks, Ctx);
1172 } else {
1173 // Value not yet available so build a symbolic MCExpr:
1174 // ((alignTo(max(NumVGPRs, 1), BlockSize) / BlockSize - 1) << 3
1175 const MCExpr *One = MCConstantExpr::create(Value: 1, Ctx);
1176 const MCExpr *BlockSizeConst = MCConstantExpr::create(Value: BlockSize, Ctx);
1177 const MCExpr *MaxVGPRs = AMDGPUMCExpr::createMax(
1178 Args: {CurrentProgramInfo.NumVGPRsForWavesPerEU, One}, Ctx);
1179 const MCExpr *NumBlocks = MCBinaryExpr::createDiv(
1180 LHS: AMDGPUMCExpr::createAlignTo(Value: MaxVGPRs, Align: BlockSizeConst, Ctx),
1181 RHS: BlockSizeConst, Ctx);
1182 EncodedBlocks =
1183 MCBinaryExpr::createShl(LHS: MCBinaryExpr::createSub(LHS: NumBlocks, RHS: One, Ctx),
1184 RHS: MCConstantExpr::create(Value: 3, Ctx), Ctx);
1185 }
1186
1187 // Add to function symbol to create _dvgpr$ symbol.
1188 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
1189 LHS: MCSymbolRefExpr::create(Symbol: CurrentFnSym, Ctx), RHS: EncodedBlocks, Ctx);
1190 MCSymbol *DVgprFuncSym =
1191 Ctx.getOrCreateSymbol(Name: Twine("_dvgpr$") + CurrentFnSym->getName());
1192 OutStreamer->emitAssignment(Symbol: DVgprFuncSym, Value: DVgprFuncVal);
1193 emitVisibility(Sym: DVgprFuncSym, Visibility: MF.getFunction().getVisibility());
1194 emitLinkage(GV: &MF.getFunction(), GVSym: DVgprFuncSym);
1195 }
1196}
1197
1198// TODO: Fold this into emitFunctionBodyStart.
1199void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
1200 // In the beginning all features are either 'Any' or 'NotSupported',
1201 // depending on global target features. This will cover empty modules.
1202 getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(),
1203 FeatureString: getGlobalSTI()->getFeatureString());
1204
1205 // If module is empty, we are done.
1206 if (M.empty())
1207 return;
1208
1209 // If module is not empty, need to find first 'Off' or 'On' feature
1210 // setting per feature from functions in module.
1211 for (auto &F : M) {
1212 auto &TSTargetID = getTargetStreamer()->getTargetID();
1213 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
1214 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
1215 break;
1216
1217 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
1218 const AMDGPU::TargetID &STMTargetID = STM.getTargetID();
1219 if (TSTargetID->isXnackSupported())
1220 if (TSTargetID->getXnackSetting() == AMDGPU::TargetIDSetting::Any)
1221 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
1222 if (TSTargetID->isSramEccSupported())
1223 if (TSTargetID->getSramEccSetting() == AMDGPU::TargetIDSetting::Any)
1224 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
1225 }
1226}
1227
1228// AccumOffset computed for the MCExpr equivalent of:
1229// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
1230static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
1231 const MCExpr *ConstFour = MCConstantExpr::create(Value: 4, Ctx);
1232 const MCExpr *ConstOne = MCConstantExpr::create(Value: 1, Ctx);
1233
1234 // Can't be lower than 1 for subsequent alignTo.
1235 const MCExpr *MaximumTaken =
1236 AMDGPUMCExpr::createMax(Args: {ConstOne, NumVGPR}, Ctx);
1237
1238 // Practically, it's computing divideCeil(MaximumTaken, 4).
1239 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
1240 LHS: AMDGPUMCExpr::createAlignTo(Value: MaximumTaken, Align: ConstFour, Ctx), RHS: ConstFour,
1241 Ctx);
1242
1243 return MCBinaryExpr::createSub(LHS: DivCeil, RHS: ConstOne, Ctx);
1244}
1245
1246void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1247 const MachineFunction &MF) {
1248 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1249 MCContext &Ctx = MF.getContext();
1250
1251 auto CreateExpr = [&Ctx](int64_t Value) {
1252 return MCConstantExpr::create(Value, Ctx);
1253 };
1254
1255 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1256 int64_t Val;
1257 if (Value->evaluateAsAbsolute(Res&: Val)) {
1258 Res = Val;
1259 return true;
1260 }
1261 return false;
1262 };
1263
1264 auto GetSymRefExpr =
1265 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1266 MCSymbol *Sym = RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK, OutContext);
1267 return MCSymbolRefExpr::create(Symbol: Sym, Ctx);
1268 };
1269
1270 using RIK = MCResourceInfo::ResourceInfoKind;
1271 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1272 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1273 ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
1274 NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1275
1276 ProgInfo.AccumOffset = computeAccumOffset(NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1277 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1278 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1279 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1280 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1281 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
1282 ProgInfo.DynamicCallStack =
1283 MCBinaryExpr::createOr(LHS: GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1284 RHS: GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1285
1286 const MCExpr *BarBlkConst = MCConstantExpr::create(Value: 4, Ctx);
1287 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1288 Value: GetSymRefExpr(RIK::RIK_NumNamedBarrier), Align: BarBlkConst, Ctx);
1289 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(LHS: AlignToBlk, RHS: BarBlkConst, Ctx);
1290
1291 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1292
1293 // The calculations related to SGPR/VGPR blocks are
1294 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1295 // unified.
1296 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1297 VCCUsed: ProgInfo.VCCUsed, FlatScrUsed: ProgInfo.FlatUsed,
1298 XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1299
1300 // Check the addressable register limit before we add ExtraSGPRs.
1301 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1302 !STM.hasSGPRInitBug()) {
1303 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1304 uint64_t NumSgpr;
1305 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1306 NumSgpr > MaxAddressableNumSGPRs) {
1307 // This can happen due to a compiler bug or when using inline asm.
1308 LLVMContext &Ctx = MF.getFunction().getContext();
1309 Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
1310 MF.getFunction(), "addressable scalar registers", NumSgpr,
1311 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
1312 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1313 }
1314 }
1315
1316 // Account for extra SGPRs and VGPRs reserved for debugger use.
1317 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(LHS: ProgInfo.NumSGPR, RHS: ExtraSGPRs, Ctx);
1318
1319 const Function &F = MF.getFunction();
1320
1321 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1322 // dispatch registers as function args.
1323 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1324 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1325
1326 if (WaveDispatchNumSGPR) {
1327 ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
1328 Args: {ProgInfo.NumSGPR,
1329 MCBinaryExpr::createAdd(LHS: CreateExpr(WaveDispatchNumSGPR), RHS: ExtraSGPRs,
1330 Ctx)},
1331 Ctx);
1332 }
1333
1334 if (WaveDispatchNumVGPR) {
1335 ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
1336 Args: {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1337
1338 ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
1339 NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1340 }
1341
1342 // Adjust number of registers used to meet default/requested minimum/maximum
1343 // number of waves per execution unit request.
1344 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1345 ProgInfo.NumSGPRsForWavesPerEU =
1346 AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, CreateExpr(1ul),
1347 CreateExpr(STM.getMinNumSGPRs(WavesPerEU: MaxWaves))},
1348 Ctx);
1349 ProgInfo.NumVGPRsForWavesPerEU =
1350 AMDGPUMCExpr::createMax(Args: {ProgInfo.NumVGPR, CreateExpr(1ul),
1351 CreateExpr(STM.getMinNumVGPRs(
1352 WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()))},
1353 Ctx);
1354
1355 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
1356 STM.hasSGPRInitBug()) {
1357 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1358 uint64_t NumSgpr;
1359 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1360 NumSgpr > MaxAddressableNumSGPRs) {
1361 // This can happen due to a compiler bug or when using inline asm to use
1362 // the registers which are usually reserved for vcc etc.
1363 LLVMContext &Ctx = MF.getFunction().getContext();
1364 Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
1365 MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
1366 DS_Error, DK_ResourceLimit));
1367 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1368 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1369 }
1370 }
1371
1372 if (STM.hasSGPRInitBug()) {
1373 ProgInfo.NumSGPR =
1374 CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1375 ProgInfo.NumSGPRsForWavesPerEU =
1376 CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1377 }
1378
1379 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1380 LLVMContext &Ctx = MF.getFunction().getContext();
1381 Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
1382 MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
1383 STM.getMaxNumUserSGPRs(), DS_Error));
1384 }
1385
1386 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1387 LLVMContext &Ctx = MF.getFunction().getContext();
1388 Ctx.diagnose(DI: DiagnosticInfoResourceLimit(
1389 MF.getFunction(), "local memory", MFI->getLDSSize(),
1390 STM.getAddressableLocalMemorySize(), DS_Error));
1391 }
1392 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1393 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1394 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1395 unsigned Granule) {
1396 const MCExpr *OneConst = CreateExpr(1ul);
1397 const MCExpr *GranuleConst = CreateExpr(Granule);
1398 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax(Args: {NumGPR, OneConst}, Ctx);
1399 const MCExpr *AlignToGPR =
1400 AMDGPUMCExpr::createAlignTo(Value: MaxNumGPR, Align: GranuleConst, Ctx);
1401 const MCExpr *DivGPR =
1402 MCBinaryExpr::createDiv(LHS: AlignToGPR, RHS: GranuleConst, Ctx);
1403 const MCExpr *SubGPR = MCBinaryExpr::createSub(LHS: DivGPR, RHS: OneConst, Ctx);
1404 return SubGPR;
1405 };
1406 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1407 if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
1408 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1409 } else {
1410 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1411 IsaInfo::getSGPREncodingGranule(STI: STM));
1412 }
1413 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1414 IsaInfo::getVGPREncodingGranule(STI: STM));
1415
1416 const SIModeRegisterDefaults Mode = MFI->getMode();
1417
1418 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1419 // register.
1420 ProgInfo.FloatMode = getFPMode(Mode);
1421
1422 ProgInfo.IEEEMode = Mode.IEEE;
1423
1424 // Make clamp modifier on NaN input returns 0.
1425 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1426
1427 unsigned LDSAlignShift = 8;
1428 switch (getLdsDwGranularity(ST: STM)) {
1429 case 512:
1430 case 320:
1431 LDSAlignShift = 11;
1432 break;
1433 case 128:
1434 LDSAlignShift = 9;
1435 break;
1436 case 64:
1437 LDSAlignShift = 8;
1438 break;
1439 default:
1440 llvm_unreachable("invald LDS block size");
1441 }
1442
1443 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1444 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1445
1446 ProgInfo.LDSSize = MFI->getLDSSize();
1447 ProgInfo.LDSBlocks =
1448 alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift;
1449
1450 // The MCExpr equivalent of divideCeil.
1451 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1452 const MCExpr *Ceil =
1453 AMDGPUMCExpr::createAlignTo(Value: Numerator, Align: Denominator, Ctx);
1454 return MCBinaryExpr::createDiv(LHS: Ceil, RHS: Denominator, Ctx);
1455 };
1456
1457 // Scratch is allocated in 64-dword or 256-dword blocks.
1458 unsigned ScratchAlignShift =
1459 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1460 // We need to program the hardware with the amount of scratch memory that
1461 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1462 // scratch memory used per thread.
1463 ProgInfo.ScratchBlocks = DivideCeil(
1464 MCBinaryExpr::createMul(LHS: ProgInfo.ScratchSize,
1465 RHS: CreateExpr(STM.getWavefrontSize()), Ctx),
1466 CreateExpr(1ULL << ScratchAlignShift));
1467
1468 if (STM.supportsWGP()) {
1469 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1470 }
1471
1472 if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) {
1473 ProgInfo.MemOrdered = 1;
1474 ProgInfo.FwdProgress = !F.hasFnAttribute(Kind: "amdgpu-no-fwd-progress");
1475 }
1476
1477 // 0 = X, 1 = XY, 2 = XYZ
1478 unsigned TIDIGCompCnt = 0;
1479 if (MFI->hasWorkItemIDZ())
1480 TIDIGCompCnt = 2;
1481 else if (MFI->hasWorkItemIDY())
1482 TIDIGCompCnt = 1;
1483
1484 // The private segment wave byte offset is the last of the system SGPRs. We
1485 // initially assumed it was allocated, and may have used it. It shouldn't harm
1486 // anything to disable it if we know the stack isn't used here. We may still
1487 // have emitted code reading it to initialize scratch, but if that's unused
1488 // reading garbage should be OK.
1489 ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
1490 LHS: MCBinaryExpr::createGT(LHS: ProgInfo.ScratchBlocks,
1491 RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx),
1492 RHS: ProgInfo.DynamicCallStack, Ctx);
1493
1494 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1495 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1496 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1497 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1498 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1499 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1500 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1501 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1502 ProgInfo.EXCPEnMSB = 0;
1503 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1504 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1505 ProgInfo.EXCPEnable = 0;
1506
1507 if (STM.hasGFX90AInsts()) {
1508 ProgInfo.ComputePGMRSrc3 =
1509 setBits(Dst: ProgInfo.ComputePGMRSrc3, Value: ProgInfo.AccumOffset,
1510 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1511 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx);
1512 ProgInfo.ComputePGMRSrc3 =
1513 setBits(Dst: ProgInfo.ComputePGMRSrc3, Value: CreateExpr(ProgInfo.TgSplit),
1514 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1515 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx);
1516 }
1517
1518 if (STM.hasGFX1250Insts())
1519 ProgInfo.ComputePGMRSrc3 =
1520 setBits(Dst: ProgInfo.ComputePGMRSrc3, Value: ProgInfo.NamedBarCnt,
1521 Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1522 Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx);
1523
1524 ProgInfo.Occupancy = createOccupancy(
1525 InitOcc: STM.computeOccupancy(F, LDSSize: ProgInfo.LDSSize).second,
1526 NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU,
1527 DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1528
1529 const auto [MinWEU, MaxWEU] =
1530 AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default: {0, 0}, OnlyFirstRequired: true);
1531 uint64_t Occupancy;
1532 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1533 DiagnosticInfoOptimizationFailure Diag(
1534 F, F.getSubprogram(),
1535 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1536 "'" +
1537 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1538 ", final occupancy is " + Twine(Occupancy));
1539 F.getContext().diagnose(DI: Diag);
1540 }
1541}
1542
1543static unsigned getRsrcReg(CallingConv::ID CallConv) {
1544 switch (CallConv) {
1545 default:
1546 [[fallthrough]];
1547 case CallingConv::AMDGPU_CS:
1548 return R_00B848_COMPUTE_PGM_RSRC1;
1549 case CallingConv::AMDGPU_LS:
1550 return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1551 case CallingConv::AMDGPU_HS:
1552 return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1553 case CallingConv::AMDGPU_ES:
1554 return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1555 case CallingConv::AMDGPU_GS:
1556 return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1557 case CallingConv::AMDGPU_VS:
1558 return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1559 case CallingConv::AMDGPU_PS:
1560 return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1561 }
1562}
1563
1564void AMDGPUAsmPrinter::EmitProgramInfoSI(
1565 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1566 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1567 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1568 unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv());
1569 MCContext &Ctx = MF.getContext();
1570
1571 // (((Value) & Mask) << Shift)
1572 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1573 const MCExpr *msk = MCConstantExpr::create(Value: Mask, Ctx);
1574 const MCExpr *shft = MCConstantExpr::create(Value: Shift, Ctx);
1575 return MCBinaryExpr::createShl(LHS: MCBinaryExpr::createAnd(LHS: Value, RHS: msk, Ctx),
1576 RHS: shft, Ctx);
1577 };
1578
1579 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1580 int64_t Val;
1581 if (Value->evaluateAsAbsolute(Res&: Val))
1582 OutStreamer->emitIntValue(Value: static_cast<uint64_t>(Val), Size);
1583 else
1584 OutStreamer->emitValue(Value, Size);
1585 };
1586
1587 if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
1588 OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
1589
1590 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx),
1591 /*Size=*/4);
1592
1593 OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
1594 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(ST: STM, Ctx),
1595 /*Size=*/4);
1596
1597 OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
1598
1599 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1600 // appropriate generation.
1601 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1602 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1603 /*Mask=*/0x3FFFF, /*Shift=*/12),
1604 /*Size=*/4);
1605 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1606 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1607 /*Mask=*/0x7FFF, /*Shift=*/12),
1608 /*Size=*/4);
1609 } else {
1610 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1611 /*Mask=*/0x1FFF, /*Shift=*/12),
1612 /*Size=*/4);
1613 }
1614
1615 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1616 // 0" comment but I don't see a corresponding field in the register spec.
1617 } else {
1618 OutStreamer->emitInt32(Value: RsrcReg);
1619
1620 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1621 LHS: SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1622 RHS: SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1623 Ctx&: MF.getContext());
1624 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1625 OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
1626
1627 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1628 // appropriate generation.
1629 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1630 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1631 /*Mask=*/0x3FFFF, /*Shift=*/12),
1632 /*Size=*/4);
1633 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1634 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1635 /*Mask=*/0x7FFF, /*Shift=*/12),
1636 /*Size=*/4);
1637 } else {
1638 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1639 /*Mask=*/0x1FFF, /*Shift=*/12),
1640 /*Size=*/4);
1641 }
1642 }
1643
1644 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1645 OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1646 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1647 ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
1648 : CurrentProgramInfo.LDSBlocks;
1649 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1650 OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1651 OutStreamer->emitInt32(Value: MFI->getPSInputEnable());
1652 OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1653 OutStreamer->emitInt32(Value: MFI->getPSInputAddr());
1654 }
1655
1656 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1657 OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs());
1658 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1659 OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs());
1660}
1661
1662// Helper function to add common PAL Metadata 3.0+
1663static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1664 const SIProgramInfo &CurrentProgramInfo,
1665 CallingConv::ID CC, const GCNSubtarget &ST,
1666 unsigned DynamicVGPRBlockSize) {
1667 if (ST.hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1668 MD->setHwStage(CC, field: ".ieee_mode", Val: (bool)CurrentProgramInfo.IEEEMode);
1669
1670 MD->setHwStage(CC, field: ".wgp_mode", Val: (bool)CurrentProgramInfo.WgpMode);
1671 MD->setHwStage(CC, field: ".mem_ordered", Val: (bool)CurrentProgramInfo.MemOrdered);
1672 MD->setHwStage(CC, field: ".forward_progress", Val: (bool)CurrentProgramInfo.FwdProgress);
1673
1674 if (AMDGPU::isCompute(CC)) {
1675 MD->setHwStage(CC, field: ".trap_present",
1676 Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
1677 MD->setHwStage(CC, field: ".excp_en", Val: CurrentProgramInfo.EXCPEnable);
1678
1679 if (DynamicVGPRBlockSize != 0)
1680 MD->setComputeRegisters(field: ".dynamic_vgpr_en", Val: true);
1681 }
1682
1683 MD->updateHwStageMaximum(
1684 CC, field: ".lds_size",
1685 Val: (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1686 sizeof(uint32_t)));
1687}
1688
1689// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1690// is AMDPAL. It stores each compute/SPI register setting and other PAL
1691// metadata items into the PALMD::Metadata, combining with any provided by the
1692// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1693// is then written as a single block in the .note section.
1694void AMDGPUAsmPrinter::EmitPALMetadata(
1695 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1696 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1697 auto CC = MF.getFunction().getCallingConv();
1698 auto *MD = getTargetStreamer()->getPALMetadata();
1699 auto &Ctx = MF.getContext();
1700
1701 MD->setEntryPoint(CC, Name: MF.getFunction().getName());
1702 MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1703
1704 // For targets that support dynamic VGPRs, set the number of saved dynamic
1705 // VGPRs (if any) in the PAL metadata.
1706 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1707 if (MFI->isDynamicVGPREnabled() &&
1708 MFI->getScratchReservedForDynamicVGPRs() > 0)
1709 MD->setHwStage(CC, field: ".dynamic_vgpr_saved_count",
1710 Val: MFI->getScratchReservedForDynamicVGPRs() / 4);
1711
1712 // Only set AGPRs for supported devices
1713 if (STM.hasMAIInsts()) {
1714 MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR);
1715 }
1716
1717 MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1718 if (MD->getPALMajorVersion() < 3) {
1719 MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM, Ctx), Ctx);
1720 if (AMDGPU::isCompute(CC)) {
1721 MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2(ST: STM, Ctx), Ctx);
1722 } else {
1723 const MCExpr *HasScratchBlocks =
1724 MCBinaryExpr::createGT(LHS: CurrentProgramInfo.ScratchBlocks,
1725 RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx);
1726 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1727 MD->setRsrc2(CC, Val: maskShiftSet(Val: HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1728 }
1729 } else {
1730 MD->setHwStage(CC, field: ".debug_mode", Val: (bool)CurrentProgramInfo.DebugMode);
1731 MD->setHwStage(CC, field: ".scratch_en", Type: msgpack::Type::Boolean,
1732 Val: CurrentProgramInfo.ScratchEnable);
1733 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM,
1734 DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize());
1735 }
1736
1737 // ScratchSize is in bytes, 16 aligned.
1738 MD->setScratchSize(
1739 CC,
1740 Val: AMDGPUMCExpr::createAlignTo(Value: CurrentProgramInfo.ScratchSize,
1741 Align: MCConstantExpr::create(Value: 16, Ctx), Ctx),
1742 Ctx);
1743
1744 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1745 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1746 ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2)
1747 : CurrentProgramInfo.LDSBlocks;
1748 if (MD->getPALMajorVersion() < 3) {
1749 MD->setRsrc2(
1750 CC,
1751 Val: MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
1752 Ctx);
1753 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1754 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1755 } else {
1756 // Graphics registers
1757 const unsigned ExtraLdsDwGranularity =
1758 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1759 MD->setGraphicsRegisters(
1760 field: ".ps_extra_lds_size",
1761 Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1762
1763 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1764 static StringLiteral const PsInputFields[] = {
1765 ".persp_sample_ena", ".persp_center_ena",
1766 ".persp_centroid_ena", ".persp_pull_model_ena",
1767 ".linear_sample_ena", ".linear_center_ena",
1768 ".linear_centroid_ena", ".line_stipple_tex_ena",
1769 ".pos_x_float_ena", ".pos_y_float_ena",
1770 ".pos_z_float_ena", ".pos_w_float_ena",
1771 ".front_face_ena", ".ancillary_ena",
1772 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1773 unsigned PSInputEna = MFI->getPSInputEnable();
1774 unsigned PSInputAddr = MFI->getPSInputAddr();
1775 for (auto [Idx, Field] : enumerate(First: PsInputFields)) {
1776 MD->setGraphicsRegisters(field1: ".spi_ps_input_ena", field2: Field,
1777 Val: (bool)((PSInputEna >> Idx) & 1));
1778 MD->setGraphicsRegisters(field1: ".spi_ps_input_addr", field2: Field,
1779 Val: (bool)((PSInputAddr >> Idx) & 1));
1780 }
1781 }
1782 }
1783
1784 // For version 3 and above the wave front size is already set in the metadata
1785 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1786 MD->setWave32(MF.getFunction().getCallingConv());
1787}
1788
1789void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1790 auto *MD = getTargetStreamer()->getPALMetadata();
1791 const MachineFrameInfo &MFI = MF.getFrameInfo();
1792 StringRef FnName = MF.getFunction().getName();
1793 MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
1794 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1795 MCContext &Ctx = MF.getContext();
1796
1797 if (MD->getPALMajorVersion() < 3) {
1798 // Set compute registers
1799 MD->setRsrc1(
1800 CC: CallingConv::AMDGPU_CS,
1801 Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1802 MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
1803 Val: CurrentProgramInfo.getComputePGMRSrc2(ST, Ctx), Ctx);
1804 } else {
1805 EmitPALMetadataCommon(
1806 MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST,
1807 DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1808 }
1809
1810 // Set optional info
1811 MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
1812 MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
1813 MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
1814}
1815
1816// This is supposed to be log2(Size)
1817static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1818 switch (Size) {
1819 case 4:
1820 return AMD_ELEMENT_4_BYTES;
1821 case 8:
1822 return AMD_ELEMENT_8_BYTES;
1823 case 16:
1824 return AMD_ELEMENT_16_BYTES;
1825 default:
1826 llvm_unreachable("invalid private_element_size");
1827 }
1828}
1829
1830void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1831 const SIProgramInfo &CurrentProgramInfo,
1832 const MachineFunction &MF) const {
1833 const Function &F = MF.getFunction();
1834 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1835 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1836
1837 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1838 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1839 MCContext &Ctx = MF.getContext();
1840
1841 Out.initDefault(STI: STM, Ctx, /*InitMCExpr=*/false);
1842
1843 Out.compute_pgm_resource1_registers =
1844 CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx);
1845 Out.compute_pgm_resource2_registers =
1846 CurrentProgramInfo.getComputePGMRSrc2(ST: STM, Ctx);
1847 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
1848
1849 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1850
1851 AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1852 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1853
1854 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1855 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1856 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1857 }
1858
1859 if (UserSGPRInfo.hasDispatchPtr())
1860 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1861
1862 if (UserSGPRInfo.hasQueuePtr())
1863 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1864
1865 if (UserSGPRInfo.hasKernargSegmentPtr())
1866 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1867
1868 if (UserSGPRInfo.hasDispatchID())
1869 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1870
1871 if (UserSGPRInfo.hasFlatScratchInit())
1872 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1873
1874 if (UserSGPRInfo.hasPrivateSegmentSize())
1875 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
1876
1877 if (STM.isXNACKEnabled())
1878 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1879
1880 Align MaxKernArgAlign;
1881 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign);
1882 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1883 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1884 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1885 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1886
1887 // kernarg_segment_alignment is specified as log of the alignment.
1888 // The minimum alignment is 16.
1889 // FIXME: The metadata treats the minimum as 4?
1890 Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign));
1891}
1892
1893bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1894 const char *ExtraCode, raw_ostream &O) {
1895 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1896 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O))
1897 return false;
1898
1899 if (ExtraCode && ExtraCode[0]) {
1900 if (ExtraCode[1] != 0)
1901 return true; // Unknown modifier.
1902
1903 switch (ExtraCode[0]) {
1904 case 'r':
1905 break;
1906 default:
1907 return true;
1908 }
1909 }
1910
1911 // TODO: Should be able to support other operand types like globals.
1912 const MachineOperand &MO = MI->getOperand(i: OpNo);
1913 if (MO.isReg()) {
1914 AMDGPUInstPrinter::printRegOperand(Reg: MO.getReg(), O,
1915 MRI: *MF->getSubtarget().getRegisterInfo());
1916 return false;
1917 }
1918 if (MO.isImm()) {
1919 int64_t Val = MO.getImm();
1920 if (AMDGPU::isInlinableIntLiteral(Literal: Val)) {
1921 O << Val;
1922 } else if (isUInt<16>(x: Val)) {
1923 O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val));
1924 } else if (isUInt<32>(x: Val)) {
1925 O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val));
1926 } else {
1927 O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val));
1928 }
1929 return false;
1930 }
1931 return true;
1932}
1933
1934void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
1935 AU.addRequired<AMDGPUResourceUsageAnalysisWrapperPass>();
1936 AU.addPreserved<AMDGPUResourceUsageAnalysisWrapperPass>();
1937 AU.addRequired<MachineModuleInfoWrapperPass>();
1938 AU.addPreserved<MachineModuleInfoWrapperPass>();
1939 AsmPrinter::getAnalysisUsage(AU);
1940}
1941
1942void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1943 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1944 bool isModuleEntryFunction, bool hasMAIInsts) {
1945 if (!ORE)
1946 return;
1947
1948 const char *Name = "kernel-resource-usage";
1949 const char *Indent = " ";
1950
1951 // If the remark is not specifically enabled, do not output to yaml
1952 LLVMContext &Ctx = MF.getFunction().getContext();
1953 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name))
1954 return;
1955
1956 // Currently non-kernel functions have no resources to emit.
1957 if (!isEntryFunctionCC(CC: MF.getFunction().getCallingConv()))
1958 return;
1959
1960 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1961 StringRef RemarkLabel, auto Argument) {
1962 // Add an indent for every line besides the line with the kernel name. This
1963 // makes it easier to tell which resource usage go with which kernel since
1964 // the kernel name will always be displayed first.
1965 std::string LabelStr = RemarkLabel.str() + ": ";
1966 if (RemarkName != "FunctionName")
1967 LabelStr = Indent + LabelStr;
1968
1969 ORE->emit([&]() {
1970 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1971 MF.getFunction().getSubprogram(),
1972 &MF.front())
1973 << LabelStr << ore::NV(RemarkName, Argument);
1974 });
1975 };
1976
1977 // FIXME: Formatting here is pretty nasty because clang does not accept
1978 // newlines from diagnostics. This forces us to emit multiple diagnostic
1979 // remarks to simulate newlines. If and when clang does accept newlines, this
1980 // formatting should be aggregated into one remark with newlines to avoid
1981 // printing multiple diagnostic location and diag opts.
1982 EmitResourceUsageRemark("FunctionName", "Function Name",
1983 MF.getFunction().getName());
1984 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1985 getMCExprStr(Value: CurrentProgramInfo.NumSGPR));
1986 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1987 getMCExprStr(Value: CurrentProgramInfo.NumArchVGPR));
1988 if (hasMAIInsts) {
1989 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1990 getMCExprStr(Value: CurrentProgramInfo.NumAccVGPR));
1991 }
1992 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1993 getMCExprStr(Value: CurrentProgramInfo.ScratchSize));
1994 int64_t DynStack;
1995 bool DynStackEvaluatable =
1996 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(Res&: DynStack);
1997 StringRef DynamicStackStr =
1998 DynStackEvaluatable && DynStack ? "True" : "False";
1999 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
2000 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
2001 getMCExprStr(Value: CurrentProgramInfo.Occupancy));
2002 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
2003 CurrentProgramInfo.SGPRSpill);
2004 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
2005 CurrentProgramInfo.VGPRSpill);
2006 if (isModuleEntryFunction)
2007 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
2008 CurrentProgramInfo.LDSSize);
2009}
2010
2011char AMDGPUAsmPrinter::ID = 0;
2012
2013INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
2014 "AMDGPU Assembly Printer", false, false)
2015