AMDGPUAsmPrinter.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp]

1	//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	///
11	/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12	/// code. When passed an MCAsmStreamer it prints assembly and when passed
13	/// an MCObjectStreamer it outputs binary code.
14	//
15	//===----------------------------------------------------------------------===//
16	//
17
18	#include "AMDGPUAsmPrinter.h"
19	#include "AMDGPU.h"
20	#include "AMDGPUHSAMetadataStreamer.h"
21	#include "AMDGPUMCResourceInfo.h"
22	#include "AMDGPUResourceUsageAnalysis.h"
23	#include "GCNSubtarget.h"
24	#include "MCTargetDesc/AMDGPUInstPrinter.h"
25	#include "MCTargetDesc/AMDGPUMCExpr.h"
26	#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
27	#include "MCTargetDesc/AMDGPUTargetStreamer.h"
28	#include "R600AsmPrinter.h"
29	#include "SIMachineFunctionInfo.h"
30	#include "TargetInfo/AMDGPUTargetInfo.h"
31	#include "Utils/AMDGPUBaseInfo.h"
32	#include "Utils/AMDKernelCodeTUtils.h"
33	#include "Utils/SIDefinesUtils.h"
34	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35	#include "llvm/BinaryFormat/ELF.h"
36	#include "llvm/CodeGen/MachineFrameInfo.h"
37	#include "llvm/CodeGen/MachineModuleInfo.h"
38	#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
39	#include "llvm/IR/DiagnosticInfo.h"
40	#include "llvm/MC/MCAssembler.h"
41	#include "llvm/MC/MCContext.h"
42	#include "llvm/MC/MCSectionELF.h"
43	#include "llvm/MC/MCStreamer.h"
44	#include "llvm/MC/TargetRegistry.h"
45	#include "llvm/Support/AMDHSAKernelDescriptor.h"
46	#include "llvm/Support/Compiler.h"
47	#include "llvm/Target/TargetLoweringObjectFile.h"
48	#include "llvm/Target/TargetMachine.h"
49	#include "llvm/TargetParser/TargetParser.h"
50
51	using namespace llvm;
52	using namespace llvm::AMDGPU;
53
54	// This should get the default rounding mode from the kernel. We just set the
55	// default here, but this could change if the OpenCL rounding mode pragmas are
56	// used.
57	//
58	// The denormal mode here should match what is reported by the OpenCL runtime
59	// for the CL_FP_DENORM bit from CL_DEVICE_{HALF\|SINGLE\|DOUBLE}_FP_CONFIG, but
60	// can also be override to flush with the -cl-denorms-are-zero compiler flag.
61	//
62	// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
63	// precision, and leaves single precision to flush all and does not report
64	// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
65	// CL_FP_DENORM for both.
66	//
67	// FIXME: It seems some instructions do not support single precision denormals
68	// regardless of the mode (exp__f32, rcp__f32, rsq__f32, rsq_f32, sqrt_f32,
69	// and sin_f32, cos_f32 on most parts).
70
71	// We want to use these instructions, and using fp32 denormals also causes
72	// instructions to run at the double precision rate for the device so it's
73	// probably best to just report no single precision denormals.
74	static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
75	return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) \|
76	FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) \|
77	FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) \|
78	FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
79	}
80
81	static AsmPrinter *
82	createAMDGPUAsmPrinterPass(TargetMachine &tm,
83	std::unique_ptr<MCStreamer> &&Streamer) {
84	return new AMDGPUAsmPrinter (tm, std::move(Streamer));
85	}
86
87	extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
88	LLVMInitializeAMDGPUAsmPrinter() {
89	TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(),
90	Fn: llvm::createR600AsmPrinterPass);
91	TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(),
92	Fn: createAMDGPUAsmPrinterPass);
93	}
94
95	AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
96	std::unique_ptr<MCStreamer> Streamer)
97	: AsmPrinter (TM, std::move(Streamer)) {
98	assert(OutStreamer && "AsmPrinter constructed without streamer");
99	}
100
101	StringRef AMDGPUAsmPrinter::getPassName() const {
102	return "AMDGPU Assembly Printer";
103	}
104
105	const MCSubtargetInfo AMDGPUAsmPrinter::getGlobalSTI() const* {
106	return TM.getMCSubtargetInfo();
107	}
108
109	AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
110	if (!OutStreamer)
111	return nullptr;
112	return static_cast<AMDGPUTargetStreamer*>(OutStreamer ->getTargetStreamer());
113	}
114
115	void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
116	IsTargetStreamerInitialized = false;
117	}
118
119	void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
120	IsTargetStreamerInitialized = true;
121
122	// TODO: Which one is called first, emitStartOfAsmFile or
123	// emitFunctionBodyStart?
124	if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
125	initializeTargetID(M);
126
127	if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
128	TM.getTargetTriple().getOS() != Triple::AMDPAL)
129	return;
130
131	getTargetStreamer()->EmitDirectiveAMDGCNTarget();
132
133	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
134	getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
135	COV: CodeObjectVersion);
136	HSAMetadataStream ->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID());
137	}
138
139	if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
140	getTargetStreamer()->getPALMetadata()->readFromIR(M);
141	}
142
143	void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
144	// Init target streamer if it has not yet happened
145	if (!IsTargetStreamerInitialized)
146	initTargetStreamer(M);
147
148	if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
149	getTargetStreamer()->EmitISAVersion();
150
151	// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
152	// Emit HSA Metadata (NT_AMD_HSA_METADATA).
153	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
154	HSAMetadataStream ->end();
155	bool Success = HSAMetadataStream ->emitTo(TargetStreamer&: *getTargetStreamer());
156	(void)Success;
157	assert(Success && "Malformed HSA Metadata");
158	}
159	}
160
161	void AMDGPUAsmPrinter::emitFunctionBodyStart() {
162	const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
163	const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
164	const Function &F = MF->getFunction();
165
166	// TODO: We're checking this late, would be nice to check it earlier.
167	if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
168	reportFatalUsageError(
169	reason: STM.getCPU() + " is only available on code object version 6 or better");
170	}
171
172	// TODO: Which one is called first, emitStartOfAsmFile or
173	// emitFunctionBodyStart?
174	if (!getTargetStreamer()->getTargetID())
175	initializeTargetID(M: *F.getParent());
176
177	const auto &FunctionTargetID = STM.getTargetID();
178	// Make sure function's xnack settings are compatible with module's
179	// xnack settings.
180	if (FunctionTargetID.isXnackSupported() &&
181	FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
182	FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
183	OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine (MF->getName()) +
184	"' function does not match module xnack setting");
185	return;
186	}
187	// Make sure function's sramecc settings are compatible with module's
188	// sramecc settings.
189	if (FunctionTargetID.isSramEccSupported() &&
190	FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
191	FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
192	OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine (MF->getName()) +
193	"' function does not match module sramecc setting");
194	return;
195	}
196
197	if (!MFI.isEntryFunction())
198	return;
199
200	if (STM.isMesaKernel(F) &&
201	(F.getCallingConv() == CallingConv::AMDGPU_KERNEL \|\|
202	F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
203	AMDGPUMCKernelCodeT KernelCode;
204	getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF);
205	KernelCode.validate(STI: &STM, Ctx&: MF->getContext());
206	getTargetStreamer()->EmitAMDKernelCodeT(Header&: KernelCode);
207	}
208
209	if (STM.isAmdHsaOS())
210	HSAMetadataStream ->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo);
211	}
212
213	void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
214	const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
215	if (!MFI.isEntryFunction())
216	return;
217
218	if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
219	return;
220
221	auto &Streamer = getTargetStreamer()->getStreamer();
222	auto &Context = Streamer.getContext();
223	auto &ObjectFileInfo = *Context.getObjectFileInfo();
224	auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
225
226	Streamer.pushSection();
227	Streamer.switchSection(Section: &ReadOnlySection);
228
229	// CP microcode requires the kernel descriptor to be allocated on 64 byte
230	// alignment.
231	Streamer.emitValueToAlignment(Alignment: Align (`64`), Value: `0`, ValueSize: `1`, MaxBytesToEmit: `0`);
232	ReadOnlySection.ensureMinAlignment(MinAlignment: Align (`64`));
233
234	const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
235
236	SmallString<`128`> KernelName;
237	getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction());
238	getTargetStreamer()->EmitAmdhsaKernelDescriptor(
239	STI: STM, KernelName, KernelDescriptor: getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo),
240	NextVGPR: CurrentProgramInfo.NumVGPRsForWavesPerEU,
241	NextSGPR: MCBinaryExpr::createSub(
242	LHS: CurrentProgramInfo.NumSGPRsForWavesPerEU,
243	RHS: AMDGPUMCExpr::createExtraSGPRs(
244	VCCUsed: CurrentProgramInfo.VCCUsed, FlatScrUsed: CurrentProgramInfo.FlatUsed,
245	XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx&: Context),
246	Ctx&: Context),
247	ReserveVCC: CurrentProgramInfo.VCCUsed, ReserveFlatScr: CurrentProgramInfo.FlatUsed);
248
249	Streamer.popSection();
250	}
251
252	void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr MI) const* {
253	Register RegNo = MI->getOperand(i: `0`).getReg();
254
255	SmallString<`128`> Str;
256	raw_svector_ostream OS(Str);
257	OS << "implicit-def: "
258	<< printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo());
259
260	if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
261	OS << " : SGPR spill to VGPR lane";
262
263	OutStreamer ->AddComment(T: OS.str());
264	OutStreamer ->addBlankLine();
265	}
266
267	void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
268	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
269	AsmPrinter::emitFunctionEntryLabel();
270	return;
271	}
272
273	const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
274	const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
275	if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(F: MF->getFunction())) {
276	SmallString<`128`> SymbolName;
277	getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()),
278	getTargetStreamer()->EmitAMDGPUSymbolType(
279	SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL);
280	}
281	if (DumpCodeInstEmitter) {
282	// Disassemble function name label to text.
283	DisasmLines.push_back(x: MF->getName().str() + ":");
284	DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
285	HexLines.emplace_back(args: "");
286	}
287
288	AsmPrinter::emitFunctionEntryLabel();
289	}
290
291	void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
292	if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) {
293	// Write a line for the basic block label if it is not only fallthrough.
294	DisasmLines.push_back(
295	x: (Twine ("BB") + Twine (getFunctionNumber())
296	+ "_" + Twine (MBB.getNumber()) + ":").str());
297	DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
298	HexLines.emplace_back(args: "");
299	}
300	AsmPrinter::emitBasicBlockStart(MBB);
301	}
302
303	void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
304	if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
305	if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) {
306	OutContext.reportError(L: {},
307	Msg: Twine (GV->getName()) +
308	": unsupported initializer for address space");
309	return;
310	}
311
312	// LDS variables aren't emitted in HSA or PAL yet.
313	const Triple::OSType OS = TM.getTargetTriple().getOS();
314	if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
315	return;
316
317	MCSymbol *GVSym = getSymbol(GV);
318
319	GVSym->redefineIfPossible();
320	if (GVSym->isDefined() \|\| GVSym->isVariable())
321	report_fatal_error(reason: "symbol '" + Twine (GVSym->getName()) +
322	"' is already defined");
323
324	const DataLayout &DL = GV->getDataLayout();
325	uint64_t Size = DL.getTypeAllocSize(Ty: GV->getValueType());
326	Align Alignment = GV->getAlign().value_or(u: Align (`4`));
327
328	emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration());
329	emitLinkage(GV, GVSym);
330	auto *TS = getTargetStreamer();
331	TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment);
332	return;
333	}
334
335	AsmPrinter::emitGlobalVariable(GV);
336	}
337
338	bool AMDGPUAsmPrinter::doInitialization(Module &M) {
339	CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
340
341	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
342	switch (CodeObjectVersion) {
343	case AMDGPU::AMDHSA_COV4:
344	HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
345	break;
346	case AMDGPU::AMDHSA_COV5:
347	HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
348	break;
349	case AMDGPU::AMDHSA_COV6:
350	HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
351	break;
352	default:
353	reportFatalUsageError(reason: "unsupported code object version");
354	}
355	}
356
357	return AsmPrinter::doInitialization(M);
358	}
359
360	void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
361	if (F.isDeclaration() \|\| !AMDGPU::isModuleEntryFunctionCC(CC: F.getCallingConv()))
362	return;
363
364	using RIK = MCResourceInfo::ResourceInfoKind;
365	const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
366	MCSymbol *FnSym = TM.getSymbol(GV: &F);
367	bool IsLocal = F.hasLocalLinkage();
368
369	auto TryGetMCExprValue = [](const MCExpr Value, uint64_t &Res) -> bool* {
370	int64_t Val;
371	if (Value->evaluateAsAbsolute(Res&: Val)) {
372	Res = Val;
373	return true;
374	}
375	return false;
376	};
377
378	const uint64_t MaxScratchPerWorkitem =
379	STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
380	MCSymbol *ScratchSizeSymbol = RI.getSymbol(
381	FuncName: FnSym->getName(), RIK: RIK::RIK_PrivateSegSize, OutContext, IsLocal);
382	uint64_t ScratchSize;
383	if (ScratchSizeSymbol->isVariable() &&
384	TryGetMCExprValue (ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
385	ScratchSize > MaxScratchPerWorkitem) {
386	DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
387	DS_Error);
388	F.getContext().diagnose(DI: DiagStackSize);
389	}
390
391	// Validate addressable scalar registers (i.e., prior to added implicit
392	// SGPRs).
393	MCSymbol *NumSGPRSymbol =
394	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext, IsLocal);
395	if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
396	!STM.hasSGPRInitBug()) {
397	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
398	uint64_t NumSgpr;
399	if (NumSGPRSymbol->isVariable() &&
400	TryGetMCExprValue (NumSGPRSymbol->getVariableValue(), NumSgpr) &&
401	NumSgpr > MaxAddressableNumSGPRs) {
402	DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
403	NumSgpr, MaxAddressableNumSGPRs,
404	DS_Error, DK_ResourceLimit);
405	F.getContext().diagnose(DI: Diag);
406	return;
407	}
408	}
409
410	MCSymbol *VCCUsedSymbol =
411	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext, IsLocal);
412	MCSymbol *FlatUsedSymbol = RI.getSymbol(
413	FuncName: FnSym->getName(), RIK: RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
414	uint64_t VCCUsed, FlatUsed, NumSgpr;
415
416	if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
417	FlatUsedSymbol->isVariable() &&
418	TryGetMCExprValue (NumSGPRSymbol->getVariableValue(), NumSgpr) &&
419	TryGetMCExprValue (VCCUsedSymbol->getVariableValue(), VCCUsed) &&
420	TryGetMCExprValue (FlatUsedSymbol->getVariableValue(), FlatUsed)) {
421
422	// Recomputes NumSgprs + implicit SGPRs but all symbols should now be
423	// resolvable.
424	NumSgpr += IsaInfo::getNumExtraSGPRs(
425	STI: &STM, VCCUsed, FlatScrUsed: FlatUsed,
426	XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny());
427	if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS \|\|
428	STM.hasSGPRInitBug()) {
429	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
430	if (NumSgpr > MaxAddressableNumSGPRs) {
431	DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
432	MaxAddressableNumSGPRs, DS_Error,
433	DK_ResourceLimit);
434	F.getContext().diagnose(DI: Diag);
435	return;
436	}
437	}
438
439	MCSymbol *NumVgprSymbol =
440	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext, IsLocal);
441	MCSymbol *NumAgprSymbol =
442	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext, IsLocal);
443	uint64_t NumVgpr, NumAgpr;
444
445	MachineModuleInfo &MMI =
446	getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
447	MachineFunction *MF = MMI.getMachineFunction(F);
448	if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
449	TryGetMCExprValue (NumVgprSymbol->getVariableValue(), NumVgpr) &&
450	TryGetMCExprValue (NumAgprSymbol->getVariableValue(), NumAgpr)) {
451	const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
452	unsigned MaxWaves = MFI.getMaxWavesPerEU();
453	uint64_t TotalNumVgpr =
454	getTotalNumVGPRs(has90AInsts: STM.hasGFX90AInsts(), ArgNumAGPR: NumAgpr, ArgNumVGPR: NumVgpr);
455	uint64_t NumVGPRsForWavesPerEU =
456	std::max(l: {TotalNumVgpr, (uint64_t)`1`,
457	(uint64_t)STM.getMinNumVGPRs(
458	WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize())});
459	uint64_t NumSGPRsForWavesPerEU = std::max(
460	l: {NumSgpr, (uint64_t)`1`, (uint64_t)STM.getMinNumSGPRs(WavesPerEU: MaxWaves)});
461	const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
462	InitOcc: STM.getOccupancyWithWorkGroupSizes(MF: *MF).second,
463	NumSGPRs: MCConstantExpr::create(Value: NumSGPRsForWavesPerEU, Ctx&: OutContext),
464	NumVGPRs: MCConstantExpr::create(Value: NumVGPRsForWavesPerEU, Ctx&: OutContext),
465	DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize(), STM, Ctx&: OutContext);
466	uint64_t Occupancy;
467
468	const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
469	F, Name: "amdgpu-waves-per-eu", Default: {`0`, `0`}, OnlyFirstRequired: true);
470
471	if (TryGetMCExprValue (OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
472	DiagnosticInfoOptimizationFailure Diag(
473	F, F.getSubprogram(),
474	"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
475	"'" +
476	F.getName() + "': desired occupancy was " + Twine (MinWEU) +
477	", final occupancy is " + Twine (Occupancy));
478	F.getContext().diagnose(DI: Diag);
479	return;
480	}
481	}
482	}
483	}
484
485	bool AMDGPUAsmPrinter::doFinalization(Module &M) {
486	// Pad with s_code_end to help tools and guard against instruction prefetch
487	// causing stale data in caches. Arguably this should be done by the linker,
488	// which is why this isn't done for Mesa.
489	const MCSubtargetInfo &STI = *getGlobalSTI();
490	if ((AMDGPU::isGFX10Plus(STI) \|\| AMDGPU::isGFX90A(STI)) &&
491	(STI.getTargetTriple().getOS() == Triple::AMDHSA \|\|
492	STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
493	OutStreamer ->switchSection(Section: getObjFileLowering().getTextSection());
494	getTargetStreamer()->EmitCodeEnd(STI);
495	}
496
497	// Assign expressions which can only be resolved when all other functions are
498	// known.
499	RI.finalize(OutContext);
500
501	// Switch section and emit all GPR maximums within the processed module.
502	OutStreamer ->pushSection();
503	MCSectionELF *MaxGPRSection =
504	OutContext.getELFSection(Section: ".AMDGPU.gpr_maximums", Type: ELF::SHT_PROGBITS, Flags: `0`);
505	OutStreamer ->switchSection(Section: MaxGPRSection);
506	getTargetStreamer()->EmitMCResourceMaximums(MaxVGPR: RI.getMaxVGPRSymbol(OutContext),
507	MaxAGPR: RI.getMaxAGPRSymbol(OutContext),
508	MaxSGPR: RI.getMaxSGPRSymbol(OutContext));
509	OutStreamer ->popSection();
510
511	for (Function &F : M.functions())
512	validateMCResourceInfo(F);
513
514	RI.reset();
515
516	return AsmPrinter::doFinalization(M);
517	}
518
519	SmallString<`128`> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
520	SmallString<`128`> Str;
521	raw_svector_ostream OSS(Str);
522	auto &Streamer = getTargetStreamer()->getStreamer();
523	auto &Context = Streamer.getContext();
524	const MCExpr *New = foldAMDGPUMCExpr(Expr: Value, Ctx&: Context);
525	printAMDGPUMCExpr(Expr: New, OS&: OSS, MAI);
526	return Str;
527	}
528
529	// Print comments that apply to both callable functions and entry points.
530	void AMDGPUAsmPrinter::emitCommonFunctionComments(
531	const MCExpr NumVGPR, const* MCExpr NumAGPR, const* MCExpr *TotalNumVGPR,
532	const MCExpr NumSGPR, const* MCExpr *ScratchSize, uint64_t CodeSize,
533	const AMDGPUMachineFunction *MFI) {
534	OutStreamer ->emitRawComment(T: " codeLenInByte = " + Twine (CodeSize), TabPrefix: false);
535	OutStreamer ->emitRawComment(T: " TotalNumSgprs: " + getMCExprStr(Value: NumSGPR),
536	TabPrefix: false);
537	OutStreamer ->emitRawComment(T: " NumVgprs: " + getMCExprStr(Value: NumVGPR), TabPrefix: false);
538	if (NumAGPR && TotalNumVGPR) {
539	OutStreamer ->emitRawComment(T: " NumAgprs: " + getMCExprStr(Value: NumAGPR), TabPrefix: false);
540	OutStreamer ->emitRawComment(T: " TotalNumVgprs: " + getMCExprStr(Value: TotalNumVGPR),
541	TabPrefix: false);
542	}
543	OutStreamer ->emitRawComment(T: " ScratchSize: " + getMCExprStr(Value: ScratchSize),
544	TabPrefix: false);
545	OutStreamer ->emitRawComment(T: " MemoryBound: " + Twine (MFI->isMemoryBound()),
546	TabPrefix: false);
547	}
548
549	const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
550	const MachineFunction &MF) const {
551	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
552	MCContext &Ctx = MF.getContext();
553	uint16_t KernelCodeProperties = `0`;
554	const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
555
556	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
557	KernelCodeProperties \|=
558	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
559	}
560	if (UserSGPRInfo.hasDispatchPtr()) {
561	KernelCodeProperties \|=
562	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
563	}
564	if (UserSGPRInfo.hasQueuePtr()) {
565	KernelCodeProperties \|=
566	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
567	}
568	if (UserSGPRInfo.hasKernargSegmentPtr()) {
569	KernelCodeProperties \|=
570	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
571	}
572	if (UserSGPRInfo.hasDispatchID()) {
573	KernelCodeProperties \|=
574	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
575	}
576	if (UserSGPRInfo.hasFlatScratchInit()) {
577	KernelCodeProperties \|=
578	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
579	}
580	if (UserSGPRInfo.hasPrivateSegmentSize()) {
581	KernelCodeProperties \|=
582	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
583	}
584	if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
585	KernelCodeProperties \|=
586	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
587	}
588
589	// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
590	// un-evaluatable at this point so it cannot be conditionally checked here.
591	// Instead, we'll directly shift the possibly unknown MCExpr into its place
592	// and bitwise-or it into KernelCodeProperties.
593	const MCExpr *KernelCodePropExpr =
594	MCConstantExpr::create(Value: KernelCodeProperties, Ctx);
595	const MCExpr *OrValue = MCConstantExpr::create(
596	Value: amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
597	OrValue = MCBinaryExpr::createShl(LHS: CurrentProgramInfo.DynamicCallStack,
598	RHS: OrValue, Ctx);
599	KernelCodePropExpr = MCBinaryExpr::createOr(LHS: KernelCodePropExpr, RHS: OrValue, Ctx);
600
601	return KernelCodePropExpr;
602	}
603
604	MCKernelDescriptor
605	AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
606	const SIProgramInfo &PI) const {
607	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
608	const Function &F = MF.getFunction();
609	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
610	MCContext &Ctx = MF.getContext();
611
612	MCKernelDescriptor KernelDescriptor;
613
614	KernelDescriptor.group_segment_fixed_size =
615	MCConstantExpr::create(Value: PI.LDSSize, Ctx);
616	KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
617
618	Align MaxKernArgAlign;
619	KernelDescriptor.kernarg_size = MCConstantExpr::create(
620	Value: STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign), Ctx);
621
622	KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(ST: STM, Ctx);
623	KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
624	KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
625
626	int64_t PGRM_Rsrc3 = `1`;
627	bool EvaluatableRsrc3 =
628	CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(Res&: PGRM_Rsrc3);
629	(void)PGRM_Rsrc3;
630	(void)EvaluatableRsrc3;
631	assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 \|\|
632	STM.hasGFX90AInsts() \|\| !EvaluatableRsrc3 \|\|
633	static_cast<uint64_t>(PGRM_Rsrc3) == `0`);
634	KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
635
636	KernelDescriptor.kernarg_preload = MCConstantExpr::create(
637	Value: AMDGPU::hasKernargPreload(STI: STM) ? Info->getNumKernargPreloadedSGPRs() : `0`,
638	Ctx);
639
640	return KernelDescriptor;
641	}
642
643	bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
644	// Init target streamer lazily on the first function so that previous passes
645	// can set metadata.
646	if (!IsTargetStreamerInitialized)
647	initTargetStreamer(M&: *MF.getFunction().getParent());
648
649	ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
650	CurrentProgramInfo.reset(MF);
651
652	const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
653	MCContext &Ctx = MF.getContext();
654
655	// The starting address of all shader programs must be 256 bytes aligned.
656	// Regular functions just need the basic required instruction alignment.
657	MF.setAlignment(MFI->isEntryFunction() ? Align (`256`) : Align (`4`));
658
659	SetupMachineFunction(MF);
660
661	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
662	MCContext &Context = getObjFileLowering().getContext();
663	bool IsLocal = MF.getFunction().hasLocalLinkage();
664	// FIXME: This should be an explicit check for Mesa.
665	if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
666	MCSectionELF *ConfigSection =
667	Context.getELFSection(Section: ".AMDGPU.config", Type: ELF::SHT_PROGBITS, Flags: `0`);
668	OutStreamer ->switchSection(Section: ConfigSection);
669	}
670
671	const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
672	ResourceUsage->getResourceInfo();
673	RI.gatherResourceInfo(MF, FRI: Info, OutContext);
674
675	if (MFI->isModuleEntryFunction()) {
676	getSIProgramInfo(Out&: CurrentProgramInfo, MF);
677	}
678
679	if (STM.isAmdPalOS()) {
680	if (MFI->isEntryFunction())
681	EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo);
682	else if (MFI->isModuleEntryFunction())
683	emitPALFunctionMetadata(MF);
684	} else if (!STM.isAmdHsaOS()) {
685	EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo);
686	}
687
688	DumpCodeInstEmitter = nullptr;
689	if (STM.dumpCode()) {
690	// For -dumpcode, get the assembler out of the streamer. This only works
691	// with -filetype=obj.
692	MCAssembler *Assembler = OutStreamer ->getAssemblerPtr();
693	if (Assembler)
694	DumpCodeInstEmitter = Assembler->getEmitterPtr();
695	}
696
697	DisasmLines.clear();
698	HexLines.clear();
699	DisasmLineMaxLen = `0`;
700
701	emitFunctionBody();
702
703	emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(),
704	hasMAIInsts: STM.hasMAIInsts());
705
706	{
707	using RIK = MCResourceInfo::ResourceInfoKind;
708	getTargetStreamer()->EmitMCResourceInfo(
709	NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext,
710	IsLocal),
711	NumAGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext,
712	IsLocal),
713	NumExplicitSGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext,
714	IsLocal),
715	PrivateSegmentSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
716	OutContext, IsLocal),
717	UsesVCC: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext,
718	IsLocal),
719	UsesFlatScratch: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesFlatScratch,
720	OutContext, IsLocal),
721	HasDynamicallySizedStack: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasDynSizedStack,
722	OutContext, IsLocal),
723	HasRecursion: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasRecursion, OutContext,
724	IsLocal),
725	HasIndirectCall: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasIndirectCall,
726	OutContext, IsLocal));
727	}
728
729	if (isVerbose()) {
730	MCSectionELF *CommentSection =
731	Context.getELFSection(Section: ".AMDGPU.csdata", Type: ELF::SHT_PROGBITS, Flags: `0`);
732	OutStreamer ->switchSection(Section: CommentSection);
733
734	if (!MFI->isEntryFunction()) {
735	using RIK = MCResourceInfo::ResourceInfoKind;
736	OutStreamer ->emitRawComment(T: " Function info:", TabPrefix: false);
737
738	emitCommonFunctionComments(
739	NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext,
740	IsLocal)
741	->getVariableValue(),
742	NumAGPR: STM.hasMAIInsts()
743	? RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR,
744	OutContext, IsLocal)
745	->getVariableValue()
746	: nullptr,
747	TotalNumVGPR: RI.createTotalNumVGPRs(MF, Ctx),
748	NumSGPR: RI.createTotalNumSGPRs(
749	MF,
750	hasXnack: MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
751	Ctx),
752	ScratchSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
753	OutContext, IsLocal)
754	->getVariableValue(),
755	CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
756	return false;
757	}
758
759	OutStreamer ->emitRawComment(T: " Kernel info:", TabPrefix: false);
760	emitCommonFunctionComments(
761	NumVGPR: CurrentProgramInfo.NumArchVGPR,
762	NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
763	TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR,
764	ScratchSize: CurrentProgramInfo.ScratchSize,
765	CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
766
767	OutStreamer ->emitRawComment(
768	T: " FloatMode: " + Twine (CurrentProgramInfo.FloatMode), TabPrefix: false);
769	OutStreamer ->emitRawComment(
770	T: " IeeeMode: " + Twine (CurrentProgramInfo.IEEEMode), TabPrefix: false);
771	OutStreamer ->emitRawComment(
772	T: " LDSByteSize: " + Twine (CurrentProgramInfo.LDSSize) +
773	" bytes/workgroup (compile time only)", TabPrefix: false);
774
775	OutStreamer ->emitRawComment(
776	T: " SGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.SGPRBlocks), TabPrefix: false);
777
778	OutStreamer ->emitRawComment(
779	T: " VGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.VGPRBlocks), TabPrefix: false);
780
781	OutStreamer ->emitRawComment(
782	T: " NumSGPRsForWavesPerEU: " +
783	getMCExprStr(Value: CurrentProgramInfo.NumSGPRsForWavesPerEU),
784	TabPrefix: false);
785	OutStreamer ->emitRawComment(
786	T: " NumVGPRsForWavesPerEU: " +
787	getMCExprStr(Value: CurrentProgramInfo.NumVGPRsForWavesPerEU),
788	TabPrefix: false);
789
790	if (STM.hasGFX90AInsts()) {
791	const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
792	LHS: CurrentProgramInfo.AccumOffset, RHS: MCConstantExpr::create(Value: `1`, Ctx), Ctx);
793	AdjustedAccum = MCBinaryExpr::createMul(
794	LHS: AdjustedAccum, RHS: MCConstantExpr::create(Value: `4`, Ctx), Ctx);
795	OutStreamer ->emitRawComment(
796	T: " AccumOffset: " + getMCExprStr(Value: AdjustedAccum), TabPrefix: false);
797	}
798
799	OutStreamer ->emitRawComment(
800	T: " Occupancy: " + getMCExprStr(Value: CurrentProgramInfo.Occupancy), TabPrefix: false);
801
802	OutStreamer ->emitRawComment(
803	T: " WaveLimiterHint : " + Twine (MFI->needsWaveLimiter()), TabPrefix: false);
804
805	OutStreamer ->emitRawComment(
806	T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
807	getMCExprStr(Value: CurrentProgramInfo.ScratchEnable),
808	TabPrefix: false);
809	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " +
810	Twine (CurrentProgramInfo.UserSGPR),
811	TabPrefix: false);
812	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
813	Twine (CurrentProgramInfo.TrapHandlerEnable),
814	TabPrefix: false);
815	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
816	Twine (CurrentProgramInfo.TGIdXEnable),
817	TabPrefix: false);
818	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
819	Twine (CurrentProgramInfo.TGIdYEnable),
820	TabPrefix: false);
821	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
822	Twine (CurrentProgramInfo.TGIdZEnable),
823	TabPrefix: false);
824	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
825	Twine (CurrentProgramInfo.TIdIGCompCount),
826	TabPrefix: false);
827
828	[[maybe_unused]] int64_t PGMRSrc3;
829	assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 \|\|
830	STM.hasGFX90AInsts() \|\|
831	(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
832	static_cast<uint64_t>(PGMRSrc3) == `0`));
833	if (STM.hasGFX90AInsts()) {
834	OutStreamer ->emitRawComment(
835	T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
836	getMCExprStr(Value: MCKernelDescriptor::bits_get(
837	Src: CurrentProgramInfo.ComputePGMRSrc3,
838	Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
839	Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
840	TabPrefix: false);
841	OutStreamer ->emitRawComment(
842	T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
843	getMCExprStr(Value: MCKernelDescriptor::bits_get(
844	Src: CurrentProgramInfo.ComputePGMRSrc3,
845	Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
846	Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
847	TabPrefix: false);
848	}
849	}
850
851	if (DumpCodeInstEmitter) {
852
853	OutStreamer ->switchSection(
854	Section: Context.getELFSection(Section: ".AMDGPU.disasm", Type: ELF::SHT_PROGBITS, Flags: `0`));
855
856	for (size_t i = `0`; i < DisasmLines.size(); ++i) {
857	std::string Comment = "\n";
858	if (!HexLines [i].empty()) {
859	Comment = std::string (DisasmLineMaxLen - DisasmLines [i].size(), `' '`);
860	Comment += " ; " + HexLines [i] + "\n";
861	}
862
863	OutStreamer ->emitBytes(Data: StringRef (DisasmLines [i]));
864	OutStreamer ->emitBytes(Data: StringRef (Comment));
865	}
866	}
867
868	return false;
869	}
870
871	// TODO: Fold this into emitFunctionBodyStart.
872	void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
873	// In the beginning all features are either 'Any' or 'NotSupported',
874	// depending on global target features. This will cover empty modules.
875	getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(),
876	FeatureString: getGlobalSTI()->getFeatureString());
877
878	// If module is empty, we are done.
879	if (M.empty())
880	return;
881
882	// If module is not empty, need to find first 'Off' or 'On' feature
883	// setting per feature from functions in module.
884	for (auto &F : M) {
885	auto &TSTargetID = getTargetStreamer()->getTargetID();
886	if ((!TSTargetID ->isXnackSupported() \|\| TSTargetID ->isXnackOnOrOff()) &&
887	(!TSTargetID ->isSramEccSupported() \|\| TSTargetID ->isSramEccOnOrOff()))
888	break;
889
890	const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
891	const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
892	if (TSTargetID ->isXnackSupported())
893	if (TSTargetID ->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
894	TSTargetID ->setXnackSetting(STMTargetID.getXnackSetting());
895	if (TSTargetID ->isSramEccSupported())
896	if (TSTargetID ->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
897	TSTargetID ->setSramEccSetting(STMTargetID.getSramEccSetting());
898	}
899	}
900
901	// AccumOffset computed for the MCExpr equivalent of:
902	// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
903	static const MCExpr computeAccumOffset(const* MCExpr *NumVGPR, MCContext &Ctx) {
904	const MCExpr *ConstFour = MCConstantExpr::create(Value: `4`, Ctx);
905	const MCExpr *ConstOne = MCConstantExpr::create(Value: `1`, Ctx);
906
907	// Can't be lower than 1 for subsequent alignTo.
908	const MCExpr *MaximumTaken =
909	AMDGPUMCExpr::createMax(Args: {ConstOne, NumVGPR}, Ctx);
910
911	// Practically, it's computing divideCeil(MaximumTaken, 4).
912	const MCExpr *DivCeil = MCBinaryExpr::createDiv(
913	LHS: AMDGPUMCExpr::createAlignTo(Value: MaximumTaken, Align: ConstFour, Ctx), RHS: ConstFour,
914	Ctx);
915
916	return MCBinaryExpr::createSub(LHS: DivCeil, RHS: ConstOne, Ctx);
917	}
918
919	void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
920	const MachineFunction &MF) {
921	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
922	bool IsLocal = MF.getFunction().hasLocalLinkage();
923	MCContext &Ctx = MF.getContext();
924
925	auto CreateExpr = [&Ctx](int64_t Value) {
926	return MCConstantExpr::create(Value, Ctx);
927	};
928
929	auto TryGetMCExprValue = [](const MCExpr Value, uint64_t &Res) -> bool* {
930	int64_t Val;
931	if (Value->evaluateAsAbsolute(Res&: Val)) {
932	Res = Val;
933	return true;
934	}
935	return false;
936	};
937
938	auto GetSymRefExpr =
939	[&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
940	MCSymbol *Sym =
941	RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK, OutContext, IsLocal);
942	return MCSymbolRefExpr::create(Symbol: Sym, Ctx);
943	};
944
945	using RIK = MCResourceInfo::ResourceInfoKind;
946	ProgInfo.NumArchVGPR = GetSymRefExpr (RIK::RIK_NumVGPR);
947	ProgInfo.NumAccVGPR = GetSymRefExpr (RIK::RIK_NumAGPR);
948	ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
949	NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
950
951	ProgInfo.AccumOffset = computeAccumOffset(NumVGPR: ProgInfo.NumArchVGPR, Ctx);
952	ProgInfo.TgSplit = STM.isTgSplitEnabled();
953	ProgInfo.NumSGPR = GetSymRefExpr (RIK::RIK_NumSGPR);
954	ProgInfo.ScratchSize = GetSymRefExpr (RIK::RIK_PrivateSegSize);
955	ProgInfo.VCCUsed = GetSymRefExpr (RIK::RIK_UsesVCC);
956	ProgInfo.FlatUsed = GetSymRefExpr (RIK::RIK_UsesFlatScratch);
957	ProgInfo.DynamicCallStack =
958	MCBinaryExpr::createOr(LHS: GetSymRefExpr (RIK::RIK_HasDynSizedStack),
959	RHS: GetSymRefExpr (RIK::RIK_HasRecursion), Ctx);
960
961	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
962
963	// The calculations related to SGPR/VGPR blocks are
964	// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
965	// unified.
966	const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
967	VCCUsed: ProgInfo.VCCUsed, FlatScrUsed: ProgInfo.FlatUsed,
968	XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
969
970	// Check the addressable register limit before we add ExtraSGPRs.
971	if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
972	!STM.hasSGPRInitBug()) {
973	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
974	uint64_t NumSgpr;
975	if (TryGetMCExprValue (ProgInfo.NumSGPR, NumSgpr) &&
976	NumSgpr > MaxAddressableNumSGPRs) {
977	// This can happen due to a compiler bug or when using inline asm.
978	LLVMContext &Ctx = MF.getFunction().getContext();
979	DiagnosticInfoResourceLimit Diag(
980	MF.getFunction(), "addressable scalar registers", NumSgpr,
981	MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
982	Ctx.diagnose(DI: Diag);
983	ProgInfo.NumSGPR = CreateExpr (MaxAddressableNumSGPRs - `1`);
984	}
985	}
986
987	// Account for extra SGPRs and VGPRs reserved for debugger use.
988	ProgInfo.NumSGPR = MCBinaryExpr::createAdd(LHS: ProgInfo.NumSGPR, RHS: ExtraSGPRs, Ctx);
989
990	const Function &F = MF.getFunction();
991
992	// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
993	// dispatch registers are function args.
994	unsigned WaveDispatchNumSGPR = `0`, WaveDispatchNumVGPR = `0`;
995
996	if (isShader(CC: F.getCallingConv())) {
997	bool IsPixelShader =
998	F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
999
1000	// Calculate the number of VGPR registers based on the SPI input registers
1001	uint32_t InputEna = `0`;
1002	uint32_t InputAddr = `0`;
1003	unsigned LastEna = `0`;
1004
1005	if (IsPixelShader) {
1006	// Note for IsPixelShader:
1007	// By this stage, all enabled inputs are tagged in InputAddr as well.
1008	// We will use InputAddr to determine whether the input counts against the
1009	// vgpr total and only use the InputEnable to determine the last input
1010	// that is relevant - if extra arguments are used, then we have to honour
1011	// the InputAddr for any intermediate non-enabled inputs.
1012	InputEna = MFI->getPSInputEnable();
1013	InputAddr = MFI->getPSInputAddr();
1014
1015	// We only need to consider input args up to the last used arg.
1016	assert((InputEna \|\| InputAddr) &&
1017	"PSInputAddr and PSInputEnable should "
1018	"never both be 0 for AMDGPU_PS shaders");
1019	// There are some rare circumstances where InputAddr is non-zero and
1020	// InputEna can be set to 0. In this case we default to setting LastEna
1021	// to 1.
1022	LastEna = InputEna ? llvm::Log2_32(Value: InputEna) + `1` : `1`;
1023	}
1024
1025	// FIXME: We should be using the number of registers determined during
1026	// calling convention lowering to legalize the types.
1027	const DataLayout &DL = F.getDataLayout();
1028	unsigned PSArgCount = `0`;
1029	unsigned IntermediateVGPR = `0`;
1030	for (auto &Arg : F.args()) {
1031	unsigned NumRegs = (DL.getTypeSizeInBits(Ty: Arg.getType()) + `31`) / `32`;
1032	if (Arg.hasAttribute(Kind: Attribute::InReg)) {
1033	WaveDispatchNumSGPR += NumRegs;
1034	} else {
1035	// If this is a PS shader and we're processing the PS Input args (first
1036	// 16 VGPR), use the InputEna and InputAddr bits to define how many
1037	// VGPRs are actually used.
1038	// Any extra VGPR arguments are handled as normal arguments (and
1039	// contribute to the VGPR count whether they're used or not).
1040	if (IsPixelShader && PSArgCount < `16`) {
1041	if ((`1` << PSArgCount) & InputAddr) {
1042	if (PSArgCount < LastEna)
1043	WaveDispatchNumVGPR += NumRegs;
1044	else
1045	IntermediateVGPR += NumRegs;
1046	}
1047	PSArgCount++;
1048	} else {
1049	// If there are extra arguments we have to include the allocation for
1050	// the non-used (but enabled with InputAddr) input arguments
1051	if (IntermediateVGPR) {
1052	WaveDispatchNumVGPR += IntermediateVGPR;
1053	IntermediateVGPR = `0`;
1054	}
1055	WaveDispatchNumVGPR += NumRegs;
1056	}
1057	}
1058	}
1059	ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
1060	Args: {ProgInfo.NumSGPR, CreateExpr (WaveDispatchNumSGPR)}, Ctx);
1061
1062	ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
1063	Args: {ProgInfo.NumVGPR, CreateExpr (WaveDispatchNumVGPR)}, Ctx);
1064
1065	ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
1066	NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1067	} else if (isKernel(CC: F.getCallingConv()) &&
1068	MFI->getNumKernargPreloadedSGPRs()) {
1069	// Consider cases where the total number of UserSGPRs with trailing
1070	// allocated preload SGPRs, is greater than the number of explicitly
1071	// referenced SGPRs.
1072	const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1073	LHS: CreateExpr (MFI->getNumUserSGPRs()), RHS: ExtraSGPRs, Ctx);
1074	ProgInfo.NumSGPR =
1075	AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
1076	}
1077
1078	// Adjust number of registers used to meet default/requested minimum/maximum
1079	// number of waves per execution unit request.
1080	unsigned MaxWaves = MFI->getMaxWavesPerEU();
1081	ProgInfo.NumSGPRsForWavesPerEU =
1082	AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, CreateExpr (`1ul`),
1083	CreateExpr (STM.getMinNumSGPRs(WavesPerEU: MaxWaves))},
1084	Ctx);
1085	ProgInfo.NumVGPRsForWavesPerEU =
1086	AMDGPUMCExpr::createMax(Args: {ProgInfo.NumVGPR, CreateExpr (`1ul`),
1087	CreateExpr (STM.getMinNumVGPRs(
1088	WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()))},
1089	Ctx);
1090
1091	if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS \|\|
1092	STM.hasSGPRInitBug()) {
1093	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1094	uint64_t NumSgpr;
1095	if (TryGetMCExprValue (ProgInfo.NumSGPR, NumSgpr) &&
1096	NumSgpr > MaxAddressableNumSGPRs) {
1097	// This can happen due to a compiler bug or when using inline asm to use
1098	// the registers which are usually reserved for vcc etc.
1099	LLVMContext &Ctx = MF.getFunction().getContext();
1100	DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1101	NumSgpr, MaxAddressableNumSGPRs,
1102	DS_Error, DK_ResourceLimit);
1103	Ctx.diagnose(DI: Diag);
1104	ProgInfo.NumSGPR = CreateExpr (MaxAddressableNumSGPRs);
1105	ProgInfo.NumSGPRsForWavesPerEU = CreateExpr (MaxAddressableNumSGPRs);
1106	}
1107	}
1108
1109	if (STM.hasSGPRInitBug()) {
1110	ProgInfo.NumSGPR =
1111	CreateExpr (AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1112	ProgInfo.NumSGPRsForWavesPerEU =
1113	CreateExpr (AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1114	}
1115
1116	if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1117	LLVMContext &Ctx = MF.getFunction().getContext();
1118	DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1119	MFI->getNumUserSGPRs(),
1120	STM.getMaxNumUserSGPRs(), DS_Error);
1121	Ctx.diagnose(DI: Diag);
1122	}
1123
1124	if (MFI->getLDSSize() >
1125	static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
1126	LLVMContext &Ctx = MF.getFunction().getContext();
1127	DiagnosticInfoResourceLimit Diag(
1128	MF.getFunction(), "local memory", MFI->getLDSSize(),
1129	STM.getAddressableLocalMemorySize(), DS_Error);
1130	Ctx.diagnose(DI: Diag);
1131	}
1132	// The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1133	// (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1134	auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1135	unsigned Granule) {
1136	const MCExpr *OneConst = CreateExpr (`1ul`);
1137	const MCExpr *GranuleConst = CreateExpr (Granule);
1138	const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax(Args: {NumGPR, OneConst}, Ctx);
1139	const MCExpr *AlignToGPR =
1140	AMDGPUMCExpr::createAlignTo(Value: MaxNumGPR, Align: GranuleConst, Ctx);
1141	const MCExpr *DivGPR =
1142	MCBinaryExpr::createDiv(LHS: AlignToGPR, RHS: GranuleConst, Ctx);
1143	const MCExpr *SubGPR = MCBinaryExpr::createSub(LHS: DivGPR, RHS: OneConst, Ctx);
1144	return SubGPR;
1145	};
1146
1147	ProgInfo.SGPRBlocks = GetNumGPRBlocks (ProgInfo.NumSGPRsForWavesPerEU,
1148	IsaInfo::getSGPREncodingGranule(STI: &STM));
1149	ProgInfo.VGPRBlocks = GetNumGPRBlocks (ProgInfo.NumVGPRsForWavesPerEU,
1150	IsaInfo::getVGPREncodingGranule(STI: &STM));
1151
1152	const SIModeRegisterDefaults Mode = MFI->getMode();
1153
1154	// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1155	// register.
1156	ProgInfo.FloatMode = getFPMode(Mode);
1157
1158	ProgInfo.IEEEMode = Mode.IEEE;
1159
1160	// Make clamp modifier on NaN input returns 0.
1161	ProgInfo.DX10Clamp = Mode.DX10Clamp;
1162
1163	unsigned LDSAlignShift;
1164	if (STM.getFeatureBits().test(I: FeatureAddressableLocalMemorySize163840)) {
1165	// LDS is allocated in 320 dword blocks.
1166	LDSAlignShift = `11`;
1167	} else if (STM.getFeatureBits().test(
1168	I: FeatureAddressableLocalMemorySize65536)) {
1169	// LDS is allocated in 128 dword blocks.
1170	LDSAlignShift = `9`;
1171	} else {
1172	// LDS is allocated in 64 dword blocks.
1173	LDSAlignShift = `8`;
1174	}
1175
1176	ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1177	ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1178
1179	ProgInfo.LDSSize = MFI->getLDSSize();
1180	ProgInfo.LDSBlocks =
1181	alignTo(Value: ProgInfo.LDSSize, Align: `1ULL` << LDSAlignShift) >> LDSAlignShift;
1182
1183	// The MCExpr equivalent of divideCeil.
1184	auto DivideCeil = [&Ctx](const MCExpr Numerator, const* MCExpr *Denominator) {
1185	const MCExpr *Ceil =
1186	AMDGPUMCExpr::createAlignTo(Value: Numerator, Align: Denominator, Ctx);
1187	return MCBinaryExpr::createDiv(LHS: Ceil, RHS: Denominator, Ctx);
1188	};
1189
1190	// Scratch is allocated in 64-dword or 256-dword blocks.
1191	unsigned ScratchAlignShift =
1192	STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? `8` : `10`;
1193	// We need to program the hardware with the amount of scratch memory that
1194	// is used by the entire wave. ProgInfo.ScratchSize is the amount of
1195	// scratch memory used per thread.
1196	ProgInfo.ScratchBlocks = DivideCeil (
1197	MCBinaryExpr::createMul(LHS: ProgInfo.ScratchSize,
1198	RHS: CreateExpr (STM.getWavefrontSize()), Ctx),
1199	CreateExpr (`1ULL` << ScratchAlignShift));
1200
1201	if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= `10`) {
1202	ProgInfo.WgpMode = STM.isCuModeEnabled() ? `0` : `1`;
1203	ProgInfo.MemOrdered = `1`;
1204	ProgInfo.FwdProgress = `1`;
1205	}
1206
1207	// 0 = X, 1 = XY, 2 = XYZ
1208	unsigned TIDIGCompCnt = `0`;
1209	if (MFI->hasWorkItemIDZ())
1210	TIDIGCompCnt = `2`;
1211	else if (MFI->hasWorkItemIDY())
1212	TIDIGCompCnt = `1`;
1213
1214	// The private segment wave byte offset is the last of the system SGPRs. We
1215	// initially assumed it was allocated, and may have used it. It shouldn't harm
1216	// anything to disable it if we know the stack isn't used here. We may still
1217	// have emitted code reading it to initialize scratch, but if that's unused
1218	// reading garbage should be OK.
1219	ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
1220	LHS: MCBinaryExpr::createGT(LHS: ProgInfo.ScratchBlocks,
1221	RHS: MCConstantExpr::create(Value: `0`, Ctx), Ctx),
1222	RHS: ProgInfo.DynamicCallStack, Ctx);
1223
1224	ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1225	// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1226	ProgInfo.TrapHandlerEnable =
1227	STM.isAmdHsaOS() ? `0` : STM.isTrapHandlerEnabled();
1228	ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1229	ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1230	ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1231	ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1232	ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1233	ProgInfo.EXCPEnMSB = `0`;
1234	// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1235	ProgInfo.LdsSize = STM.isAmdHsaOS() ? `0` : ProgInfo.LDSBlocks;
1236	ProgInfo.EXCPEnable = `0`;
1237
1238	// return ((Dst & ~Mask) \| (Value << Shift))
1239	auto SetBits = [&Ctx](const MCExpr Dst, const* MCExpr *Value, uint32_t Mask,
1240	uint32_t Shift) {
1241	const auto *Shft = MCConstantExpr::create(Value: Shift, Ctx);
1242	const auto *Msk = MCConstantExpr::create(Value: Mask, Ctx);
1243	Dst = MCBinaryExpr::createAnd(LHS: Dst, RHS: MCUnaryExpr::createNot(Expr: Msk, Ctx), Ctx);
1244	Dst = MCBinaryExpr::createOr(LHS: Dst, RHS: MCBinaryExpr::createShl(LHS: Value, RHS: Shft, Ctx),
1245	Ctx);
1246	return Dst;
1247	};
1248
1249	if (STM.hasGFX90AInsts()) {
1250	ProgInfo.ComputePGMRSrc3 =
1251	SetBits (ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1252	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1253	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1254	ProgInfo.ComputePGMRSrc3 =
1255	SetBits (ProgInfo.ComputePGMRSrc3, CreateExpr (ProgInfo.TgSplit),
1256	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1257	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1258	}
1259
1260	ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
1261	InitOcc: STM.computeOccupancy(F, LDSSize: ProgInfo.LDSSize).second,
1262	NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU,
1263	DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1264
1265	const auto [MinWEU, MaxWEU] =
1266	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default: {`0`, `0`}, OnlyFirstRequired: true);
1267	uint64_t Occupancy;
1268	if (TryGetMCExprValue (ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1269	DiagnosticInfoOptimizationFailure Diag(
1270	F, F.getSubprogram(),
1271	"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1272	"'" +
1273	F.getName() + "': desired occupancy was " + Twine (MinWEU) +
1274	", final occupancy is " + Twine (Occupancy));
1275	F.getContext().diagnose(DI: Diag);
1276	}
1277
1278	if (isGFX11Plus(STI: STM)) {
1279	uint32_t CodeSizeInBytes = (uint32_t)std::min(
1280	a: ProgInfo.getFunctionCodeSize(MF, IsLowerBound: true / IsLowerBound /),
1281	b: (uint64_t)std::numeric_limits<uint32_t>::max());
1282	uint32_t CodeSizeInLines = divideCeil(Numerator: CodeSizeInBytes, Denominator: `128`);
1283	uint32_t Field, Shift, Width;
1284	if (isGFX11(STI: STM)) {
1285	Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1286	Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1287	Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1288	} else {
1289	Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1290	Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1291	Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1292	}
1293	uint64_t InstPrefSize = std::min(a: CodeSizeInLines, b: (`1u` << Width) - `1`);
1294	ProgInfo.ComputePGMRSrc3 = SetBits (ProgInfo.ComputePGMRSrc3,
1295	CreateExpr (InstPrefSize), Field, Shift);
1296	}
1297	}
1298
1299	static unsigned getRsrcReg(CallingConv::ID CallConv) {
1300	switch (CallConv) {
1301	default: [[fallthrough]];
1302	case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1303	case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1304	case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1305	case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1306	case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1307	case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1308	case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1309	}
1310	}
1311
1312	void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1313	const SIProgramInfo &CurrentProgramInfo) {
1314	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1315	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1316	unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv());
1317	MCContext &Ctx = MF.getContext();
1318
1319	// (((Value) & Mask) << Shift)
1320	auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1321	const MCExpr *msk = MCConstantExpr::create(Value: Mask, Ctx);
1322	const MCExpr *shft = MCConstantExpr::create(Value: Shift, Ctx);
1323	return MCBinaryExpr::createShl(LHS: MCBinaryExpr::createAnd(LHS: Value, RHS: msk, Ctx),
1324	RHS: shft, Ctx);
1325	};
1326
1327	auto EmitResolvedOrExpr = [this](const MCExpr Value, unsigned* Size) {
1328	int64_t Val;
1329	if (Value->evaluateAsAbsolute(Res&: Val))
1330	OutStreamer ->emitIntValue(Value: static_cast<uint64_t>(Val), Size);
1331	else
1332	OutStreamer ->emitValue(Value, Size);
1333	};
1334
1335	if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
1336	OutStreamer ->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
1337
1338	EmitResolvedOrExpr (CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx),
1339	/Size=/`4`);
1340
1341	OutStreamer ->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
1342	EmitResolvedOrExpr (CurrentProgramInfo.getComputePGMRSrc2(Ctx), /Size=/`4`);
1343
1344	OutStreamer ->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
1345
1346	// Sets bits according to S_0286E8_WAVESIZE_ mask and shift values for the*
1347	// appropriate generation.
1348	if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1349	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1350	/Mask=/`0x3FFFF`, /Shift=/`12`),
1351	/Size=/`4`);
1352	} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1353	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1354	/Mask=/`0x7FFF`, /Shift=/`12`),
1355	/Size=/`4`);
1356	} else {
1357	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1358	/Mask=/`0x1FFF`, /Shift=/`12`),
1359	/Size=/`4`);
1360	}
1361
1362	// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1363	// 0" comment but I don't see a corresponding field in the register spec.
1364	} else {
1365	OutStreamer ->emitInt32(Value: RsrcReg);
1366
1367	const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1368	LHS: SetBits (CurrentProgramInfo.VGPRBlocks, /Mask=/`0x3F`, /Shift=/`0`),
1369	RHS: SetBits (CurrentProgramInfo.SGPRBlocks, /Mask=/`0x0F`, /Shift=/`6`),
1370	Ctx&: MF.getContext());
1371	EmitResolvedOrExpr (GPRBlocks, /Size=/`4`);
1372	OutStreamer ->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
1373
1374	// Sets bits according to S_0286E8_WAVESIZE_ mask and shift values for the*
1375	// appropriate generation.
1376	if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1377	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1378	/Mask=/`0x3FFFF`, /Shift=/`12`),
1379	/Size=/`4`);
1380	} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1381	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1382	/Mask=/`0x7FFF`, /Shift=/`12`),
1383	/Size=/`4`);
1384	} else {
1385	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1386	/Mask=/`0x1FFF`, /Shift=/`12`),
1387	/Size=/`4`);
1388	}
1389	}
1390
1391	if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1392	OutStreamer ->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1393	unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1394	? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: `2`)
1395	: CurrentProgramInfo.LDSBlocks;
1396	OutStreamer ->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1397	OutStreamer ->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1398	OutStreamer ->emitInt32(Value: MFI->getPSInputEnable());
1399	OutStreamer ->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1400	OutStreamer ->emitInt32(Value: MFI->getPSInputAddr());
1401	}
1402
1403	OutStreamer ->emitInt32(R_SPILLED_SGPRS);
1404	OutStreamer ->emitInt32(Value: MFI->getNumSpilledSGPRs());
1405	OutStreamer ->emitInt32(R_SPILLED_VGPRS);
1406	OutStreamer ->emitInt32(Value: MFI->getNumSpilledVGPRs());
1407	}
1408
1409	// Helper function to add common PAL Metadata 3.0+
1410	static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1411	const SIProgramInfo &CurrentProgramInfo,
1412	CallingConv::ID CC, const GCNSubtarget &ST,
1413	unsigned DynamicVGPRBlockSize) {
1414	if (ST.hasIEEEMode())
1415	MD->setHwStage(CC, field: ".ieee_mode", Val: (bool)CurrentProgramInfo.IEEEMode);
1416
1417	MD->setHwStage(CC, field: ".wgp_mode", Val: (bool)CurrentProgramInfo.WgpMode);
1418	MD->setHwStage(CC, field: ".mem_ordered", Val: (bool)CurrentProgramInfo.MemOrdered);
1419
1420	if (AMDGPU::isCompute(CC)) {
1421	MD->setHwStage(CC, field: ".trap_present",
1422	Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
1423	MD->setHwStage(CC, field: ".excp_en", Val: CurrentProgramInfo.EXCPEnable);
1424
1425	if (DynamicVGPRBlockSize != `0`)
1426	MD->setComputeRegisters(field: ".dynamic_vgpr_en", Val: true);
1427	}
1428
1429	MD->setHwStage(CC, field: ".lds_size",
1430	Val: (unsigned)(CurrentProgramInfo.LdsSize *
1431	getLdsDwGranularity(ST) * sizeof(uint32_t)));
1432	}
1433
1434	// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1435	// is AMDPAL. It stores each compute/SPI register setting and other PAL
1436	// metadata items into the PALMD::Metadata, combining with any provided by the
1437	// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1438	// is then written as a single block in the .note section.
1439	void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1440	const SIProgramInfo &CurrentProgramInfo) {
1441	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1442	auto CC = MF.getFunction().getCallingConv();
1443	auto *MD = getTargetStreamer()->getPALMetadata();
1444	auto &Ctx = MF.getContext();
1445
1446	MD->setEntryPoint(CC, Name: MF.getFunction().getName());
1447	MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1448
1449	// For targets that support dynamic VGPRs, set the number of saved dynamic
1450	// VGPRs (if any) in the PAL metadata.
1451	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1452	if (MFI->isDynamicVGPREnabled() &&
1453	MFI->getScratchReservedForDynamicVGPRs() > `0`)
1454	MD->setHwStage(CC, field: ".dynamic_vgpr_saved_count",
1455	Val: MFI->getScratchReservedForDynamicVGPRs() / `4`);
1456
1457	// Only set AGPRs for supported devices
1458	if (STM.hasMAIInsts()) {
1459	MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR);
1460	}
1461
1462	MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1463	if (MD->getPALMajorVersion() < `3`) {
1464	MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM, Ctx), Ctx);
1465	if (AMDGPU::isCompute(CC)) {
1466	MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1467	} else {
1468	const MCExpr *HasScratchBlocks =
1469	MCBinaryExpr::createGT(LHS: CurrentProgramInfo.ScratchBlocks,
1470	RHS: MCConstantExpr::create(Value: `0`, Ctx), Ctx);
1471	auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1472	MD->setRsrc2(CC, Val: maskShiftSet(Val: HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1473	}
1474	} else {
1475	MD->setHwStage(CC, field: ".debug_mode", Val: (bool)CurrentProgramInfo.DebugMode);
1476	MD->setHwStage(CC, field: ".scratch_en", Type: msgpack::Type::Boolean,
1477	Val: CurrentProgramInfo.ScratchEnable);
1478	EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM,
1479	DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize());
1480	}
1481
1482	// ScratchSize is in bytes, 16 aligned.
1483	MD->setScratchSize(
1484	CC,
1485	Val: AMDGPUMCExpr::createAlignTo(Value: CurrentProgramInfo.ScratchSize,
1486	Align: MCConstantExpr::create(Value: `16`, Ctx), Ctx),
1487	Ctx);
1488
1489	if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1490	unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1491	? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: `2`)
1492	: CurrentProgramInfo.LDSBlocks;
1493	if (MD->getPALMajorVersion() < `3`) {
1494	MD->setRsrc2(
1495	CC,
1496	Val: MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
1497	Ctx);
1498	MD->setSpiPsInputEna(MFI->getPSInputEnable());
1499	MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1500	} else {
1501	// Graphics registers
1502	const unsigned ExtraLdsDwGranularity =
1503	STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? `256` : `128`;
1504	MD->setGraphicsRegisters(
1505	field: ".ps_extra_lds_size",
1506	Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1507
1508	// Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1509	static StringLiteral const PsInputFields[] = {
1510	".persp_sample_ena", ".persp_center_ena",
1511	".persp_centroid_ena", ".persp_pull_model_ena",
1512	".linear_sample_ena", ".linear_center_ena",
1513	".linear_centroid_ena", ".line_stipple_tex_ena",
1514	".pos_x_float_ena", ".pos_y_float_ena",
1515	".pos_z_float_ena", ".pos_w_float_ena",
1516	".front_face_ena", ".ancillary_ena",
1517	".sample_coverage_ena", ".pos_fixed_pt_ena"};
1518	unsigned PSInputEna = MFI->getPSInputEnable();
1519	unsigned PSInputAddr = MFI->getPSInputAddr();
1520	for (auto [Idx, Field] : enumerate(First: PsInputFields)) {
1521	MD->setGraphicsRegisters(field1: ".spi_ps_input_ena", field2: Field,
1522	Val: (bool)((PSInputEna >> Idx) & `1`));
1523	MD->setGraphicsRegisters(field1: ".spi_ps_input_addr", field2: Field,
1524	Val: (bool)((PSInputAddr >> Idx) & `1`));
1525	}
1526	}
1527	}
1528
1529	// For version 3 and above the wave front size is already set in the metadata
1530	if (MD->getPALMajorVersion() < `3` && STM.isWave32())
1531	MD->setWave32(MF.getFunction().getCallingConv());
1532	}
1533
1534	void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1535	auto *MD = getTargetStreamer()->getPALMetadata();
1536	const MachineFrameInfo &MFI = MF.getFrameInfo();
1537	StringRef FnName = MF.getFunction().getName();
1538	MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
1539	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1540	MCContext &Ctx = MF.getContext();
1541
1542	if (MD->getPALMajorVersion() < `3`) {
1543	// Set compute registers
1544	MD->setRsrc1(
1545	CC: CallingConv::AMDGPU_CS,
1546	Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1547	MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
1548	Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1549	} else {
1550	EmitPALMetadataCommon(
1551	MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST,
1552	DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1553	}
1554
1555	// Set optional info
1556	MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
1557	MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
1558	MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
1559	}
1560
1561	// This is supposed to be log2(Size)
1562	static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1563	switch (Size) {
1564	case `4`:
1565	return AMD_ELEMENT_4_BYTES;
1566	case `8`:
1567	return AMD_ELEMENT_8_BYTES;
1568	case `16`:
1569	return AMD_ELEMENT_16_BYTES;
1570	default:
1571	llvm_unreachable("invalid private_element_size");
1572	}
1573	}
1574
1575	void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1576	const SIProgramInfo &CurrentProgramInfo,
1577	const MachineFunction &MF) const {
1578	const Function &F = MF.getFunction();
1579	assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL \|\|
1580	F.getCallingConv() == CallingConv::SPIR_KERNEL);
1581
1582	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1583	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1584	MCContext &Ctx = MF.getContext();
1585
1586	Out.initDefault(STI: &STM, Ctx, /InitMCExpr=/false);
1587
1588	Out.compute_pgm_resource1_registers =
1589	CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx);
1590	Out.compute_pgm_resource2_registers =
1591	CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1592	Out.code_properties \|= AMD_CODE_PROPERTY_IS_PTR64;
1593
1594	Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1595
1596	AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1597	getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1598
1599	const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1600	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1601	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1602	}
1603
1604	if (UserSGPRInfo.hasDispatchPtr())
1605	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1606
1607	if (UserSGPRInfo.hasQueuePtr())
1608	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1609
1610	if (UserSGPRInfo.hasKernargSegmentPtr())
1611	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1612
1613	if (UserSGPRInfo.hasDispatchID())
1614	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1615
1616	if (UserSGPRInfo.hasFlatScratchInit())
1617	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1618
1619	if (UserSGPRInfo.hasPrivateSegmentSize())
1620	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
1621
1622	if (STM.isXNACKEnabled())
1623	Out.code_properties \|= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1624
1625	Align MaxKernArgAlign;
1626	Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign);
1627	Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1628	Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1629	Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1630	Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1631
1632	// kernarg_segment_alignment is specified as log of the alignment.
1633	// The minimum alignment is 16.
1634	// FIXME: The metadata treats the minimum as 4?
1635	Out.kernarg_segment_alignment = Log2(A: std::max(a: Align (`16`), b: MaxKernArgAlign));
1636	}
1637
1638	bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr MI, unsigned* OpNo,
1639	const char *ExtraCode, raw_ostream &O) {
1640	// First try the generic code, which knows about modifiers like 'c' and 'n'.
1641	if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O))
1642	return false;
1643
1644	if (ExtraCode && ExtraCode[`0`]) {
1645	if (ExtraCode[`1`] != `0`)
1646	return true; // Unknown modifier.
1647
1648	switch (ExtraCode[`0`]) {
1649	case `'r'`:
1650	break;
1651	default:
1652	return true;
1653	}
1654	}
1655
1656	// TODO: Should be able to support other operand types like globals.
1657	const MachineOperand &MO = MI->getOperand(i: OpNo);
1658	if (MO.isReg()) {
1659	AMDGPUInstPrinter::printRegOperand(Reg: MO.getReg(), O,
1660	MRI: *MF->getSubtarget().getRegisterInfo());
1661	return false;
1662	}
1663	if (MO.isImm()) {
1664	int64_t Val = MO.getImm();
1665	if (AMDGPU::isInlinableIntLiteral(Literal: Val)) {
1666	O << Val;
1667	} else if (isUInt<`16`>(x: Val)) {
1668	O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val));
1669	} else if (isUInt<`32`>(x: Val)) {
1670	O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val));
1671	} else {
1672	O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val));
1673	}
1674	return false;
1675	}
1676	return true;
1677	}
1678
1679	void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
1680	AU.addRequired<AMDGPUResourceUsageAnalysis>();
1681	AU.addPreserved<AMDGPUResourceUsageAnalysis>();
1682	AU.addRequired<MachineModuleInfoWrapperPass>();
1683	AU.addPreserved<MachineModuleInfoWrapperPass>();
1684	AsmPrinter::getAnalysisUsage(AU);
1685	}
1686
1687	void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1688	const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1689	bool isModuleEntryFunction, bool hasMAIInsts) {
1690	if (!ORE)
1691	return;
1692
1693	const char *Name = "kernel-resource-usage";
1694	const char *Indent = " ";
1695
1696	// If the remark is not specifically enabled, do not output to yaml
1697	LLVMContext &Ctx = MF.getFunction().getContext();
1698	if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name))
1699	return;
1700
1701	// Currently non-kernel functions have no resources to emit.
1702	if (!isEntryFunctionCC(CC: MF.getFunction().getCallingConv()))
1703	return;
1704
1705	auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1706	StringRef RemarkLabel, auto Argument) {
1707	// Add an indent for every line besides the line with the kernel name. This
1708	// makes it easier to tell which resource usage go with which kernel since
1709	// the kernel name will always be displayed first.
1710	std::string LabelStr = RemarkLabel.str() + ": ";
1711	if (RemarkName != "FunctionName")
1712	LabelStr = Indent + LabelStr;
1713
1714	ORE->emit([&]() {
1715	return MachineOptimizationRemarkAnalysis (Name, RemarkName,
1716	MF.getFunction().getSubprogram(),
1717	&MF.front())
1718	<< LabelStr << ore::NV(RemarkName, Argument);
1719	});
1720	};
1721
1722	// FIXME: Formatting here is pretty nasty because clang does not accept
1723	// newlines from diagnostics. This forces us to emit multiple diagnostic
1724	// remarks to simulate newlines. If and when clang does accept newlines, this
1725	// formatting should be aggregated into one remark with newlines to avoid
1726	// printing multiple diagnostic location and diag opts.
1727	EmitResourceUsageRemark ("FunctionName", "Function Name",
1728	MF.getFunction().getName());
1729	EmitResourceUsageRemark ("NumSGPR", "TotalSGPRs",
1730	getMCExprStr(Value: CurrentProgramInfo.NumSGPR));
1731	EmitResourceUsageRemark ("NumVGPR", "VGPRs",
1732	getMCExprStr(Value: CurrentProgramInfo.NumArchVGPR));
1733	if (hasMAIInsts) {
1734	EmitResourceUsageRemark ("NumAGPR", "AGPRs",
1735	getMCExprStr(Value: CurrentProgramInfo.NumAccVGPR));
1736	}
1737	EmitResourceUsageRemark ("ScratchSize", "ScratchSize [bytes/lane]",
1738	getMCExprStr(Value: CurrentProgramInfo.ScratchSize));
1739	int64_t DynStack;
1740	bool DynStackEvaluatable =
1741	CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(Res&: DynStack);
1742	StringRef DynamicStackStr =
1743	DynStackEvaluatable && DynStack ? "True" : "False";
1744	EmitResourceUsageRemark ("DynamicStack", "Dynamic Stack", DynamicStackStr);
1745	EmitResourceUsageRemark ("Occupancy", "Occupancy [waves/SIMD]",
1746	getMCExprStr(Value: CurrentProgramInfo.Occupancy));
1747	EmitResourceUsageRemark ("SGPRSpill", "SGPRs Spill",
1748	CurrentProgramInfo.SGPRSpill);
1749	EmitResourceUsageRemark ("VGPRSpill", "VGPRs Spill",
1750	CurrentProgramInfo.VGPRSpill);
1751	if (isModuleEntryFunction)
1752	EmitResourceUsageRemark ("BytesLDS", "LDS Size [bytes/block]",
1753	CurrentProgramInfo.LDSSize);
1754	}
1755
1756	char AMDGPUAsmPrinter::ID = `0`;
1757
1758	INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1759	"AMDGPU Assembly Printer", false, false)
1760

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp