AMDGPUAsmPrinter.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp]

1	//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	///
11	/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12	/// code. When passed an MCAsmStreamer it prints assembly and when passed
13	/// an MCObjectStreamer it outputs binary code.
14	//
15	//===----------------------------------------------------------------------===//
16	//
17
18	#include "AMDGPUAsmPrinter.h"
19	#include "AMDGPU.h"
20	#include "AMDGPUHSAMetadataStreamer.h"
21	#include "AMDGPUMCResourceInfo.h"
22	#include "AMDGPUResourceUsageAnalysis.h"
23	#include "GCNSubtarget.h"
24	#include "MCTargetDesc/AMDGPUInstPrinter.h"
25	#include "MCTargetDesc/AMDGPUMCExpr.h"
26	#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
27	#include "MCTargetDesc/AMDGPUTargetStreamer.h"
28	#include "R600AsmPrinter.h"
29	#include "SIMachineFunctionInfo.h"
30	#include "TargetInfo/AMDGPUTargetInfo.h"
31	#include "Utils/AMDGPUBaseInfo.h"
32	#include "Utils/AMDKernelCodeTUtils.h"
33	#include "Utils/SIDefinesUtils.h"
34	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35	#include "llvm/BinaryFormat/ELF.h"
36	#include "llvm/CodeGen/MachineFrameInfo.h"
37	#include "llvm/CodeGen/MachineModuleInfo.h"
38	#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
39	#include "llvm/IR/DiagnosticInfo.h"
40	#include "llvm/MC/MCAssembler.h"
41	#include "llvm/MC/MCContext.h"
42	#include "llvm/MC/MCSectionELF.h"
43	#include "llvm/MC/MCStreamer.h"
44	#include "llvm/MC/MCValue.h"
45	#include "llvm/MC/TargetRegistry.h"
46	#include "llvm/Support/AMDHSAKernelDescriptor.h"
47	#include "llvm/Support/Compiler.h"
48	#include "llvm/Target/TargetLoweringObjectFile.h"
49	#include "llvm/Target/TargetMachine.h"
50	#include "llvm/TargetParser/TargetParser.h"
51
52	using namespace llvm;
53	using namespace llvm::AMDGPU;
54
55	// This should get the default rounding mode from the kernel. We just set the
56	// default here, but this could change if the OpenCL rounding mode pragmas are
57	// used.
58	//
59	// The denormal mode here should match what is reported by the OpenCL runtime
60	// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61	// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62	//
63	// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64	// precision, and leaves single precision to flush all and does not report
65	// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66	// CL_FP_DENORM for both.
67	//
68	// FIXME: It seems some instructions do not support single precision denormals
69	// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_f32, sqrt_f32,
70	// and sin_f32, cos_f32 on most parts).
71
72	// We want to use these instructions, and using fp32 denormals also causes
73	// instructions to run at the double precision rate for the device so it's
74	// probably best to just report no single precision denormals.
75	static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
76	return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) \|
77	FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) \|
78	FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) \|
79	FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
80	}
81
82	static AsmPrinter *
83	createAMDGPUAsmPrinterPass(TargetMachine &tm,
84	std::unique_ptr<MCStreamer> &&Streamer) {
85	return new AMDGPUAsmPrinter (tm, std::move(Streamer));
86	}
87
88	extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
89	LLVMInitializeAMDGPUAsmPrinter() {
90	TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(),
91	Fn: llvm::createR600AsmPrinterPass);
92	TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(),
93	Fn: createAMDGPUAsmPrinterPass);
94	}
95
96	AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
97	std::unique_ptr<MCStreamer> Streamer)
98	: AsmPrinter (TM, std::move(Streamer)) {
99	assert(OutStreamer && "AsmPrinter constructed without streamer");
100	}
101
102	StringRef AMDGPUAsmPrinter::getPassName() const {
103	return "AMDGPU Assembly Printer";
104	}
105
106	const MCSubtargetInfo AMDGPUAsmPrinter::getGlobalSTI() const* {
107	return TM.getMCSubtargetInfo();
108	}
109
110	AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
111	if (!OutStreamer)
112	return nullptr;
113	return static_cast<AMDGPUTargetStreamer*>(OutStreamer ->getTargetStreamer());
114	}
115
116	void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
117	IsTargetStreamerInitialized = false;
118	}
119
120	void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
121	IsTargetStreamerInitialized = true;
122
123	// TODO: Which one is called first, emitStartOfAsmFile or
124	// emitFunctionBodyStart?
125	if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126	initializeTargetID(M);
127
128	if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
129	TM.getTargetTriple().getOS() != Triple::AMDPAL)
130	return;
131
132	getTargetStreamer()->EmitDirectiveAMDGCNTarget();
133
134	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
135	getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
136	COV: CodeObjectVersion);
137	HSAMetadataStream ->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID());
138	}
139
140	if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
141	getTargetStreamer()->getPALMetadata()->readFromIR(M);
142	}
143
144	void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
145	// Init target streamer if it has not yet happened
146	if (!IsTargetStreamerInitialized)
147	initTargetStreamer(M);
148
149	if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
150	getTargetStreamer()->EmitISAVersion();
151
152	// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153	// Emit HSA Metadata (NT_AMD_HSA_METADATA).
154	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155	HSAMetadataStream ->end();
156	bool Success = HSAMetadataStream ->emitTo(TargetStreamer&: *getTargetStreamer());
157	(void)Success;
158	assert(Success && "Malformed HSA Metadata");
159	}
160	}
161
162	void AMDGPUAsmPrinter::emitFunctionBodyStart() {
163	const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164	const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165	const Function &F = MF->getFunction();
166
167	// TODO: We're checking this late, would be nice to check it earlier.
168	if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
169	reportFatalUsageError(
170	reason: STM.getCPU() + " is only available on code object version 6 or better");
171	}
172
173	// TODO: Which one is called first, emitStartOfAsmFile or
174	// emitFunctionBodyStart?
175	if (!getTargetStreamer()->getTargetID())
176	initializeTargetID(M: *F.getParent());
177
178	const auto &FunctionTargetID = STM.getTargetID();
179	// Make sure function's xnack settings are compatible with module's
180	// xnack settings.
181	if (FunctionTargetID.isXnackSupported() &&
182	FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183	FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184	OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine (MF->getName()) +
185	"' function does not match module xnack setting");
186	return;
187	}
188	// Make sure function's sramecc settings are compatible with module's
189	// sramecc settings.
190	if (FunctionTargetID.isSramEccSupported() &&
191	FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192	FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193	OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine (MF->getName()) +
194	"' function does not match module sramecc setting");
195	return;
196	}
197
198	if (!MFI.isEntryFunction())
199	return;
200
201	if (STM.isMesaKernel(F) &&
202	(F.getCallingConv() == CallingConv::AMDGPU_KERNEL \|\|
203	F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204	AMDGPUMCKernelCodeT KernelCode;
205	getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF);
206	KernelCode.validate(STI: &STM, Ctx&: MF->getContext());
207	getTargetStreamer()->EmitAMDKernelCodeT(Header&: KernelCode);
208	}
209
210	if (STM.isAmdHsaOS())
211	HSAMetadataStream ->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo);
212	}
213
214	void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
215	const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216	if (!MFI.isEntryFunction())
217	return;
218
219	if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220	return;
221
222	auto &Streamer = getTargetStreamer()->getStreamer();
223	auto &Context = Streamer.getContext();
224	auto &ObjectFileInfo = *Context.getObjectFileInfo();
225	auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227	Streamer.pushSection();
228	Streamer.switchSection(Section: &ReadOnlySection);
229
230	// CP microcode requires the kernel descriptor to be allocated on 64 byte
231	// alignment.
232	Streamer.emitValueToAlignment(Alignment: Align (`64`), Fill: `0`, FillLen: `1`, MaxBytesToEmit: `0`);
233	ReadOnlySection.ensureMinAlignment(MinAlignment: Align (`64`));
234
235	const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237	SmallString<`128`> KernelName;
238	getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction());
239	getTargetStreamer()->EmitAmdhsaKernelDescriptor(
240	STI: STM, KernelName, KernelDescriptor: getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo),
241	NextVGPR: CurrentProgramInfo.NumVGPRsForWavesPerEU,
242	NextSGPR: MCBinaryExpr::createSub(
243	LHS: CurrentProgramInfo.NumSGPRsForWavesPerEU,
244	RHS: AMDGPUMCExpr::createExtraSGPRs(
245	VCCUsed: CurrentProgramInfo.VCCUsed, FlatScrUsed: CurrentProgramInfo.FlatUsed,
246	XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx&: Context),
247	Ctx&: Context),
248	ReserveVCC: CurrentProgramInfo.VCCUsed, ReserveFlatScr: CurrentProgramInfo.FlatUsed);
249
250	Streamer.popSection();
251	}
252
253	void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr MI) const* {
254	Register RegNo = MI->getOperand(i: `0`).getReg();
255
256	SmallString<`128`> Str;
257	raw_svector_ostream OS(Str);
258	OS << "implicit-def: "
259	<< printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo());
260
261	if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262	OS << " : SGPR spill to VGPR lane";
263
264	OutStreamer ->AddComment(T: OS.str());
265	OutStreamer ->addBlankLine();
266	}
267
268	void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
269	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
270	AsmPrinter::emitFunctionEntryLabel();
271	return;
272	}
273
274	const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275	const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276	if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(F: MF->getFunction())) {
277	SmallString<`128`> SymbolName;
278	getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()),
279	getTargetStreamer()->EmitAMDGPUSymbolType(
280	SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL);
281	}
282	if (DumpCodeInstEmitter) {
283	// Disassemble function name label to text.
284	DisasmLines.push_back(x: MF->getName().str() + ":");
285	DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
286	HexLines.emplace_back(args: "");
287	}
288
289	AsmPrinter::emitFunctionEntryLabel();
290	}
291
292	void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
293	if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) {
294	// Write a line for the basic block label if it is not only fallthrough.
295	DisasmLines.push_back(
296	x: (Twine ("BB") + Twine (getFunctionNumber())
297	+ "_" + Twine (MBB.getNumber()) + ":").str());
298	DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size());
299	HexLines.emplace_back(args: "");
300	}
301	AsmPrinter::emitBasicBlockStart(MBB);
302	}
303
304	void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
305	if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
306	if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) {
307	OutContext.reportError(L: {},
308	Msg: Twine (GV->getName()) +
309	": unsupported initializer for address space");
310	return;
311	}
312
313	// LDS variables aren't emitted in HSA or PAL yet.
314	const Triple::OSType OS = TM.getTargetTriple().getOS();
315	if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
316	return;
317
318	MCSymbol *GVSym = getSymbol(GV);
319
320	GVSym->redefineIfPossible();
321	if (GVSym->isDefined() \|\| GVSym->isVariable())
322	report_fatal_error(reason: "symbol '" + Twine (GVSym->getName()) +
323	"' is already defined");
324
325	const DataLayout &DL = GV->getDataLayout();
326	uint64_t Size = GV->getGlobalSize(DL);
327	Align Alignment = GV->getAlign().value_or(u: Align (`4`));
328
329	emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration());
330	emitLinkage(GV, GVSym);
331	auto *TS = getTargetStreamer();
332	TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment);
333	return;
334	}
335
336	AsmPrinter::emitGlobalVariable(GV);
337	}
338
339	bool AMDGPUAsmPrinter::doInitialization(Module &M) {
340	CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342	if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343	switch (CodeObjectVersion) {
344	case AMDGPU::AMDHSA_COV4:
345	HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346	break;
347	case AMDGPU::AMDHSA_COV5:
348	HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349	break;
350	case AMDGPU::AMDHSA_COV6:
351	HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352	break;
353	default:
354	reportFatalUsageError(reason: "unsupported code object version");
355	}
356	}
357
358	return AsmPrinter::doInitialization(M);
359	}
360
361	/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
362	///
363	/// Remove dependency on GCNSubtarget and depend only only the necessary values
364	/// for said occupancy computation. Should match computeOccupancy implementation
365	/// without passing \p STM on.
366	const AMDGPUMCExpr createOccupancy(unsigned* InitOcc, const MCExpr *NumSGPRs,
367	const MCExpr *NumVGPRs,
368	unsigned DynamicVGPRBlockSize,
369	const GCNSubtarget &STM, MCContext &Ctx) {
370	unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(STI: &STM);
371	unsigned Granule = IsaInfo::getVGPRAllocGranule(STI: &STM, DynamicVGPRBlockSize);
372	unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(STI: &STM);
373	unsigned Generation = STM.getGeneration();
374
375	auto CreateExpr = [&Ctx](unsigned Value) {
376	return MCConstantExpr::create(Value, Ctx);
377	};
378
379	return AMDGPUMCExpr::create(Kind: AMDGPUMCExpr::AGVK_Occupancy,
380	Args: {CreateExpr (MaxWaves), CreateExpr (Granule),
381	CreateExpr (TargetTotalNumVGPRs),
382	CreateExpr (Generation), CreateExpr (InitOcc),
383	NumSGPRs, NumVGPRs},
384	Ctx);
385	}
386
387	void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
388	if (F.isDeclaration() \|\| !AMDGPU::isModuleEntryFunctionCC(CC: F.getCallingConv()))
389	return;
390
391	using RIK = MCResourceInfo::ResourceInfoKind;
392	const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
393	MCSymbol *FnSym = TM.getSymbol(GV: &F);
394	bool IsLocal = F.hasLocalLinkage();
395
396	auto TryGetMCExprValue = [](const MCExpr Value, uint64_t &Res) -> bool* {
397	int64_t Val;
398	if (Value->evaluateAsAbsolute(Res&: Val)) {
399	Res = Val;
400	return true;
401	}
402	return false;
403	};
404
405	const uint64_t MaxScratchPerWorkitem =
406	STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
407	MCSymbol *ScratchSizeSymbol = RI.getSymbol(
408	FuncName: FnSym->getName(), RIK: RIK::RIK_PrivateSegSize, OutContext, IsLocal);
409	uint64_t ScratchSize;
410	if (ScratchSizeSymbol->isVariable() &&
411	TryGetMCExprValue (ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
412	ScratchSize > MaxScratchPerWorkitem) {
413	DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
414	DS_Error);
415	F.getContext().diagnose(DI: DiagStackSize);
416	}
417
418	// Validate addressable scalar registers (i.e., prior to added implicit
419	// SGPRs).
420	MCSymbol *NumSGPRSymbol =
421	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext, IsLocal);
422	if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
423	!STM.hasSGPRInitBug()) {
424	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
425	uint64_t NumSgpr;
426	if (NumSGPRSymbol->isVariable() &&
427	TryGetMCExprValue (NumSGPRSymbol->getVariableValue(), NumSgpr) &&
428	NumSgpr > MaxAddressableNumSGPRs) {
429	DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
430	NumSgpr, MaxAddressableNumSGPRs,
431	DS_Error, DK_ResourceLimit);
432	F.getContext().diagnose(DI: Diag);
433	return;
434	}
435	}
436
437	MCSymbol *VCCUsedSymbol =
438	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext, IsLocal);
439	MCSymbol *FlatUsedSymbol = RI.getSymbol(
440	FuncName: FnSym->getName(), RIK: RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
441	uint64_t VCCUsed, FlatUsed, NumSgpr;
442
443	if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
444	FlatUsedSymbol->isVariable() &&
445	TryGetMCExprValue (NumSGPRSymbol->getVariableValue(), NumSgpr) &&
446	TryGetMCExprValue (VCCUsedSymbol->getVariableValue(), VCCUsed) &&
447	TryGetMCExprValue (FlatUsedSymbol->getVariableValue(), FlatUsed)) {
448
449	// Recomputes NumSgprs + implicit SGPRs but all symbols should now be
450	// resolvable.
451	NumSgpr += IsaInfo::getNumExtraSGPRs(
452	STI: &STM, VCCUsed, FlatScrUsed: FlatUsed,
453	XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny());
454	if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS \|\|
455	STM.hasSGPRInitBug()) {
456	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
457	if (NumSgpr > MaxAddressableNumSGPRs) {
458	DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
459	MaxAddressableNumSGPRs, DS_Error,
460	DK_ResourceLimit);
461	F.getContext().diagnose(DI: Diag);
462	return;
463	}
464	}
465
466	MCSymbol *NumVgprSymbol =
467	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext, IsLocal);
468	MCSymbol *NumAgprSymbol =
469	RI.getSymbol(FuncName: FnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext, IsLocal);
470	uint64_t NumVgpr, NumAgpr;
471
472	MachineModuleInfo &MMI =
473	getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
474	MachineFunction *MF = MMI.getMachineFunction(F);
475	if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
476	TryGetMCExprValue (NumVgprSymbol->getVariableValue(), NumVgpr) &&
477	TryGetMCExprValue (NumAgprSymbol->getVariableValue(), NumAgpr)) {
478	const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
479	unsigned MaxWaves = MFI.getMaxWavesPerEU();
480	uint64_t TotalNumVgpr =
481	getTotalNumVGPRs(has90AInsts: STM.hasGFX90AInsts(), ArgNumAGPR: NumAgpr, ArgNumVGPR: NumVgpr);
482	uint64_t NumVGPRsForWavesPerEU =
483	std::max(l: {TotalNumVgpr, (uint64_t)`1`,
484	(uint64_t)STM.getMinNumVGPRs(
485	WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize())});
486	uint64_t NumSGPRsForWavesPerEU = std::max(
487	l: {NumSgpr, (uint64_t)`1`, (uint64_t)STM.getMinNumSGPRs(WavesPerEU: MaxWaves)});
488	const MCExpr *OccupancyExpr = createOccupancy(
489	InitOcc: STM.getOccupancyWithWorkGroupSizes(MF: *MF).second,
490	NumSGPRs: MCConstantExpr::create(Value: NumSGPRsForWavesPerEU, Ctx&: OutContext),
491	NumVGPRs: MCConstantExpr::create(Value: NumVGPRsForWavesPerEU, Ctx&: OutContext),
492	DynamicVGPRBlockSize: MFI.getDynamicVGPRBlockSize(), STM, Ctx&: OutContext);
493	uint64_t Occupancy;
494
495	const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
496	F, Name: "amdgpu-waves-per-eu", Default: {`0`, `0`}, OnlyFirstRequired: true);
497
498	if (TryGetMCExprValue (OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
499	DiagnosticInfoOptimizationFailure Diag(
500	F, F.getSubprogram(),
501	"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
502	"'" +
503	F.getName() + "': desired occupancy was " + Twine (MinWEU) +
504	", final occupancy is " + Twine (Occupancy));
505	F.getContext().diagnose(DI: Diag);
506	return;
507	}
508	}
509	}
510	}
511
512	bool AMDGPUAsmPrinter::doFinalization(Module &M) {
513	// Pad with s_code_end to help tools and guard against instruction prefetch
514	// causing stale data in caches. Arguably this should be done by the linker,
515	// which is why this isn't done for Mesa.
516	// Don't do it if there is no code.
517	const MCSubtargetInfo &STI = *getGlobalSTI();
518	if ((AMDGPU::isGFX10Plus(STI) \|\| AMDGPU::isGFX90A(STI)) &&
519	(STI.getTargetTriple().getOS() == Triple::AMDHSA \|\|
520	STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
521	MCSection *TextSect = getObjFileLowering().getTextSection();
522	if (TextSect->hasInstructions()) {
523	OutStreamer ->switchSection(Section: TextSect);
524	getTargetStreamer()->EmitCodeEnd(STI);
525	}
526	}
527
528	// Assign expressions which can only be resolved when all other functions are
529	// known.
530	RI.finalize(OutContext);
531
532	// Switch section and emit all GPR maximums within the processed module.
533	OutStreamer ->pushSection();
534	MCSectionELF *MaxGPRSection =
535	OutContext.getELFSection(Section: ".AMDGPU.gpr_maximums", Type: ELF::SHT_PROGBITS, Flags: `0`);
536	OutStreamer ->switchSection(Section: MaxGPRSection);
537	getTargetStreamer()->EmitMCResourceMaximums(
538	MaxVGPR: RI.getMaxVGPRSymbol(OutContext), MaxAGPR: RI.getMaxAGPRSymbol(OutContext),
539	MaxSGPR: RI.getMaxSGPRSymbol(OutContext), MaxNamedBarrier: RI.getMaxNamedBarrierSymbol(OutContext));
540	OutStreamer ->popSection();
541
542	for (Function &F : M.functions())
543	validateMCResourceInfo(F);
544
545	RI.reset();
546
547	return AsmPrinter::doFinalization(M);
548	}
549
550	SmallString<`128`> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
551	SmallString<`128`> Str;
552	raw_svector_ostream OSS(Str);
553	auto &Streamer = getTargetStreamer()->getStreamer();
554	auto &Context = Streamer.getContext();
555	const MCExpr *New = foldAMDGPUMCExpr(Expr: Value, Ctx&: Context);
556	printAMDGPUMCExpr(Expr: New, OS&: OSS, MAI);
557	return Str;
558	}
559
560	// Print comments that apply to both callable functions and entry points.
561	void AMDGPUAsmPrinter::emitCommonFunctionComments(
562	const MCExpr NumVGPR, const* MCExpr NumAGPR, const* MCExpr *TotalNumVGPR,
563	const MCExpr NumSGPR, const* MCExpr *ScratchSize, uint64_t CodeSize,
564	const AMDGPUMachineFunction *MFI) {
565	OutStreamer ->emitRawComment(T: " codeLenInByte = " + Twine (CodeSize), TabPrefix: false);
566	OutStreamer ->emitRawComment(T: " TotalNumSgprs: " + getMCExprStr(Value: NumSGPR),
567	TabPrefix: false);
568	OutStreamer ->emitRawComment(T: " NumVgprs: " + getMCExprStr(Value: NumVGPR), TabPrefix: false);
569	if (NumAGPR && TotalNumVGPR) {
570	OutStreamer ->emitRawComment(T: " NumAgprs: " + getMCExprStr(Value: NumAGPR), TabPrefix: false);
571	OutStreamer ->emitRawComment(T: " TotalNumVgprs: " + getMCExprStr(Value: TotalNumVGPR),
572	TabPrefix: false);
573	}
574	OutStreamer ->emitRawComment(T: " ScratchSize: " + getMCExprStr(Value: ScratchSize),
575	TabPrefix: false);
576	OutStreamer ->emitRawComment(T: " MemoryBound: " + Twine (MFI->isMemoryBound()),
577	TabPrefix: false);
578	}
579
580	const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
581	const MachineFunction &MF) const {
582	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
583	MCContext &Ctx = MF.getContext();
584	uint16_t KernelCodeProperties = `0`;
585	const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
586
587	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
588	KernelCodeProperties \|=
589	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
590	}
591	if (UserSGPRInfo.hasDispatchPtr()) {
592	KernelCodeProperties \|=
593	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
594	}
595	if (UserSGPRInfo.hasQueuePtr()) {
596	KernelCodeProperties \|=
597	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
598	}
599	if (UserSGPRInfo.hasKernargSegmentPtr()) {
600	KernelCodeProperties \|=
601	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
602	}
603	if (UserSGPRInfo.hasDispatchID()) {
604	KernelCodeProperties \|=
605	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
606	}
607	if (UserSGPRInfo.hasFlatScratchInit()) {
608	KernelCodeProperties \|=
609	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
610	}
611	if (UserSGPRInfo.hasPrivateSegmentSize()) {
612	KernelCodeProperties \|=
613	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
614	}
615	if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
616	KernelCodeProperties \|=
617	amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
618	}
619
620	// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
621	// un-evaluatable at this point so it cannot be conditionally checked here.
622	// Instead, we'll directly shift the possibly unknown MCExpr into its place
623	// and bitwise-or it into KernelCodeProperties.
624	const MCExpr *KernelCodePropExpr =
625	MCConstantExpr::create(Value: KernelCodeProperties, Ctx);
626	const MCExpr *OrValue = MCConstantExpr::create(
627	Value: amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
628	OrValue = MCBinaryExpr::createShl(LHS: CurrentProgramInfo.DynamicCallStack,
629	RHS: OrValue, Ctx);
630	KernelCodePropExpr = MCBinaryExpr::createOr(LHS: KernelCodePropExpr, RHS: OrValue, Ctx);
631
632	return KernelCodePropExpr;
633	}
634
635	MCKernelDescriptor
636	AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
637	const SIProgramInfo &PI) const {
638	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
639	const Function &F = MF.getFunction();
640	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
641	MCContext &Ctx = MF.getContext();
642
643	MCKernelDescriptor KernelDescriptor;
644
645	KernelDescriptor.group_segment_fixed_size =
646	MCConstantExpr::create(Value: PI.LDSSize, Ctx);
647	KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
648
649	Align MaxKernArgAlign;
650	KernelDescriptor.kernarg_size = MCConstantExpr::create(
651	Value: STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign), Ctx);
652
653	KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(ST: STM, Ctx);
654	KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
655	KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
656
657	int64_t PGM_Rsrc3 = `1`;
658	bool EvaluatableRsrc3 =
659	CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(Res&: PGM_Rsrc3);
660	(void)PGM_Rsrc3;
661	(void)EvaluatableRsrc3;
662	assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 \|\|
663	STM.hasGFX90AInsts() \|\| STM.hasGFX1250Insts() \|\| !EvaluatableRsrc3 \|\|
664	static_cast<uint64_t>(PGM_Rsrc3) == `0`);
665	KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
666
667	KernelDescriptor.kernarg_preload = MCConstantExpr::create(
668	Value: AMDGPU::hasKernargPreload(STI: STM) ? Info->getNumKernargPreloadedSGPRs() : `0`,
669	Ctx);
670
671	return KernelDescriptor;
672	}
673
674	bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
675	// Init target streamer lazily on the first function so that previous passes
676	// can set metadata.
677	if (!IsTargetStreamerInitialized)
678	initTargetStreamer(M&: *MF.getFunction().getParent());
679
680	ResourceUsage =
681	&getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo();
682	CurrentProgramInfo.reset(MF);
683
684	const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
685	MCContext &Ctx = MF.getContext();
686
687	// The starting address of all shader programs must be 256 bytes aligned.
688	// Regular functions just need the basic required instruction alignment.
689	MF.ensureAlignment(A: MFI->isEntryFunction() ? Align (`256`) : Align (`4`));
690
691	SetupMachineFunction(MF);
692
693	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
694	MCContext &Context = getObjFileLowering().getContext();
695	bool IsLocal = MF.getFunction().hasLocalLinkage();
696	// FIXME: This should be an explicit check for Mesa.
697	if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
698	MCSectionELF *ConfigSection =
699	Context.getELFSection(Section: ".AMDGPU.config", Type: ELF::SHT_PROGBITS, Flags: `0`);
700	OutStreamer ->switchSection(Section: ConfigSection);
701	}
702
703	RI.gatherResourceInfo(MF, FRI: *ResourceUsage, OutContext);
704
705	if (MFI->isModuleEntryFunction()) {
706	getSIProgramInfo(Out&: CurrentProgramInfo, MF);
707	}
708
709	if (STM.isAmdPalOS()) {
710	if (MFI->isEntryFunction())
711	EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo);
712	else if (MFI->isModuleEntryFunction())
713	emitPALFunctionMetadata(MF);
714	} else if (!STM.isAmdHsaOS()) {
715	EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo);
716	}
717
718	DumpCodeInstEmitter = nullptr;
719	if (STM.dumpCode()) {
720	// For -dumpcode, get the assembler out of the streamer. This only works
721	// with -filetype=obj.
722	MCAssembler *Assembler = OutStreamer ->getAssemblerPtr();
723	if (Assembler)
724	DumpCodeInstEmitter = Assembler->getEmitterPtr();
725	}
726
727	DisasmLines.clear();
728	HexLines.clear();
729	DisasmLineMaxLen = `0`;
730
731	emitFunctionBody();
732
733	emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(),
734	hasMAIInsts: STM.hasMAIInsts());
735
736	{
737	using RIK = MCResourceInfo::ResourceInfoKind;
738	getTargetStreamer()->EmitMCResourceInfo(
739	NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext,
740	IsLocal),
741	NumAGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR, OutContext,
742	IsLocal),
743	NumExplicitSGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumSGPR, OutContext,
744	IsLocal),
745	NumNamedBarrier: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumNamedBarrier,
746	OutContext, IsLocal),
747	PrivateSegmentSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
748	OutContext, IsLocal),
749	UsesVCC: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesVCC, OutContext,
750	IsLocal),
751	UsesFlatScratch: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_UsesFlatScratch,
752	OutContext, IsLocal),
753	HasDynamicallySizedStack: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasDynSizedStack,
754	OutContext, IsLocal),
755	HasRecursion: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasRecursion, OutContext,
756	IsLocal),
757	HasIndirectCall: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_HasIndirectCall,
758	OutContext, IsLocal));
759	}
760
761	// Emit _dvgpr$ symbol when appropriate.
762	emitDVgprSymbol(MF);
763
764	if (isVerbose()) {
765	MCSectionELF *CommentSection =
766	Context.getELFSection(Section: ".AMDGPU.csdata", Type: ELF::SHT_PROGBITS, Flags: `0`);
767	OutStreamer ->switchSection(Section: CommentSection);
768
769	if (!MFI->isEntryFunction()) {
770	using RIK = MCResourceInfo::ResourceInfoKind;
771	OutStreamer ->emitRawComment(T: " Function info:", TabPrefix: false);
772
773	emitCommonFunctionComments(
774	NumVGPR: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumVGPR, OutContext,
775	IsLocal)
776	->getVariableValue(),
777	NumAGPR: STM.hasMAIInsts()
778	? RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_NumAGPR,
779	OutContext, IsLocal)
780	->getVariableValue()
781	: nullptr,
782	TotalNumVGPR: RI.createTotalNumVGPRs(MF, Ctx),
783	NumSGPR: RI.createTotalNumSGPRs(
784	MF,
785	hasXnack: MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
786	Ctx),
787	ScratchSize: RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK: RIK::RIK_PrivateSegSize,
788	OutContext, IsLocal)
789	->getVariableValue(),
790	CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
791	return false;
792	}
793
794	OutStreamer ->emitRawComment(T: " Kernel info:", TabPrefix: false);
795	emitCommonFunctionComments(
796	NumVGPR: CurrentProgramInfo.NumArchVGPR,
797	NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
798	TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR,
799	ScratchSize: CurrentProgramInfo.ScratchSize,
800	CodeSize: CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
801
802	OutStreamer ->emitRawComment(
803	T: " FloatMode: " + Twine (CurrentProgramInfo.FloatMode), TabPrefix: false);
804	OutStreamer ->emitRawComment(
805	T: " IeeeMode: " + Twine (CurrentProgramInfo.IEEEMode), TabPrefix: false);
806	OutStreamer ->emitRawComment(
807	T: " LDSByteSize: " + Twine (CurrentProgramInfo.LDSSize) +
808	" bytes/workgroup (compile time only)", TabPrefix: false);
809
810	OutStreamer ->emitRawComment(
811	T: " SGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.SGPRBlocks), TabPrefix: false);
812
813	OutStreamer ->emitRawComment(
814	T: " VGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.VGPRBlocks), TabPrefix: false);
815
816	OutStreamer ->emitRawComment(
817	T: " NumSGPRsForWavesPerEU: " +
818	getMCExprStr(Value: CurrentProgramInfo.NumSGPRsForWavesPerEU),
819	TabPrefix: false);
820	OutStreamer ->emitRawComment(
821	T: " NumVGPRsForWavesPerEU: " +
822	getMCExprStr(Value: CurrentProgramInfo.NumVGPRsForWavesPerEU),
823	TabPrefix: false);
824
825	if (STM.hasGFX90AInsts()) {
826	const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
827	LHS: CurrentProgramInfo.AccumOffset, RHS: MCConstantExpr::create(Value: `1`, Ctx), Ctx);
828	AdjustedAccum = MCBinaryExpr::createMul(
829	LHS: AdjustedAccum, RHS: MCConstantExpr::create(Value: `4`, Ctx), Ctx);
830	OutStreamer ->emitRawComment(
831	T: " AccumOffset: " + getMCExprStr(Value: AdjustedAccum), TabPrefix: false);
832	}
833
834	if (STM.hasGFX1250Insts())
835	OutStreamer ->emitRawComment(
836	T: " NamedBarCnt: " + getMCExprStr(Value: CurrentProgramInfo.NamedBarCnt),
837	TabPrefix: false);
838
839	OutStreamer ->emitRawComment(
840	T: " Occupancy: " + getMCExprStr(Value: CurrentProgramInfo.Occupancy), TabPrefix: false);
841
842	OutStreamer ->emitRawComment(
843	T: " WaveLimiterHint : " + Twine (MFI->needsWaveLimiter()), TabPrefix: false);
844
845	OutStreamer ->emitRawComment(
846	T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
847	getMCExprStr(Value: CurrentProgramInfo.ScratchEnable),
848	TabPrefix: false);
849	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " +
850	Twine (CurrentProgramInfo.UserSGPR),
851	TabPrefix: false);
852	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
853	Twine (CurrentProgramInfo.TrapHandlerEnable),
854	TabPrefix: false);
855	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
856	Twine (CurrentProgramInfo.TGIdXEnable),
857	TabPrefix: false);
858	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
859	Twine (CurrentProgramInfo.TGIdYEnable),
860	TabPrefix: false);
861	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
862	Twine (CurrentProgramInfo.TGIdZEnable),
863	TabPrefix: false);
864	OutStreamer ->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
865	Twine (CurrentProgramInfo.TIdIGCompCount),
866	TabPrefix: false);
867
868	[[maybe_unused]] int64_t PGMRSrc3;
869	assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 \|\|
870	STM.hasGFX90AInsts() \|\| STM.hasGFX1250Insts() \|\|
871	(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
872	static_cast<uint64_t>(PGMRSrc3) == `0`));
873	if (STM.hasGFX90AInsts()) {
874	OutStreamer ->emitRawComment(
875	T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
876	getMCExprStr(Value: MCKernelDescriptor::bits_get(
877	Src: CurrentProgramInfo.ComputePGMRSrc3,
878	Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
879	Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
880	TabPrefix: false);
881	OutStreamer ->emitRawComment(
882	T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
883	getMCExprStr(Value: MCKernelDescriptor::bits_get(
884	Src: CurrentProgramInfo.ComputePGMRSrc3,
885	Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
886	Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
887	TabPrefix: false);
888	}
889	}
890
891	if (DumpCodeInstEmitter) {
892
893	OutStreamer ->switchSection(
894	Section: Context.getELFSection(Section: ".AMDGPU.disasm", Type: ELF::SHT_PROGBITS, Flags: `0`));
895
896	for (size_t i = `0`; i < DisasmLines.size(); ++i) {
897	std::string Comment = "\n";
898	if (!HexLines [i].empty()) {
899	Comment = std::string (DisasmLineMaxLen - DisasmLines [i].size(), `' '`);
900	Comment += " ; " + HexLines [i] + "\n";
901	}
902
903	OutStreamer ->emitBytes(Data: StringRef (DisasmLines [i]));
904	OutStreamer ->emitBytes(Data: StringRef (Comment));
905	}
906	}
907
908	return false;
909	}
910
911	// When appropriate, add a _dvgpr$ symbol, with the value of the function
912	// symbol, plus an offset encoding one less than the number of VGPR blocks used
913	// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
914	// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
915	// used by a front-end to have functions that are chained rather than called,
916	// and a dispatcher that dynamically resizes the VGPR count before dispatching
917	// to a function.
918	void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
919	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
920	if (MFI.isDynamicVGPREnabled() &&
921	MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS_Chain) {
922	MCContext &Ctx = MF.getContext();
923	unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
924	MCValue NumVGPRs;
925	if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
926	Res&: NumVGPRs, Asm: nullptr) \|\|
927	!NumVGPRs.isAbsolute()) {
928	llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
929	}
930	// Calculate number of VGPR blocks.
931	// Treat 0 VGPRs as 1 VGPR to avoid underflowing.
932	unsigned NumBlocks =
933	divideCeil(Numerator: std::max(a: unsigned(NumVGPRs.getConstant()), b: `1U`), Denominator: BlockSize);
934
935	if (NumBlocks > `8`) {
936	OutContext.reportError(L: {},
937	Msg: "too many DVGPR blocks for _dvgpr$ symbol for '" +
938	Twine (CurrentFnSym->getName()) + "'");
939	return;
940	}
941	unsigned EncodedNumBlocks = (NumBlocks - `1`) << `3`;
942	// Add to function symbol to create _dvgpr$ symbol.
943	const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
944	LHS: MCSymbolRefExpr::create(Symbol: CurrentFnSym, Ctx),
945	RHS: MCConstantExpr::create(Value: EncodedNumBlocks, Ctx), Ctx);
946	MCSymbol *DVgprFuncSym =
947	Ctx.getOrCreateSymbol(Name: Twine ("_dvgpr$") + CurrentFnSym->getName());
948	OutStreamer ->emitAssignment(Symbol: DVgprFuncSym, Value: DVgprFuncVal);
949	emitVisibility(Sym: DVgprFuncSym, Visibility: MF.getFunction().getVisibility());
950	emitLinkage(GV: &MF.getFunction(), GVSym: DVgprFuncSym);
951	}
952	}
953
954	// TODO: Fold this into emitFunctionBodyStart.
955	void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
956	// In the beginning all features are either 'Any' or 'NotSupported',
957	// depending on global target features. This will cover empty modules.
958	getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(),
959	FeatureString: getGlobalSTI()->getFeatureString());
960
961	// If module is empty, we are done.
962	if (M.empty())
963	return;
964
965	// If module is not empty, need to find first 'Off' or 'On' feature
966	// setting per feature from functions in module.
967	for (auto &F : M) {
968	auto &TSTargetID = getTargetStreamer()->getTargetID();
969	if ((!TSTargetID ->isXnackSupported() \|\| TSTargetID ->isXnackOnOrOff()) &&
970	(!TSTargetID ->isSramEccSupported() \|\| TSTargetID ->isSramEccOnOrOff()))
971	break;
972
973	const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
974	const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
975	if (TSTargetID ->isXnackSupported())
976	if (TSTargetID ->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
977	TSTargetID ->setXnackSetting(STMTargetID.getXnackSetting());
978	if (TSTargetID ->isSramEccSupported())
979	if (TSTargetID ->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
980	TSTargetID ->setSramEccSetting(STMTargetID.getSramEccSetting());
981	}
982	}
983
984	// AccumOffset computed for the MCExpr equivalent of:
985	// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
986	static const MCExpr computeAccumOffset(const* MCExpr *NumVGPR, MCContext &Ctx) {
987	const MCExpr *ConstFour = MCConstantExpr::create(Value: `4`, Ctx);
988	const MCExpr *ConstOne = MCConstantExpr::create(Value: `1`, Ctx);
989
990	// Can't be lower than 1 for subsequent alignTo.
991	const MCExpr *MaximumTaken =
992	AMDGPUMCExpr::createMax(Args: {ConstOne, NumVGPR}, Ctx);
993
994	// Practically, it's computing divideCeil(MaximumTaken, 4).
995	const MCExpr *DivCeil = MCBinaryExpr::createDiv(
996	LHS: AMDGPUMCExpr::createAlignTo(Value: MaximumTaken, Align: ConstFour, Ctx), RHS: ConstFour,
997	Ctx);
998
999	return MCBinaryExpr::createSub(LHS: DivCeil, RHS: ConstOne, Ctx);
1000	}
1001
1002	void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1003	const MachineFunction &MF) {
1004	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1005	bool IsLocal = MF.getFunction().hasLocalLinkage();
1006	MCContext &Ctx = MF.getContext();
1007
1008	auto CreateExpr = [&Ctx](int64_t Value) {
1009	return MCConstantExpr::create(Value, Ctx);
1010	};
1011
1012	auto TryGetMCExprValue = [](const MCExpr Value, uint64_t &Res) -> bool* {
1013	int64_t Val;
1014	if (Value->evaluateAsAbsolute(Res&: Val)) {
1015	Res = Val;
1016	return true;
1017	}
1018	return false;
1019	};
1020
1021	auto GetSymRefExpr =
1022	[&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1023	MCSymbol *Sym =
1024	RI.getSymbol(FuncName: CurrentFnSym->getName(), RIK, OutContext, IsLocal);
1025	return MCSymbolRefExpr::create(Symbol: Sym, Ctx);
1026	};
1027
1028	using RIK = MCResourceInfo::ResourceInfoKind;
1029	ProgInfo.NumArchVGPR = GetSymRefExpr (RIK::RIK_NumVGPR);
1030	ProgInfo.NumAccVGPR = GetSymRefExpr (RIK::RIK_NumAGPR);
1031	ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
1032	NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1033
1034	ProgInfo.AccumOffset = computeAccumOffset(NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1035	ProgInfo.TgSplit = STM.isTgSplitEnabled();
1036	ProgInfo.NumSGPR = GetSymRefExpr (RIK::RIK_NumSGPR);
1037	ProgInfo.ScratchSize = GetSymRefExpr (RIK::RIK_PrivateSegSize);
1038	ProgInfo.VCCUsed = GetSymRefExpr (RIK::RIK_UsesVCC);
1039	ProgInfo.FlatUsed = GetSymRefExpr (RIK::RIK_UsesFlatScratch);
1040	ProgInfo.DynamicCallStack =
1041	MCBinaryExpr::createOr(LHS: GetSymRefExpr (RIK::RIK_HasDynSizedStack),
1042	RHS: GetSymRefExpr (RIK::RIK_HasRecursion), Ctx);
1043
1044	const MCExpr *BarBlkConst = MCConstantExpr::create(Value: `4`, Ctx);
1045	const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1046	Value: GetSymRefExpr (RIK::RIK_NumNamedBarrier), Align: BarBlkConst, Ctx);
1047	ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(LHS: AlignToBlk, RHS: BarBlkConst, Ctx);
1048
1049	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1050
1051	// The calculations related to SGPR/VGPR blocks are
1052	// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1053	// unified.
1054	const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1055	VCCUsed: ProgInfo.VCCUsed, FlatScrUsed: ProgInfo.FlatUsed,
1056	XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1057
1058	// Check the addressable register limit before we add ExtraSGPRs.
1059	if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1060	!STM.hasSGPRInitBug()) {
1061	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1062	uint64_t NumSgpr;
1063	if (TryGetMCExprValue (ProgInfo.NumSGPR, NumSgpr) &&
1064	NumSgpr > MaxAddressableNumSGPRs) {
1065	// This can happen due to a compiler bug or when using inline asm.
1066	LLVMContext &Ctx = MF.getFunction().getContext();
1067	DiagnosticInfoResourceLimit Diag(
1068	MF.getFunction(), "addressable scalar registers", NumSgpr,
1069	MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
1070	Ctx.diagnose(DI: Diag);
1071	ProgInfo.NumSGPR = CreateExpr (MaxAddressableNumSGPRs - `1`);
1072	}
1073	}
1074
1075	// Account for extra SGPRs and VGPRs reserved for debugger use.
1076	ProgInfo.NumSGPR = MCBinaryExpr::createAdd(LHS: ProgInfo.NumSGPR, RHS: ExtraSGPRs, Ctx);
1077
1078	const Function &F = MF.getFunction();
1079
1080	// Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1081	// dispatch registers as function args.
1082	unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1083	WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1084
1085	if (WaveDispatchNumSGPR) {
1086	ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
1087	Args: {ProgInfo.NumSGPR,
1088	MCBinaryExpr::createAdd(LHS: CreateExpr (WaveDispatchNumSGPR), RHS: ExtraSGPRs,
1089	Ctx)},
1090	Ctx);
1091	}
1092
1093	if (WaveDispatchNumVGPR) {
1094	ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
1095	Args: {ProgInfo.NumVGPR, CreateExpr (WaveDispatchNumVGPR)}, Ctx);
1096
1097	ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
1098	NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx);
1099	}
1100
1101	// Adjust number of registers used to meet default/requested minimum/maximum
1102	// number of waves per execution unit request.
1103	unsigned MaxWaves = MFI->getMaxWavesPerEU();
1104	ProgInfo.NumSGPRsForWavesPerEU =
1105	AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, CreateExpr (`1ul`),
1106	CreateExpr (STM.getMinNumSGPRs(WavesPerEU: MaxWaves))},
1107	Ctx);
1108	ProgInfo.NumVGPRsForWavesPerEU =
1109	AMDGPUMCExpr::createMax(Args: {ProgInfo.NumVGPR, CreateExpr (`1ul`),
1110	CreateExpr (STM.getMinNumVGPRs(
1111	WavesPerEU: MaxWaves, DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize()))},
1112	Ctx);
1113
1114	if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS \|\|
1115	STM.hasSGPRInitBug()) {
1116	unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1117	uint64_t NumSgpr;
1118	if (TryGetMCExprValue (ProgInfo.NumSGPR, NumSgpr) &&
1119	NumSgpr > MaxAddressableNumSGPRs) {
1120	// This can happen due to a compiler bug or when using inline asm to use
1121	// the registers which are usually reserved for vcc etc.
1122	LLVMContext &Ctx = MF.getFunction().getContext();
1123	DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1124	NumSgpr, MaxAddressableNumSGPRs,
1125	DS_Error, DK_ResourceLimit);
1126	Ctx.diagnose(DI: Diag);
1127	ProgInfo.NumSGPR = CreateExpr (MaxAddressableNumSGPRs);
1128	ProgInfo.NumSGPRsForWavesPerEU = CreateExpr (MaxAddressableNumSGPRs);
1129	}
1130	}
1131
1132	if (STM.hasSGPRInitBug()) {
1133	ProgInfo.NumSGPR =
1134	CreateExpr (AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1135	ProgInfo.NumSGPRsForWavesPerEU =
1136	CreateExpr (AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
1137	}
1138
1139	if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1140	LLVMContext &Ctx = MF.getFunction().getContext();
1141	DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1142	MFI->getNumUserSGPRs(),
1143	STM.getMaxNumUserSGPRs(), DS_Error);
1144	Ctx.diagnose(DI: Diag);
1145	}
1146
1147	if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1148	LLVMContext &Ctx = MF.getFunction().getContext();
1149	DiagnosticInfoResourceLimit Diag(
1150	MF.getFunction(), "local memory", MFI->getLDSSize(),
1151	STM.getAddressableLocalMemorySize(), DS_Error);
1152	Ctx.diagnose(DI: Diag);
1153	}
1154	// The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1155	// (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1156	auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1157	unsigned Granule) {
1158	const MCExpr *OneConst = CreateExpr (`1ul`);
1159	const MCExpr *GranuleConst = CreateExpr (Granule);
1160	const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax(Args: {NumGPR, OneConst}, Ctx);
1161	const MCExpr *AlignToGPR =
1162	AMDGPUMCExpr::createAlignTo(Value: MaxNumGPR, Align: GranuleConst, Ctx);
1163	const MCExpr *DivGPR =
1164	MCBinaryExpr::createDiv(LHS: AlignToGPR, RHS: GranuleConst, Ctx);
1165	const MCExpr *SubGPR = MCBinaryExpr::createSub(LHS: DivGPR, RHS: OneConst, Ctx);
1166	return SubGPR;
1167	};
1168	// GFX10+ will always allocate 128 SGPRs and this field must be 0
1169	if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
1170	ProgInfo.SGPRBlocks = CreateExpr (`0ul`);
1171	} else {
1172	ProgInfo.SGPRBlocks = GetNumGPRBlocks (
1173	ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(STI: &STM));
1174	}
1175	ProgInfo.VGPRBlocks = GetNumGPRBlocks (ProgInfo.NumVGPRsForWavesPerEU,
1176	IsaInfo::getVGPREncodingGranule(STI: &STM));
1177
1178	const SIModeRegisterDefaults Mode = MFI->getMode();
1179
1180	// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1181	// register.
1182	ProgInfo.FloatMode = getFPMode(Mode);
1183
1184	ProgInfo.IEEEMode = Mode.IEEE;
1185
1186	// Make clamp modifier on NaN input returns 0.
1187	ProgInfo.DX10Clamp = Mode.DX10Clamp;
1188
1189	unsigned LDSAlignShift = `8`;
1190	switch (getLdsDwGranularity(ST: STM)) {
1191	case `512`:
1192	case `320`:
1193	LDSAlignShift = `11`;
1194	break;
1195	case `128`:
1196	LDSAlignShift = `9`;
1197	break;
1198	case `64`:
1199	LDSAlignShift = `8`;
1200	break;
1201	default:
1202	llvm_unreachable("invald LDS block size");
1203	}
1204
1205	ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1206	ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1207
1208	ProgInfo.LDSSize = MFI->getLDSSize();
1209	ProgInfo.LDSBlocks =
1210	alignTo(Value: ProgInfo.LDSSize, Align: `1ULL` << LDSAlignShift) >> LDSAlignShift;
1211
1212	// The MCExpr equivalent of divideCeil.
1213	auto DivideCeil = [&Ctx](const MCExpr Numerator, const* MCExpr *Denominator) {
1214	const MCExpr *Ceil =
1215	AMDGPUMCExpr::createAlignTo(Value: Numerator, Align: Denominator, Ctx);
1216	return MCBinaryExpr::createDiv(LHS: Ceil, RHS: Denominator, Ctx);
1217	};
1218
1219	// Scratch is allocated in 64-dword or 256-dword blocks.
1220	unsigned ScratchAlignShift =
1221	STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? `8` : `10`;
1222	// We need to program the hardware with the amount of scratch memory that
1223	// is used by the entire wave. ProgInfo.ScratchSize is the amount of
1224	// scratch memory used per thread.
1225	ProgInfo.ScratchBlocks = DivideCeil (
1226	MCBinaryExpr::createMul(LHS: ProgInfo.ScratchSize,
1227	RHS: CreateExpr (STM.getWavefrontSize()), Ctx),
1228	CreateExpr (`1ULL` << ScratchAlignShift));
1229
1230	if (STM.supportsWGP()) {
1231	ProgInfo.WgpMode = STM.isCuModeEnabled() ? `0` : `1`;
1232	}
1233
1234	if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= `10`) {
1235	ProgInfo.MemOrdered = `1`;
1236	ProgInfo.FwdProgress = !F.hasFnAttribute(Kind: "amdgpu-no-fwd-progress");
1237	}
1238
1239	// 0 = X, 1 = XY, 2 = XYZ
1240	unsigned TIDIGCompCnt = `0`;
1241	if (MFI->hasWorkItemIDZ())
1242	TIDIGCompCnt = `2`;
1243	else if (MFI->hasWorkItemIDY())
1244	TIDIGCompCnt = `1`;
1245
1246	// The private segment wave byte offset is the last of the system SGPRs. We
1247	// initially assumed it was allocated, and may have used it. It shouldn't harm
1248	// anything to disable it if we know the stack isn't used here. We may still
1249	// have emitted code reading it to initialize scratch, but if that's unused
1250	// reading garbage should be OK.
1251	ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
1252	LHS: MCBinaryExpr::createGT(LHS: ProgInfo.ScratchBlocks,
1253	RHS: MCConstantExpr::create(Value: `0`, Ctx), Ctx),
1254	RHS: ProgInfo.DynamicCallStack, Ctx);
1255
1256	ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1257	// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1258	ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? `0` : STM.hasTrapHandler();
1259	ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1260	ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1261	ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1262	ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1263	ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1264	ProgInfo.EXCPEnMSB = `0`;
1265	// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1266	ProgInfo.LdsSize = STM.isAmdHsaOS() ? `0` : ProgInfo.LDSBlocks;
1267	ProgInfo.EXCPEnable = `0`;
1268
1269	// return ((Dst & ~Mask) \| (Value << Shift))
1270	auto SetBits = [&Ctx](const MCExpr Dst, const* MCExpr *Value, uint32_t Mask,
1271	uint32_t Shift) {
1272	const auto *Shft = MCConstantExpr::create(Value: Shift, Ctx);
1273	const auto *Msk = MCConstantExpr::create(Value: Mask, Ctx);
1274	Dst = MCBinaryExpr::createAnd(LHS: Dst, RHS: MCUnaryExpr::createNot(Expr: Msk, Ctx), Ctx);
1275	Dst = MCBinaryExpr::createOr(LHS: Dst, RHS: MCBinaryExpr::createShl(LHS: Value, RHS: Shft, Ctx),
1276	Ctx);
1277	return Dst;
1278	};
1279
1280	if (STM.hasGFX90AInsts()) {
1281	ProgInfo.ComputePGMRSrc3 =
1282	SetBits (ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1283	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1284	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1285	ProgInfo.ComputePGMRSrc3 =
1286	SetBits (ProgInfo.ComputePGMRSrc3, CreateExpr (ProgInfo.TgSplit),
1287	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1288	amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1289	}
1290
1291	if (STM.hasGFX1250Insts())
1292	ProgInfo.ComputePGMRSrc3 =
1293	SetBits (ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1294	amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1295	amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1296
1297	ProgInfo.Occupancy = createOccupancy(
1298	InitOcc: STM.computeOccupancy(F, LDSSize: ProgInfo.LDSSize).second,
1299	NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU,
1300	DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1301
1302	const auto [MinWEU, MaxWEU] =
1303	AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu", Default: {`0`, `0`}, OnlyFirstRequired: true);
1304	uint64_t Occupancy;
1305	if (TryGetMCExprValue (ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1306	DiagnosticInfoOptimizationFailure Diag(
1307	F, F.getSubprogram(),
1308	"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1309	"'" +
1310	F.getName() + "': desired occupancy was " + Twine (MinWEU) +
1311	", final occupancy is " + Twine (Occupancy));
1312	F.getContext().diagnose(DI: Diag);
1313	}
1314
1315	if (isGFX11Plus(STI: STM)) {
1316	uint32_t CodeSizeInBytes = (uint32_t)std::min(
1317	a: ProgInfo.getFunctionCodeSize(MF, IsLowerBound: true / IsLowerBound /),
1318	b: (uint64_t)std::numeric_limits<uint32_t>::max());
1319	uint32_t CodeSizeInLines = divideCeil(Numerator: CodeSizeInBytes, Denominator: `128`);
1320	uint32_t Field, Shift, Width;
1321	if (isGFX11(STI: STM)) {
1322	Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1323	Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1324	Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1325	} else {
1326	Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1327	Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1328	Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1329	}
1330	uint64_t InstPrefSize = std::min(a: CodeSizeInLines, b: (`1u` << Width) - `1`);
1331	ProgInfo.ComputePGMRSrc3 = SetBits (ProgInfo.ComputePGMRSrc3,
1332	CreateExpr (InstPrefSize), Field, Shift);
1333	}
1334	}
1335
1336	static unsigned getRsrcReg(CallingConv::ID CallConv) {
1337	switch (CallConv) {
1338	default: [[fallthrough]];
1339	case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1340	case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1341	case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1342	case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1343	case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1344	case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1345	case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1346	}
1347	}
1348
1349	void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1350	const SIProgramInfo &CurrentProgramInfo) {
1351	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1352	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1353	unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv());
1354	MCContext &Ctx = MF.getContext();
1355
1356	// (((Value) & Mask) << Shift)
1357	auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1358	const MCExpr *msk = MCConstantExpr::create(Value: Mask, Ctx);
1359	const MCExpr *shft = MCConstantExpr::create(Value: Shift, Ctx);
1360	return MCBinaryExpr::createShl(LHS: MCBinaryExpr::createAnd(LHS: Value, RHS: msk, Ctx),
1361	RHS: shft, Ctx);
1362	};
1363
1364	auto EmitResolvedOrExpr = [this](const MCExpr Value, unsigned* Size) {
1365	int64_t Val;
1366	if (Value->evaluateAsAbsolute(Res&: Val))
1367	OutStreamer ->emitIntValue(Value: static_cast<uint64_t>(Val), Size);
1368	else
1369	OutStreamer ->emitValue(Value, Size);
1370	};
1371
1372	if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) {
1373	OutStreamer ->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
1374
1375	EmitResolvedOrExpr (CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx),
1376	/Size=/`4`);
1377
1378	OutStreamer ->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
1379	EmitResolvedOrExpr (CurrentProgramInfo.getComputePGMRSrc2(Ctx), /Size=/`4`);
1380
1381	OutStreamer ->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
1382
1383	// Sets bits according to S_0286E8_WAVESIZE_ mask and shift values for the*
1384	// appropriate generation.
1385	if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1386	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1387	/Mask=/`0x3FFFF`, /Shift=/`12`),
1388	/Size=/`4`);
1389	} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1390	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1391	/Mask=/`0x7FFF`, /Shift=/`12`),
1392	/Size=/`4`);
1393	} else {
1394	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1395	/Mask=/`0x1FFF`, /Shift=/`12`),
1396	/Size=/`4`);
1397	}
1398
1399	// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1400	// 0" comment but I don't see a corresponding field in the register spec.
1401	} else {
1402	OutStreamer ->emitInt32(Value: RsrcReg);
1403
1404	const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1405	LHS: SetBits (CurrentProgramInfo.VGPRBlocks, /Mask=/`0x3F`, /Shift=/`0`),
1406	RHS: SetBits (CurrentProgramInfo.SGPRBlocks, /Mask=/`0x0F`, /Shift=/`6`),
1407	Ctx&: MF.getContext());
1408	EmitResolvedOrExpr (GPRBlocks, /Size=/`4`);
1409	OutStreamer ->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
1410
1411	// Sets bits according to S_0286E8_WAVESIZE_ mask and shift values for the*
1412	// appropriate generation.
1413	if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1414	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1415	/Mask=/`0x3FFFF`, /Shift=/`12`),
1416	/Size=/`4`);
1417	} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1418	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1419	/Mask=/`0x7FFF`, /Shift=/`12`),
1420	/Size=/`4`);
1421	} else {
1422	EmitResolvedOrExpr (SetBits (CurrentProgramInfo.ScratchBlocks,
1423	/Mask=/`0x1FFF`, /Shift=/`12`),
1424	/Size=/`4`);
1425	}
1426	}
1427
1428	if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1429	OutStreamer ->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
1430	unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1431	? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: `2`)
1432	: CurrentProgramInfo.LDSBlocks;
1433	OutStreamer ->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1434	OutStreamer ->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
1435	OutStreamer ->emitInt32(Value: MFI->getPSInputEnable());
1436	OutStreamer ->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
1437	OutStreamer ->emitInt32(Value: MFI->getPSInputAddr());
1438	}
1439
1440	OutStreamer ->emitInt32(R_SPILLED_SGPRS);
1441	OutStreamer ->emitInt32(Value: MFI->getNumSpilledSGPRs());
1442	OutStreamer ->emitInt32(R_SPILLED_VGPRS);
1443	OutStreamer ->emitInt32(Value: MFI->getNumSpilledVGPRs());
1444	}
1445
1446	// Helper function to add common PAL Metadata 3.0+
1447	static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
1448	const SIProgramInfo &CurrentProgramInfo,
1449	CallingConv::ID CC, const GCNSubtarget &ST,
1450	unsigned DynamicVGPRBlockSize) {
1451	if (ST.hasFeature(Feature: AMDGPU::FeatureDX10ClampAndIEEEMode))
1452	MD->setHwStage(CC, field: ".ieee_mode", Val: (bool)CurrentProgramInfo.IEEEMode);
1453
1454	MD->setHwStage(CC, field: ".wgp_mode", Val: (bool)CurrentProgramInfo.WgpMode);
1455	MD->setHwStage(CC, field: ".mem_ordered", Val: (bool)CurrentProgramInfo.MemOrdered);
1456	MD->setHwStage(CC, field: ".forward_progress", Val: (bool)CurrentProgramInfo.FwdProgress);
1457
1458	if (AMDGPU::isCompute(CC)) {
1459	MD->setHwStage(CC, field: ".trap_present",
1460	Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
1461	MD->setHwStage(CC, field: ".excp_en", Val: CurrentProgramInfo.EXCPEnable);
1462
1463	if (DynamicVGPRBlockSize != `0`)
1464	MD->setComputeRegisters(field: ".dynamic_vgpr_en", Val: true);
1465	}
1466
1467	MD->updateHwStageMaximum(
1468	CC, field: ".lds_size",
1469	Val: (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1470	sizeof(uint32_t)));
1471	}
1472
1473	// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1474	// is AMDPAL. It stores each compute/SPI register setting and other PAL
1475	// metadata items into the PALMD::Metadata, combining with any provided by the
1476	// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1477	// is then written as a single block in the .note section.
1478	void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1479	const SIProgramInfo &CurrentProgramInfo) {
1480	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1481	auto CC = MF.getFunction().getCallingConv();
1482	auto *MD = getTargetStreamer()->getPALMetadata();
1483	auto &Ctx = MF.getContext();
1484
1485	MD->setEntryPoint(CC, Name: MF.getFunction().getName());
1486	MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1487
1488	// For targets that support dynamic VGPRs, set the number of saved dynamic
1489	// VGPRs (if any) in the PAL metadata.
1490	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1491	if (MFI->isDynamicVGPREnabled() &&
1492	MFI->getScratchReservedForDynamicVGPRs() > `0`)
1493	MD->setHwStage(CC, field: ".dynamic_vgpr_saved_count",
1494	Val: MFI->getScratchReservedForDynamicVGPRs() / `4`);
1495
1496	// Only set AGPRs for supported devices
1497	if (STM.hasMAIInsts()) {
1498	MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR);
1499	}
1500
1501	MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1502	if (MD->getPALMajorVersion() < `3`) {
1503	MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM, Ctx), Ctx);
1504	if (AMDGPU::isCompute(CC)) {
1505	MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1506	} else {
1507	const MCExpr *HasScratchBlocks =
1508	MCBinaryExpr::createGT(LHS: CurrentProgramInfo.ScratchBlocks,
1509	RHS: MCConstantExpr::create(Value: `0`, Ctx), Ctx);
1510	auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1511	MD->setRsrc2(CC, Val: maskShiftSet(Val: HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1512	}
1513	} else {
1514	MD->setHwStage(CC, field: ".debug_mode", Val: (bool)CurrentProgramInfo.DebugMode);
1515	MD->setHwStage(CC, field: ".scratch_en", Type: msgpack::Type::Boolean,
1516	Val: CurrentProgramInfo.ScratchEnable);
1517	EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM,
1518	DynamicVGPRBlockSize: MFI->getDynamicVGPRBlockSize());
1519	}
1520
1521	// ScratchSize is in bytes, 16 aligned.
1522	MD->setScratchSize(
1523	CC,
1524	Val: AMDGPUMCExpr::createAlignTo(Value: CurrentProgramInfo.ScratchSize,
1525	Align: MCConstantExpr::create(Value: `16`, Ctx), Ctx),
1526	Ctx);
1527
1528	if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1529	unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1530	? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: `2`)
1531	: CurrentProgramInfo.LDSBlocks;
1532	if (MD->getPALMajorVersion() < `3`) {
1533	MD->setRsrc2(
1534	CC,
1535	Val: MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
1536	Ctx);
1537	MD->setSpiPsInputEna(MFI->getPSInputEnable());
1538	MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1539	} else {
1540	// Graphics registers
1541	const unsigned ExtraLdsDwGranularity =
1542	STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? `256` : `128`;
1543	MD->setGraphicsRegisters(
1544	field: ".ps_extra_lds_size",
1545	Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1546
1547	// Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1548	static StringLiteral const PsInputFields[] = {
1549	".persp_sample_ena", ".persp_center_ena",
1550	".persp_centroid_ena", ".persp_pull_model_ena",
1551	".linear_sample_ena", ".linear_center_ena",
1552	".linear_centroid_ena", ".line_stipple_tex_ena",
1553	".pos_x_float_ena", ".pos_y_float_ena",
1554	".pos_z_float_ena", ".pos_w_float_ena",
1555	".front_face_ena", ".ancillary_ena",
1556	".sample_coverage_ena", ".pos_fixed_pt_ena"};
1557	unsigned PSInputEna = MFI->getPSInputEnable();
1558	unsigned PSInputAddr = MFI->getPSInputAddr();
1559	for (auto [Idx, Field] : enumerate(First: PsInputFields)) {
1560	MD->setGraphicsRegisters(field1: ".spi_ps_input_ena", field2: Field,
1561	Val: (bool)((PSInputEna >> Idx) & `1`));
1562	MD->setGraphicsRegisters(field1: ".spi_ps_input_addr", field2: Field,
1563	Val: (bool)((PSInputAddr >> Idx) & `1`));
1564	}
1565	}
1566	}
1567
1568	// For version 3 and above the wave front size is already set in the metadata
1569	if (MD->getPALMajorVersion() < `3` && STM.isWave32())
1570	MD->setWave32(MF.getFunction().getCallingConv());
1571	}
1572
1573	void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1574	auto *MD = getTargetStreamer()->getPALMetadata();
1575	const MachineFrameInfo &MFI = MF.getFrameInfo();
1576	StringRef FnName = MF.getFunction().getName();
1577	MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
1578	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1579	MCContext &Ctx = MF.getContext();
1580
1581	if (MD->getPALMajorVersion() < `3`) {
1582	// Set compute registers
1583	MD->setRsrc1(
1584	CC: CallingConv::AMDGPU_CS,
1585	Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1586	MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
1587	Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1588	} else {
1589	EmitPALMetadataCommon(
1590	MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST,
1591	DynamicVGPRBlockSize: MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1592	}
1593
1594	// Set optional info
1595	MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
1596	MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
1597	MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
1598	}
1599
1600	// This is supposed to be log2(Size)
1601	static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1602	switch (Size) {
1603	case `4`:
1604	return AMD_ELEMENT_4_BYTES;
1605	case `8`:
1606	return AMD_ELEMENT_8_BYTES;
1607	case `16`:
1608	return AMD_ELEMENT_16_BYTES;
1609	default:
1610	llvm_unreachable("invalid private_element_size");
1611	}
1612	}
1613
1614	void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1615	const SIProgramInfo &CurrentProgramInfo,
1616	const MachineFunction &MF) const {
1617	const Function &F = MF.getFunction();
1618	assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL \|\|
1619	F.getCallingConv() == CallingConv::SPIR_KERNEL);
1620
1621	const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1622	const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1623	MCContext &Ctx = MF.getContext();
1624
1625	Out.initDefault(STI: &STM, Ctx, /InitMCExpr=/false);
1626
1627	Out.compute_pgm_resource1_registers =
1628	CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx);
1629	Out.compute_pgm_resource2_registers =
1630	CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1631	Out.code_properties \|= AMD_CODE_PROPERTY_IS_PTR64;
1632
1633	Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1634
1635	AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1636	getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1637
1638	const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1639	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1640	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1641	}
1642
1643	if (UserSGPRInfo.hasDispatchPtr())
1644	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1645
1646	if (UserSGPRInfo.hasQueuePtr())
1647	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1648
1649	if (UserSGPRInfo.hasKernargSegmentPtr())
1650	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1651
1652	if (UserSGPRInfo.hasDispatchID())
1653	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1654
1655	if (UserSGPRInfo.hasFlatScratchInit())
1656	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1657
1658	if (UserSGPRInfo.hasPrivateSegmentSize())
1659	Out.code_properties \|= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
1660
1661	if (STM.isXNACKEnabled())
1662	Out.code_properties \|= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1663
1664	Align MaxKernArgAlign;
1665	Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign);
1666	Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1667	Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1668	Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1669	Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1670
1671	// kernarg_segment_alignment is specified as log of the alignment.
1672	// The minimum alignment is 16.
1673	// FIXME: The metadata treats the minimum as 4?
1674	Out.kernarg_segment_alignment = Log2(A: std::max(a: Align (`16`), b: MaxKernArgAlign));
1675	}
1676
1677	bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr MI, unsigned* OpNo,
1678	const char *ExtraCode, raw_ostream &O) {
1679	// First try the generic code, which knows about modifiers like 'c' and 'n'.
1680	if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O))
1681	return false;
1682
1683	if (ExtraCode && ExtraCode[`0`]) {
1684	if (ExtraCode[`1`] != `0`)
1685	return true; // Unknown modifier.
1686
1687	switch (ExtraCode[`0`]) {
1688	case `'r'`:
1689	break;
1690	default:
1691	return true;
1692	}
1693	}
1694
1695	// TODO: Should be able to support other operand types like globals.
1696	const MachineOperand &MO = MI->getOperand(i: OpNo);
1697	if (MO.isReg()) {
1698	AMDGPUInstPrinter::printRegOperand(Reg: MO.getReg(), O,
1699	MRI: *MF->getSubtarget().getRegisterInfo());
1700	return false;
1701	}
1702	if (MO.isImm()) {
1703	int64_t Val = MO.getImm();
1704	if (AMDGPU::isInlinableIntLiteral(Literal: Val)) {
1705	O << Val;
1706	} else if (isUInt<`16`>(x: Val)) {
1707	O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val));
1708	} else if (isUInt<`32`>(x: Val)) {
1709	O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val));
1710	} else {
1711	O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val));
1712	}
1713	return false;
1714	}
1715	return true;
1716	}
1717
1718	void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
1719	AU.addRequired<AMDGPUResourceUsageAnalysisWrapperPass>();
1720	AU.addPreserved<AMDGPUResourceUsageAnalysisWrapperPass>();
1721	AU.addRequired<MachineModuleInfoWrapperPass>();
1722	AU.addPreserved<MachineModuleInfoWrapperPass>();
1723	AsmPrinter::getAnalysisUsage(AU);
1724	}
1725
1726	void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1727	const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1728	bool isModuleEntryFunction, bool hasMAIInsts) {
1729	if (!ORE)
1730	return;
1731
1732	const char *Name = "kernel-resource-usage";
1733	const char *Indent = " ";
1734
1735	// If the remark is not specifically enabled, do not output to yaml
1736	LLVMContext &Ctx = MF.getFunction().getContext();
1737	if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name))
1738	return;
1739
1740	// Currently non-kernel functions have no resources to emit.
1741	if (!isEntryFunctionCC(CC: MF.getFunction().getCallingConv()))
1742	return;
1743
1744	auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1745	StringRef RemarkLabel, auto Argument) {
1746	// Add an indent for every line besides the line with the kernel name. This
1747	// makes it easier to tell which resource usage go with which kernel since
1748	// the kernel name will always be displayed first.
1749	std::string LabelStr = RemarkLabel.str() + ": ";
1750	if (RemarkName != "FunctionName")
1751	LabelStr = Indent + LabelStr;
1752
1753	ORE->emit([&]() {
1754	return MachineOptimizationRemarkAnalysis (Name, RemarkName,
1755	MF.getFunction().getSubprogram(),
1756	&MF.front())
1757	<< LabelStr << ore::NV(RemarkName, Argument);
1758	});
1759	};
1760
1761	// FIXME: Formatting here is pretty nasty because clang does not accept
1762	// newlines from diagnostics. This forces us to emit multiple diagnostic
1763	// remarks to simulate newlines. If and when clang does accept newlines, this
1764	// formatting should be aggregated into one remark with newlines to avoid
1765	// printing multiple diagnostic location and diag opts.
1766	EmitResourceUsageRemark ("FunctionName", "Function Name",
1767	MF.getFunction().getName());
1768	EmitResourceUsageRemark ("NumSGPR", "TotalSGPRs",
1769	getMCExprStr(Value: CurrentProgramInfo.NumSGPR));
1770	EmitResourceUsageRemark ("NumVGPR", "VGPRs",
1771	getMCExprStr(Value: CurrentProgramInfo.NumArchVGPR));
1772	if (hasMAIInsts) {
1773	EmitResourceUsageRemark ("NumAGPR", "AGPRs",
1774	getMCExprStr(Value: CurrentProgramInfo.NumAccVGPR));
1775	}
1776	EmitResourceUsageRemark ("ScratchSize", "ScratchSize [bytes/lane]",
1777	getMCExprStr(Value: CurrentProgramInfo.ScratchSize));
1778	int64_t DynStack;
1779	bool DynStackEvaluatable =
1780	CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(Res&: DynStack);
1781	StringRef DynamicStackStr =
1782	DynStackEvaluatable && DynStack ? "True" : "False";
1783	EmitResourceUsageRemark ("DynamicStack", "Dynamic Stack", DynamicStackStr);
1784	EmitResourceUsageRemark ("Occupancy", "Occupancy [waves/SIMD]",
1785	getMCExprStr(Value: CurrentProgramInfo.Occupancy));
1786	EmitResourceUsageRemark ("SGPRSpill", "SGPRs Spill",
1787	CurrentProgramInfo.SGPRSpill);
1788	EmitResourceUsageRemark ("VGPRSpill", "VGPRs Spill",
1789	CurrentProgramInfo.VGPRSpill);
1790	if (isModuleEntryFunction)
1791	EmitResourceUsageRemark ("BytesLDS", "LDS Size [bytes/block]",
1792	CurrentProgramInfo.LDSSize);
1793	}
1794
1795	char AMDGPUAsmPrinter::ID = `0`;
1796
1797	INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1798	"AMDGPU Assembly Printer", false, false)
1799

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp