1 | //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// |
11 | /// The AMDGPUAsmPrinter is used to print both assembly string and also binary |
12 | /// code. When passed an MCAsmStreamer it prints assembly and when passed |
13 | /// an MCObjectStreamer it outputs binary code. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | // |
17 | |
18 | #include "AMDGPUAsmPrinter.h" |
19 | #include "AMDGPU.h" |
20 | #include "AMDGPUHSAMetadataStreamer.h" |
21 | #include "AMDGPUResourceUsageAnalysis.h" |
22 | #include "GCNSubtarget.h" |
23 | #include "MCTargetDesc/AMDGPUInstPrinter.h" |
24 | #include "MCTargetDesc/AMDGPUMCExpr.h" |
25 | #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" |
26 | #include "MCTargetDesc/AMDGPUTargetStreamer.h" |
27 | #include "R600AsmPrinter.h" |
28 | #include "SIMachineFunctionInfo.h" |
29 | #include "TargetInfo/AMDGPUTargetInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "Utils/AMDKernelCodeTUtils.h" |
32 | #include "Utils/SIDefinesUtils.h" |
33 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
34 | #include "llvm/BinaryFormat/ELF.h" |
35 | #include "llvm/CodeGen/MachineFrameInfo.h" |
36 | #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" |
37 | #include "llvm/IR/DiagnosticInfo.h" |
38 | #include "llvm/MC/MCAssembler.h" |
39 | #include "llvm/MC/MCContext.h" |
40 | #include "llvm/MC/MCSectionELF.h" |
41 | #include "llvm/MC/MCStreamer.h" |
42 | #include "llvm/MC/TargetRegistry.h" |
43 | #include "llvm/Support/AMDHSAKernelDescriptor.h" |
44 | #include "llvm/Target/TargetLoweringObjectFile.h" |
45 | #include "llvm/Target/TargetMachine.h" |
46 | #include "llvm/TargetParser/TargetParser.h" |
47 | |
48 | using namespace llvm; |
49 | using namespace llvm::AMDGPU; |
50 | |
51 | // This should get the default rounding mode from the kernel. We just set the |
52 | // default here, but this could change if the OpenCL rounding mode pragmas are |
53 | // used. |
54 | // |
55 | // The denormal mode here should match what is reported by the OpenCL runtime |
56 | // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but |
57 | // can also be override to flush with the -cl-denorms-are-zero compiler flag. |
58 | // |
59 | // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double |
60 | // precision, and leaves single precision to flush all and does not report |
61 | // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports |
62 | // CL_FP_DENORM for both. |
63 | // |
64 | // FIXME: It seems some instructions do not support single precision denormals |
65 | // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, |
66 | // and sin_f32, cos_f32 on most parts). |
67 | |
68 | // We want to use these instructions, and using fp32 denormals also causes |
69 | // instructions to run at the double precision rate for the device so it's |
70 | // probably best to just report no single precision denormals. |
71 | static uint32_t getFPMode(SIModeRegisterDefaults Mode) { |
72 | return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | |
73 | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | |
74 | FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | |
75 | FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); |
76 | } |
77 | |
78 | static AsmPrinter * |
79 | createAMDGPUAsmPrinterPass(TargetMachine &tm, |
80 | std::unique_ptr<MCStreamer> &&Streamer) { |
81 | return new AMDGPUAsmPrinter(tm, std::move(Streamer)); |
82 | } |
83 | |
84 | extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { |
85 | TargetRegistry::RegisterAsmPrinter(T&: getTheR600Target(), |
86 | Fn: llvm::createR600AsmPrinterPass); |
87 | TargetRegistry::RegisterAsmPrinter(T&: getTheGCNTarget(), |
88 | Fn: createAMDGPUAsmPrinterPass); |
89 | } |
90 | |
91 | AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, |
92 | std::unique_ptr<MCStreamer> Streamer) |
93 | : AsmPrinter(TM, std::move(Streamer)) { |
94 | assert(OutStreamer && "AsmPrinter constructed without streamer" ); |
95 | } |
96 | |
97 | StringRef AMDGPUAsmPrinter::getPassName() const { |
98 | return "AMDGPU Assembly Printer" ; |
99 | } |
100 | |
101 | const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { |
102 | return TM.getMCSubtargetInfo(); |
103 | } |
104 | |
105 | AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { |
106 | if (!OutStreamer) |
107 | return nullptr; |
108 | return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); |
109 | } |
110 | |
111 | void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { |
112 | IsTargetStreamerInitialized = false; |
113 | } |
114 | |
115 | void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { |
116 | IsTargetStreamerInitialized = true; |
117 | |
118 | // TODO: Which one is called first, emitStartOfAsmFile or |
119 | // emitFunctionBodyStart? |
120 | if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) |
121 | initializeTargetID(M); |
122 | |
123 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA && |
124 | TM.getTargetTriple().getOS() != Triple::AMDPAL) |
125 | return; |
126 | |
127 | getTargetStreamer()->EmitDirectiveAMDGCNTarget(); |
128 | |
129 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
130 | getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( |
131 | COV: CodeObjectVersion); |
132 | HSAMetadataStream->begin(Mod: M, TargetID: *getTargetStreamer()->getTargetID()); |
133 | } |
134 | |
135 | if (TM.getTargetTriple().getOS() == Triple::AMDPAL) |
136 | getTargetStreamer()->getPALMetadata()->readFromIR(M); |
137 | } |
138 | |
139 | void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { |
140 | // Init target streamer if it has not yet happened |
141 | if (!IsTargetStreamerInitialized) |
142 | initTargetStreamer(M); |
143 | |
144 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA) |
145 | getTargetStreamer()->EmitISAVersion(); |
146 | |
147 | // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). |
148 | // Emit HSA Metadata (NT_AMD_HSA_METADATA). |
149 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
150 | HSAMetadataStream->end(); |
151 | bool Success = HSAMetadataStream->emitTo(TargetStreamer&: *getTargetStreamer()); |
152 | (void)Success; |
153 | assert(Success && "Malformed HSA Metadata" ); |
154 | } |
155 | } |
156 | |
157 | void AMDGPUAsmPrinter::emitFunctionBodyStart() { |
158 | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
159 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
160 | const Function &F = MF->getFunction(); |
161 | |
162 | // TODO: We're checking this late, would be nice to check it earlier. |
163 | if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) { |
164 | report_fatal_error( |
165 | reason: STM.getCPU() + " is only available on code object version 6 or better" , |
166 | /*gen_crash_diag*/ false); |
167 | } |
168 | |
169 | // TODO: Which one is called first, emitStartOfAsmFile or |
170 | // emitFunctionBodyStart? |
171 | if (!getTargetStreamer()->getTargetID()) |
172 | initializeTargetID(M: *F.getParent()); |
173 | |
174 | const auto &FunctionTargetID = STM.getTargetID(); |
175 | // Make sure function's xnack settings are compatible with module's |
176 | // xnack settings. |
177 | if (FunctionTargetID.isXnackSupported() && |
178 | FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && |
179 | FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { |
180 | OutContext.reportError(L: {}, Msg: "xnack setting of '" + Twine(MF->getName()) + |
181 | "' function does not match module xnack setting" ); |
182 | return; |
183 | } |
184 | // Make sure function's sramecc settings are compatible with module's |
185 | // sramecc settings. |
186 | if (FunctionTargetID.isSramEccSupported() && |
187 | FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && |
188 | FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { |
189 | OutContext.reportError(L: {}, Msg: "sramecc setting of '" + Twine(MF->getName()) + |
190 | "' function does not match module sramecc setting" ); |
191 | return; |
192 | } |
193 | |
194 | if (!MFI.isEntryFunction()) |
195 | return; |
196 | |
197 | if (STM.isMesaKernel(F) && |
198 | (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
199 | F.getCallingConv() == CallingConv::SPIR_KERNEL)) { |
200 | AMDGPUMCKernelCodeT KernelCode; |
201 | getAmdKernelCode(Out&: KernelCode, KernelInfo: CurrentProgramInfo, MF: *MF); |
202 | KernelCode.validate(STI: &STM, Ctx&: MF->getContext()); |
203 | getTargetStreamer()->EmitAMDKernelCodeT(Header&: KernelCode); |
204 | } |
205 | |
206 | if (STM.isAmdHsaOS()) |
207 | HSAMetadataStream->emitKernel(MF: *MF, ProgramInfo: CurrentProgramInfo); |
208 | |
209 | if (MFI.getNumKernargPreloadedSGPRs() > 0) { |
210 | assert(AMDGPU::hasKernargPreload(STM)); |
211 | getTargetStreamer()->EmitKernargPreloadHeader(STI: *getGlobalSTI(), |
212 | TrapEnabled: STM.isAmdHsaOS()); |
213 | } |
214 | } |
215 | |
216 | void AMDGPUAsmPrinter::emitFunctionBodyEnd() { |
217 | const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
218 | if (!MFI.isEntryFunction()) |
219 | return; |
220 | |
221 | if (TM.getTargetTriple().getOS() != Triple::AMDHSA) |
222 | return; |
223 | |
224 | auto &Streamer = getTargetStreamer()->getStreamer(); |
225 | auto &Context = Streamer.getContext(); |
226 | auto &ObjectFileInfo = *Context.getObjectFileInfo(); |
227 | auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); |
228 | |
229 | Streamer.pushSection(); |
230 | Streamer.switchSection(Section: &ReadOnlySection); |
231 | |
232 | // CP microcode requires the kernel descriptor to be allocated on 64 byte |
233 | // alignment. |
234 | Streamer.emitValueToAlignment(Alignment: Align(64), Value: 0, ValueSize: 1, MaxBytesToEmit: 0); |
235 | ReadOnlySection.ensureMinAlignment(MinAlignment: Align(64)); |
236 | |
237 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
238 | |
239 | SmallString<128> KernelName; |
240 | getNameWithPrefix(Name&: KernelName, GV: &MF->getFunction()); |
241 | getTargetStreamer()->EmitAmdhsaKernelDescriptor( |
242 | STI: STM, KernelName, KernelDescriptor: getAmdhsaKernelDescriptor(MF: *MF, PI: CurrentProgramInfo), |
243 | NextVGPR: CurrentProgramInfo.NumVGPRsForWavesPerEU, |
244 | NextSGPR: MCBinaryExpr::createSub( |
245 | LHS: CurrentProgramInfo.NumSGPRsForWavesPerEU, |
246 | RHS: AMDGPUMCExpr::createExtraSGPRs( |
247 | VCCUsed: CurrentProgramInfo.VCCUsed, FlatScrUsed: CurrentProgramInfo.FlatUsed, |
248 | XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx&: Context), |
249 | Ctx&: Context), |
250 | ReserveVCC: CurrentProgramInfo.VCCUsed, ReserveFlatScr: CurrentProgramInfo.FlatUsed); |
251 | |
252 | Streamer.popSection(); |
253 | } |
254 | |
255 | void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { |
256 | Register RegNo = MI->getOperand(i: 0).getReg(); |
257 | |
258 | SmallString<128> Str; |
259 | raw_svector_ostream OS(Str); |
260 | OS << "implicit-def: " |
261 | << printReg(Reg: RegNo, TRI: MF->getSubtarget().getRegisterInfo()); |
262 | |
263 | if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL) |
264 | OS << " : SGPR spill to VGPR lane" ; |
265 | |
266 | OutStreamer->AddComment(T: OS.str()); |
267 | OutStreamer->addBlankLine(); |
268 | } |
269 | |
270 | void AMDGPUAsmPrinter::emitFunctionEntryLabel() { |
271 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
272 | AsmPrinter::emitFunctionEntryLabel(); |
273 | return; |
274 | } |
275 | |
276 | const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
277 | const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); |
278 | if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(F: MF->getFunction())) { |
279 | SmallString<128> SymbolName; |
280 | getNameWithPrefix(Name&: SymbolName, GV: &MF->getFunction()), |
281 | getTargetStreamer()->EmitAMDGPUSymbolType( |
282 | SymbolName, Type: ELF::STT_AMDGPU_HSA_KERNEL); |
283 | } |
284 | if (DumpCodeInstEmitter) { |
285 | // Disassemble function name label to text. |
286 | DisasmLines.push_back(x: MF->getName().str() + ":" ); |
287 | DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size()); |
288 | HexLines.emplace_back(args: "" ); |
289 | } |
290 | |
291 | AsmPrinter::emitFunctionEntryLabel(); |
292 | } |
293 | |
294 | void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { |
295 | if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(MBB: &MBB)) { |
296 | // Write a line for the basic block label if it is not only fallthrough. |
297 | DisasmLines.push_back( |
298 | x: (Twine("BB" ) + Twine(getFunctionNumber()) |
299 | + "_" + Twine(MBB.getNumber()) + ":" ).str()); |
300 | DisasmLineMaxLen = std::max(a: DisasmLineMaxLen, b: DisasmLines.back().size()); |
301 | HexLines.emplace_back(args: "" ); |
302 | } |
303 | AsmPrinter::emitBasicBlockStart(MBB); |
304 | } |
305 | |
306 | void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { |
307 | if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
308 | if (GV->hasInitializer() && !isa<UndefValue>(Val: GV->getInitializer())) { |
309 | OutContext.reportError(L: {}, |
310 | Msg: Twine(GV->getName()) + |
311 | ": unsupported initializer for address space" ); |
312 | return; |
313 | } |
314 | |
315 | // LDS variables aren't emitted in HSA or PAL yet. |
316 | const Triple::OSType OS = TM.getTargetTriple().getOS(); |
317 | if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) |
318 | return; |
319 | |
320 | MCSymbol *GVSym = getSymbol(GV); |
321 | |
322 | GVSym->redefineIfPossible(); |
323 | if (GVSym->isDefined() || GVSym->isVariable()) |
324 | report_fatal_error(reason: "symbol '" + Twine(GVSym->getName()) + |
325 | "' is already defined" ); |
326 | |
327 | const DataLayout &DL = GV->getDataLayout(); |
328 | uint64_t Size = DL.getTypeAllocSize(Ty: GV->getValueType()); |
329 | Align Alignment = GV->getAlign().value_or(u: Align(4)); |
330 | |
331 | emitVisibility(Sym: GVSym, Visibility: GV->getVisibility(), IsDefinition: !GV->isDeclaration()); |
332 | emitLinkage(GV, GVSym); |
333 | auto TS = getTargetStreamer(); |
334 | TS->emitAMDGPULDS(Symbol: GVSym, Size, Alignment); |
335 | return; |
336 | } |
337 | |
338 | AsmPrinter::emitGlobalVariable(GV); |
339 | } |
340 | |
341 | bool AMDGPUAsmPrinter::doInitialization(Module &M) { |
342 | CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); |
343 | |
344 | if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { |
345 | switch (CodeObjectVersion) { |
346 | case AMDGPU::AMDHSA_COV4: |
347 | HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>(); |
348 | break; |
349 | case AMDGPU::AMDHSA_COV5: |
350 | HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>(); |
351 | break; |
352 | case AMDGPU::AMDHSA_COV6: |
353 | HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>(); |
354 | break; |
355 | default: |
356 | report_fatal_error(reason: "Unexpected code object version" ); |
357 | } |
358 | } |
359 | return AsmPrinter::doInitialization(M); |
360 | } |
361 | |
362 | bool AMDGPUAsmPrinter::doFinalization(Module &M) { |
363 | // Pad with s_code_end to help tools and guard against instruction prefetch |
364 | // causing stale data in caches. Arguably this should be done by the linker, |
365 | // which is why this isn't done for Mesa. |
366 | const MCSubtargetInfo &STI = *getGlobalSTI(); |
367 | if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && |
368 | (STI.getTargetTriple().getOS() == Triple::AMDHSA || |
369 | STI.getTargetTriple().getOS() == Triple::AMDPAL)) { |
370 | OutStreamer->switchSection(Section: getObjFileLowering().getTextSection()); |
371 | getTargetStreamer()->EmitCodeEnd(STI); |
372 | } |
373 | |
374 | return AsmPrinter::doFinalization(M); |
375 | } |
376 | |
377 | // Print comments that apply to both callable functions and entry points. |
378 | void AMDGPUAsmPrinter::( |
379 | uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, |
380 | uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, |
381 | const AMDGPUMachineFunction *MFI) { |
382 | OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false); |
383 | OutStreamer->emitRawComment(T: " NumSgprs: " + Twine(NumSGPR), TabPrefix: false); |
384 | OutStreamer->emitRawComment(T: " NumVgprs: " + Twine(NumVGPR), TabPrefix: false); |
385 | if (NumAGPR) { |
386 | OutStreamer->emitRawComment(T: " NumAgprs: " + Twine(*NumAGPR), TabPrefix: false); |
387 | OutStreamer->emitRawComment(T: " TotalNumVgprs: " + Twine(TotalNumVGPR), |
388 | TabPrefix: false); |
389 | } |
390 | OutStreamer->emitRawComment(T: " ScratchSize: " + Twine(ScratchSize), TabPrefix: false); |
391 | OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()), |
392 | TabPrefix: false); |
393 | } |
394 | |
395 | SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { |
396 | SmallString<128> Str; |
397 | raw_svector_ostream OSS(Str); |
398 | int64_t IVal; |
399 | if (Value->evaluateAsAbsolute(Res&: IVal)) { |
400 | OSS << static_cast<uint64_t>(IVal); |
401 | } else { |
402 | Value->print(OS&: OSS, MAI); |
403 | } |
404 | return Str; |
405 | } |
406 | |
407 | void AMDGPUAsmPrinter::( |
408 | const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, |
409 | const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, |
410 | const AMDGPUMachineFunction *MFI) { |
411 | OutStreamer->emitRawComment(T: " codeLenInByte = " + Twine(CodeSize), TabPrefix: false); |
412 | OutStreamer->emitRawComment(T: " NumSgprs: " + getMCExprStr(Value: NumSGPR), TabPrefix: false); |
413 | OutStreamer->emitRawComment(T: " NumVgprs: " + getMCExprStr(Value: NumVGPR), TabPrefix: false); |
414 | if (NumAGPR && TotalNumVGPR) { |
415 | OutStreamer->emitRawComment(T: " NumAgprs: " + getMCExprStr(Value: NumAGPR), TabPrefix: false); |
416 | OutStreamer->emitRawComment(T: " TotalNumVgprs: " + getMCExprStr(Value: TotalNumVGPR), |
417 | TabPrefix: false); |
418 | } |
419 | OutStreamer->emitRawComment(T: " ScratchSize: " + getMCExprStr(Value: ScratchSize), |
420 | TabPrefix: false); |
421 | OutStreamer->emitRawComment(T: " MemoryBound: " + Twine(MFI->isMemoryBound()), |
422 | TabPrefix: false); |
423 | } |
424 | |
425 | const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( |
426 | const MachineFunction &MF) const { |
427 | const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); |
428 | MCContext &Ctx = MF.getContext(); |
429 | uint16_t KernelCodeProperties = 0; |
430 | const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); |
431 | |
432 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
433 | KernelCodeProperties |= |
434 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
435 | } |
436 | if (UserSGPRInfo.hasDispatchPtr()) { |
437 | KernelCodeProperties |= |
438 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
439 | } |
440 | if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { |
441 | KernelCodeProperties |= |
442 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
443 | } |
444 | if (UserSGPRInfo.hasKernargSegmentPtr()) { |
445 | KernelCodeProperties |= |
446 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
447 | } |
448 | if (UserSGPRInfo.hasDispatchID()) { |
449 | KernelCodeProperties |= |
450 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
451 | } |
452 | if (UserSGPRInfo.hasFlatScratchInit()) { |
453 | KernelCodeProperties |= |
454 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
455 | } |
456 | if (UserSGPRInfo.hasPrivateSegmentSize()) { |
457 | KernelCodeProperties |= |
458 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; |
459 | } |
460 | if (MF.getSubtarget<GCNSubtarget>().isWave32()) { |
461 | KernelCodeProperties |= |
462 | amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; |
463 | } |
464 | |
465 | // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be |
466 | // un-evaluatable at this point so it cannot be conditionally checked here. |
467 | // Instead, we'll directly shift the possibly unknown MCExpr into its place |
468 | // and bitwise-or it into KernelCodeProperties. |
469 | const MCExpr *KernelCodePropExpr = |
470 | MCConstantExpr::create(Value: KernelCodeProperties, Ctx); |
471 | const MCExpr *OrValue = MCConstantExpr::create( |
472 | Value: amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx); |
473 | OrValue = MCBinaryExpr::createShl(LHS: CurrentProgramInfo.DynamicCallStack, |
474 | RHS: OrValue, Ctx); |
475 | KernelCodePropExpr = MCBinaryExpr::createOr(LHS: KernelCodePropExpr, RHS: OrValue, Ctx); |
476 | |
477 | return KernelCodePropExpr; |
478 | } |
479 | |
480 | MCKernelDescriptor |
481 | AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, |
482 | const SIProgramInfo &PI) const { |
483 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
484 | const Function &F = MF.getFunction(); |
485 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
486 | MCContext &Ctx = MF.getContext(); |
487 | |
488 | MCKernelDescriptor KernelDescriptor; |
489 | |
490 | KernelDescriptor.group_segment_fixed_size = |
491 | MCConstantExpr::create(Value: PI.LDSSize, Ctx); |
492 | KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; |
493 | |
494 | Align MaxKernArgAlign; |
495 | KernelDescriptor.kernarg_size = MCConstantExpr::create( |
496 | Value: STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign), Ctx); |
497 | |
498 | KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(ST: STM, Ctx); |
499 | KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); |
500 | KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); |
501 | |
502 | int64_t PGRM_Rsrc3 = 1; |
503 | bool EvaluatableRsrc3 = |
504 | CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(Res&: PGRM_Rsrc3); |
505 | (void)PGRM_Rsrc3; |
506 | (void)EvaluatableRsrc3; |
507 | assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 || |
508 | static_cast<uint64_t>(PGRM_Rsrc3) == 0); |
509 | KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; |
510 | |
511 | KernelDescriptor.kernarg_preload = MCConstantExpr::create( |
512 | Value: AMDGPU::hasKernargPreload(STI: STM) ? Info->getNumKernargPreloadedSGPRs() : 0, |
513 | Ctx); |
514 | |
515 | return KernelDescriptor; |
516 | } |
517 | |
518 | bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { |
519 | // Init target streamer lazily on the first function so that previous passes |
520 | // can set metadata. |
521 | if (!IsTargetStreamerInitialized) |
522 | initTargetStreamer(M&: *MF.getFunction().getParent()); |
523 | |
524 | ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); |
525 | CurrentProgramInfo.reset(MF); |
526 | |
527 | const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); |
528 | MCContext &Ctx = MF.getContext(); |
529 | |
530 | // The starting address of all shader programs must be 256 bytes aligned. |
531 | // Regular functions just need the basic required instruction alignment. |
532 | MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); |
533 | |
534 | SetupMachineFunction(MF); |
535 | |
536 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
537 | MCContext &Context = getObjFileLowering().getContext(); |
538 | // FIXME: This should be an explicit check for Mesa. |
539 | if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { |
540 | MCSectionELF *ConfigSection = |
541 | Context.getELFSection(Section: ".AMDGPU.config" , Type: ELF::SHT_PROGBITS, Flags: 0); |
542 | OutStreamer->switchSection(Section: ConfigSection); |
543 | } |
544 | |
545 | if (MFI->isModuleEntryFunction()) { |
546 | getSIProgramInfo(Out&: CurrentProgramInfo, MF); |
547 | } |
548 | |
549 | if (STM.isAmdPalOS()) { |
550 | if (MFI->isEntryFunction()) |
551 | EmitPALMetadata(MF, KernelInfo: CurrentProgramInfo); |
552 | else if (MFI->isModuleEntryFunction()) |
553 | emitPALFunctionMetadata(MF); |
554 | } else if (!STM.isAmdHsaOS()) { |
555 | EmitProgramInfoSI(MF, KernelInfo: CurrentProgramInfo); |
556 | } |
557 | |
558 | DumpCodeInstEmitter = nullptr; |
559 | if (STM.dumpCode()) { |
560 | // For -dumpcode, get the assembler out of the streamer. This only works |
561 | // with -filetype=obj. |
562 | MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); |
563 | if (Assembler) |
564 | DumpCodeInstEmitter = Assembler->getEmitterPtr(); |
565 | } |
566 | |
567 | DisasmLines.clear(); |
568 | HexLines.clear(); |
569 | DisasmLineMaxLen = 0; |
570 | |
571 | emitFunctionBody(); |
572 | |
573 | emitResourceUsageRemarks(MF, CurrentProgramInfo, isModuleEntryFunction: MFI->isModuleEntryFunction(), |
574 | hasMAIInsts: STM.hasMAIInsts()); |
575 | |
576 | if (isVerbose()) { |
577 | MCSectionELF * = |
578 | Context.getELFSection(Section: ".AMDGPU.csdata" , Type: ELF::SHT_PROGBITS, Flags: 0); |
579 | OutStreamer->switchSection(Section: CommentSection); |
580 | |
581 | if (!MFI->isEntryFunction()) { |
582 | OutStreamer->emitRawComment(T: " Function info:" , TabPrefix: false); |
583 | const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = |
584 | ResourceUsage->getResourceInfo(F: &MF.getFunction()); |
585 | emitCommonFunctionComments( |
586 | NumVGPR: Info.NumVGPR, |
587 | NumAGPR: STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(), |
588 | TotalNumVGPR: Info.getTotalNumVGPRs(ST: STM), |
589 | NumSGPR: Info.getTotalNumSGPRs(ST: MF.getSubtarget<GCNSubtarget>()), |
590 | ScratchSize: Info.PrivateSegmentSize, CodeSize: getFunctionCodeSize(MF), MFI); |
591 | return false; |
592 | } |
593 | |
594 | OutStreamer->emitRawComment(T: " Kernel info:" , TabPrefix: false); |
595 | emitCommonFunctionComments( |
596 | NumVGPR: CurrentProgramInfo.NumArchVGPR, |
597 | NumAGPR: STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr, |
598 | TotalNumVGPR: CurrentProgramInfo.NumVGPR, NumSGPR: CurrentProgramInfo.NumSGPR, |
599 | ScratchSize: CurrentProgramInfo.ScratchSize, CodeSize: getFunctionCodeSize(MF), MFI); |
600 | |
601 | OutStreamer->emitRawComment( |
602 | T: " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), TabPrefix: false); |
603 | OutStreamer->emitRawComment( |
604 | T: " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), TabPrefix: false); |
605 | OutStreamer->emitRawComment( |
606 | T: " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + |
607 | " bytes/workgroup (compile time only)" , TabPrefix: false); |
608 | |
609 | OutStreamer->emitRawComment( |
610 | T: " SGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.SGPRBlocks), TabPrefix: false); |
611 | |
612 | OutStreamer->emitRawComment( |
613 | T: " VGPRBlocks: " + getMCExprStr(Value: CurrentProgramInfo.VGPRBlocks), TabPrefix: false); |
614 | |
615 | OutStreamer->emitRawComment( |
616 | T: " NumSGPRsForWavesPerEU: " + |
617 | getMCExprStr(Value: CurrentProgramInfo.NumSGPRsForWavesPerEU), |
618 | TabPrefix: false); |
619 | OutStreamer->emitRawComment( |
620 | T: " NumVGPRsForWavesPerEU: " + |
621 | getMCExprStr(Value: CurrentProgramInfo.NumVGPRsForWavesPerEU), |
622 | TabPrefix: false); |
623 | |
624 | if (STM.hasGFX90AInsts()) { |
625 | const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd( |
626 | LHS: CurrentProgramInfo.AccumOffset, RHS: MCConstantExpr::create(Value: 1, Ctx), Ctx); |
627 | AdjustedAccum = MCBinaryExpr::createMul( |
628 | LHS: AdjustedAccum, RHS: MCConstantExpr::create(Value: 4, Ctx), Ctx); |
629 | OutStreamer->emitRawComment( |
630 | T: " AccumOffset: " + getMCExprStr(Value: AdjustedAccum), TabPrefix: false); |
631 | } |
632 | |
633 | OutStreamer->emitRawComment( |
634 | T: " Occupancy: " + getMCExprStr(Value: CurrentProgramInfo.Occupancy), TabPrefix: false); |
635 | |
636 | OutStreamer->emitRawComment( |
637 | T: " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), TabPrefix: false); |
638 | |
639 | OutStreamer->emitRawComment( |
640 | T: " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + |
641 | getMCExprStr(Value: CurrentProgramInfo.ScratchEnable), |
642 | TabPrefix: false); |
643 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:USER_SGPR: " + |
644 | Twine(CurrentProgramInfo.UserSGPR), |
645 | TabPrefix: false); |
646 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + |
647 | Twine(CurrentProgramInfo.TrapHandlerEnable), |
648 | TabPrefix: false); |
649 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_X_EN: " + |
650 | Twine(CurrentProgramInfo.TGIdXEnable), |
651 | TabPrefix: false); |
652 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + |
653 | Twine(CurrentProgramInfo.TGIdYEnable), |
654 | TabPrefix: false); |
655 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + |
656 | Twine(CurrentProgramInfo.TGIdZEnable), |
657 | TabPrefix: false); |
658 | OutStreamer->emitRawComment(T: " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + |
659 | Twine(CurrentProgramInfo.TIdIGCompCount), |
660 | TabPrefix: false); |
661 | |
662 | [[maybe_unused]] int64_t PGMRSrc3; |
663 | assert(STM.hasGFX90AInsts() || |
664 | (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute( |
665 | PGMRSrc3) && |
666 | static_cast<uint64_t>(PGMRSrc3) == 0)); |
667 | if (STM.hasGFX90AInsts()) { |
668 | OutStreamer->emitRawComment( |
669 | T: " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + |
670 | getMCExprStr(Value: MCKernelDescriptor::bits_get( |
671 | Src: CurrentProgramInfo.ComputePGMRSrc3GFX90A, |
672 | Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, |
673 | Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)), |
674 | TabPrefix: false); |
675 | OutStreamer->emitRawComment( |
676 | T: " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + |
677 | getMCExprStr(Value: MCKernelDescriptor::bits_get( |
678 | Src: CurrentProgramInfo.ComputePGMRSrc3GFX90A, |
679 | Shift: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, |
680 | Mask: amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)), |
681 | TabPrefix: false); |
682 | } |
683 | } |
684 | |
685 | if (DumpCodeInstEmitter) { |
686 | |
687 | OutStreamer->switchSection( |
688 | Section: Context.getELFSection(Section: ".AMDGPU.disasm" , Type: ELF::SHT_PROGBITS, Flags: 0)); |
689 | |
690 | for (size_t i = 0; i < DisasmLines.size(); ++i) { |
691 | std::string = "\n" ; |
692 | if (!HexLines[i].empty()) { |
693 | Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); |
694 | Comment += " ; " + HexLines[i] + "\n" ; |
695 | } |
696 | |
697 | OutStreamer->emitBytes(Data: StringRef(DisasmLines[i])); |
698 | OutStreamer->emitBytes(Data: StringRef(Comment)); |
699 | } |
700 | } |
701 | |
702 | return false; |
703 | } |
704 | |
705 | // TODO: Fold this into emitFunctionBodyStart. |
706 | void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { |
707 | // In the beginning all features are either 'Any' or 'NotSupported', |
708 | // depending on global target features. This will cover empty modules. |
709 | getTargetStreamer()->initializeTargetID(STI: *getGlobalSTI(), |
710 | FeatureString: getGlobalSTI()->getFeatureString()); |
711 | |
712 | // If module is empty, we are done. |
713 | if (M.empty()) |
714 | return; |
715 | |
716 | // If module is not empty, need to find first 'Off' or 'On' feature |
717 | // setting per feature from functions in module. |
718 | for (auto &F : M) { |
719 | auto &TSTargetID = getTargetStreamer()->getTargetID(); |
720 | if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && |
721 | (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) |
722 | break; |
723 | |
724 | const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); |
725 | const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); |
726 | if (TSTargetID->isXnackSupported()) |
727 | if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) |
728 | TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); |
729 | if (TSTargetID->isSramEccSupported()) |
730 | if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) |
731 | TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); |
732 | } |
733 | } |
734 | |
735 | uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { |
736 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
737 | const SIInstrInfo *TII = STM.getInstrInfo(); |
738 | |
739 | uint64_t CodeSize = 0; |
740 | |
741 | for (const MachineBasicBlock &MBB : MF) { |
742 | for (const MachineInstr &MI : MBB) { |
743 | // TODO: CodeSize should account for multiple functions. |
744 | |
745 | // TODO: Should we count size of debug info? |
746 | if (MI.isDebugInstr()) |
747 | continue; |
748 | |
749 | CodeSize += TII->getInstSizeInBytes(MI); |
750 | } |
751 | } |
752 | |
753 | return CodeSize; |
754 | } |
755 | |
756 | void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, |
757 | const MachineFunction &MF) { |
758 | const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = |
759 | ResourceUsage->getResourceInfo(F: &MF.getFunction()); |
760 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
761 | MCContext &Ctx = MF.getContext(); |
762 | |
763 | auto CreateExpr = [&Ctx](int64_t Value) { |
764 | return MCConstantExpr::create(Value, Ctx); |
765 | }; |
766 | |
767 | auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool { |
768 | int64_t Val; |
769 | if (Value->evaluateAsAbsolute(Res&: Val)) { |
770 | Res = Val; |
771 | return true; |
772 | } |
773 | return false; |
774 | }; |
775 | |
776 | ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR); |
777 | ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR); |
778 | ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(ST: STM)); |
779 | ProgInfo.AccumOffset = |
780 | CreateExpr(alignTo(Value: std::max(a: 1, b: Info.NumVGPR), Align: 4) / 4 - 1); |
781 | ProgInfo.TgSplit = STM.isTgSplitEnabled(); |
782 | ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR); |
783 | ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize); |
784 | ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC); |
785 | ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch); |
786 | ProgInfo.DynamicCallStack = |
787 | CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion); |
788 | |
789 | const uint64_t MaxScratchPerWorkitem = |
790 | STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); |
791 | uint64_t ScratchSize; |
792 | if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) && |
793 | ScratchSize > MaxScratchPerWorkitem) { |
794 | DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize, |
795 | MaxScratchPerWorkitem, DS_Error); |
796 | MF.getFunction().getContext().diagnose(DI: DiagStackSize); |
797 | } |
798 | |
799 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
800 | |
801 | // The calculations related to SGPR/VGPR blocks are |
802 | // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be |
803 | // unified. |
804 | const MCExpr * = AMDGPUMCExpr::createExtraSGPRs( |
805 | VCCUsed: ProgInfo.VCCUsed, FlatScrUsed: ProgInfo.FlatUsed, |
806 | XNACKUsed: getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx); |
807 | |
808 | // Check the addressable register limit before we add ExtraSGPRs. |
809 | if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
810 | !STM.hasSGPRInitBug()) { |
811 | unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); |
812 | uint64_t NumSgpr; |
813 | if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && |
814 | NumSgpr > MaxAddressableNumSGPRs) { |
815 | // This can happen due to a compiler bug or when using inline asm. |
816 | LLVMContext &Ctx = MF.getFunction().getContext(); |
817 | DiagnosticInfoResourceLimit Diag( |
818 | MF.getFunction(), "addressable scalar registers" , NumSgpr, |
819 | MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); |
820 | Ctx.diagnose(DI: Diag); |
821 | ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1); |
822 | } |
823 | } |
824 | |
825 | // Account for extra SGPRs and VGPRs reserved for debugger use. |
826 | ProgInfo.NumSGPR = MCBinaryExpr::createAdd(LHS: ProgInfo.NumSGPR, RHS: ExtraSGPRs, Ctx); |
827 | |
828 | const Function &F = MF.getFunction(); |
829 | |
830 | // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave |
831 | // dispatch registers are function args. |
832 | unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; |
833 | |
834 | if (isShader(CC: F.getCallingConv())) { |
835 | bool IsPixelShader = |
836 | F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); |
837 | |
838 | // Calculate the number of VGPR registers based on the SPI input registers |
839 | uint32_t InputEna = 0; |
840 | uint32_t InputAddr = 0; |
841 | unsigned LastEna = 0; |
842 | |
843 | if (IsPixelShader) { |
844 | // Note for IsPixelShader: |
845 | // By this stage, all enabled inputs are tagged in InputAddr as well. |
846 | // We will use InputAddr to determine whether the input counts against the |
847 | // vgpr total and only use the InputEnable to determine the last input |
848 | // that is relevant - if extra arguments are used, then we have to honour |
849 | // the InputAddr for any intermediate non-enabled inputs. |
850 | InputEna = MFI->getPSInputEnable(); |
851 | InputAddr = MFI->getPSInputAddr(); |
852 | |
853 | // We only need to consider input args up to the last used arg. |
854 | assert((InputEna || InputAddr) && |
855 | "PSInputAddr and PSInputEnable should " |
856 | "never both be 0 for AMDGPU_PS shaders" ); |
857 | // There are some rare circumstances where InputAddr is non-zero and |
858 | // InputEna can be set to 0. In this case we default to setting LastEna |
859 | // to 1. |
860 | LastEna = InputEna ? llvm::Log2_32(Value: InputEna) + 1 : 1; |
861 | } |
862 | |
863 | // FIXME: We should be using the number of registers determined during |
864 | // calling convention lowering to legalize the types. |
865 | const DataLayout &DL = F.getDataLayout(); |
866 | unsigned PSArgCount = 0; |
867 | unsigned IntermediateVGPR = 0; |
868 | for (auto &Arg : F.args()) { |
869 | unsigned NumRegs = (DL.getTypeSizeInBits(Ty: Arg.getType()) + 31) / 32; |
870 | if (Arg.hasAttribute(Kind: Attribute::InReg)) { |
871 | WaveDispatchNumSGPR += NumRegs; |
872 | } else { |
873 | // If this is a PS shader and we're processing the PS Input args (first |
874 | // 16 VGPR), use the InputEna and InputAddr bits to define how many |
875 | // VGPRs are actually used. |
876 | // Any extra VGPR arguments are handled as normal arguments (and |
877 | // contribute to the VGPR count whether they're used or not). |
878 | if (IsPixelShader && PSArgCount < 16) { |
879 | if ((1 << PSArgCount) & InputAddr) { |
880 | if (PSArgCount < LastEna) |
881 | WaveDispatchNumVGPR += NumRegs; |
882 | else |
883 | IntermediateVGPR += NumRegs; |
884 | } |
885 | PSArgCount++; |
886 | } else { |
887 | // If there are extra arguments we have to include the allocation for |
888 | // the non-used (but enabled with InputAddr) input arguments |
889 | if (IntermediateVGPR) { |
890 | WaveDispatchNumVGPR += IntermediateVGPR; |
891 | IntermediateVGPR = 0; |
892 | } |
893 | WaveDispatchNumVGPR += NumRegs; |
894 | } |
895 | } |
896 | } |
897 | ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( |
898 | Args: {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); |
899 | |
900 | ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( |
901 | Args: {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); |
902 | |
903 | ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( |
904 | NumAGPR: ProgInfo.NumAccVGPR, NumVGPR: ProgInfo.NumArchVGPR, Ctx); |
905 | } |
906 | |
907 | // Adjust number of registers used to meet default/requested minimum/maximum |
908 | // number of waves per execution unit request. |
909 | unsigned MaxWaves = MFI->getMaxWavesPerEU(); |
910 | ProgInfo.NumSGPRsForWavesPerEU = |
911 | AMDGPUMCExpr::createMax(Args: {ProgInfo.NumSGPR, CreateExpr(1ul), |
912 | CreateExpr(STM.getMinNumSGPRs(WavesPerEU: MaxWaves))}, |
913 | Ctx); |
914 | ProgInfo.NumVGPRsForWavesPerEU = |
915 | AMDGPUMCExpr::createMax(Args: {ProgInfo.NumVGPR, CreateExpr(1ul), |
916 | CreateExpr(STM.getMinNumVGPRs(WavesPerEU: MaxWaves))}, |
917 | Ctx); |
918 | |
919 | if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || |
920 | STM.hasSGPRInitBug()) { |
921 | unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); |
922 | uint64_t NumSgpr; |
923 | if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && |
924 | NumSgpr > MaxAddressableNumSGPRs) { |
925 | // This can happen due to a compiler bug or when using inline asm to use |
926 | // the registers which are usually reserved for vcc etc. |
927 | LLVMContext &Ctx = MF.getFunction().getContext(); |
928 | DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers" , |
929 | NumSgpr, MaxAddressableNumSGPRs, |
930 | DS_Error, DK_ResourceLimit); |
931 | Ctx.diagnose(DI: Diag); |
932 | ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs); |
933 | ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs); |
934 | } |
935 | } |
936 | |
937 | if (STM.hasSGPRInitBug()) { |
938 | ProgInfo.NumSGPR = |
939 | CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); |
940 | ProgInfo.NumSGPRsForWavesPerEU = |
941 | CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); |
942 | } |
943 | |
944 | if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { |
945 | LLVMContext &Ctx = MF.getFunction().getContext(); |
946 | DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs" , |
947 | MFI->getNumUserSGPRs(), |
948 | STM.getMaxNumUserSGPRs(), DS_Error); |
949 | Ctx.diagnose(DI: Diag); |
950 | } |
951 | |
952 | if (MFI->getLDSSize() > |
953 | static_cast<unsigned>(STM.getAddressableLocalMemorySize())) { |
954 | LLVMContext &Ctx = MF.getFunction().getContext(); |
955 | DiagnosticInfoResourceLimit Diag( |
956 | MF.getFunction(), "local memory" , MFI->getLDSSize(), |
957 | STM.getAddressableLocalMemorySize(), DS_Error); |
958 | Ctx.diagnose(DI: Diag); |
959 | } |
960 | // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks: |
961 | // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1 |
962 | auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR, |
963 | unsigned Granule) { |
964 | const MCExpr *OneConst = CreateExpr(1ul); |
965 | const MCExpr *GranuleConst = CreateExpr(Granule); |
966 | const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax(Args: {NumGPR, OneConst}, Ctx); |
967 | const MCExpr *AlignToGPR = |
968 | AMDGPUMCExpr::createAlignTo(Value: MaxNumGPR, Align: GranuleConst, Ctx); |
969 | const MCExpr *DivGPR = |
970 | MCBinaryExpr::createDiv(LHS: AlignToGPR, RHS: GranuleConst, Ctx); |
971 | const MCExpr *SubGPR = MCBinaryExpr::createSub(LHS: DivGPR, RHS: OneConst, Ctx); |
972 | return SubGPR; |
973 | }; |
974 | |
975 | ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU, |
976 | IsaInfo::getSGPREncodingGranule(STI: &STM)); |
977 | ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU, |
978 | IsaInfo::getVGPREncodingGranule(STI: &STM)); |
979 | |
980 | const SIModeRegisterDefaults Mode = MFI->getMode(); |
981 | |
982 | // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode |
983 | // register. |
984 | ProgInfo.FloatMode = getFPMode(Mode); |
985 | |
986 | ProgInfo.IEEEMode = Mode.IEEE; |
987 | |
988 | // Make clamp modifier on NaN input returns 0. |
989 | ProgInfo.DX10Clamp = Mode.DX10Clamp; |
990 | |
991 | unsigned LDSAlignShift; |
992 | if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { |
993 | // LDS is allocated in 64 dword blocks. |
994 | LDSAlignShift = 8; |
995 | } else { |
996 | // LDS is allocated in 128 dword blocks. |
997 | LDSAlignShift = 9; |
998 | } |
999 | |
1000 | ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); |
1001 | ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); |
1002 | |
1003 | ProgInfo.LDSSize = MFI->getLDSSize(); |
1004 | ProgInfo.LDSBlocks = |
1005 | alignTo(Value: ProgInfo.LDSSize, Align: 1ULL << LDSAlignShift) >> LDSAlignShift; |
1006 | |
1007 | // The MCExpr equivalent of divideCeil. |
1008 | auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) { |
1009 | const MCExpr *Ceil = |
1010 | AMDGPUMCExpr::createAlignTo(Value: Numerator, Align: Denominator, Ctx); |
1011 | return MCBinaryExpr::createDiv(LHS: Ceil, RHS: Denominator, Ctx); |
1012 | }; |
1013 | |
1014 | // Scratch is allocated in 64-dword or 256-dword blocks. |
1015 | unsigned ScratchAlignShift = |
1016 | STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; |
1017 | // We need to program the hardware with the amount of scratch memory that |
1018 | // is used by the entire wave. ProgInfo.ScratchSize is the amount of |
1019 | // scratch memory used per thread. |
1020 | ProgInfo.ScratchBlocks = DivideCeil( |
1021 | MCBinaryExpr::createMul(LHS: ProgInfo.ScratchSize, |
1022 | RHS: CreateExpr(STM.getWavefrontSize()), Ctx), |
1023 | CreateExpr(1ULL << ScratchAlignShift)); |
1024 | |
1025 | if (getIsaVersion(GPU: getGlobalSTI()->getCPU()).Major >= 10) { |
1026 | ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; |
1027 | ProgInfo.MemOrdered = 1; |
1028 | } |
1029 | |
1030 | // 0 = X, 1 = XY, 2 = XYZ |
1031 | unsigned TIDIGCompCnt = 0; |
1032 | if (MFI->hasWorkItemIDZ()) |
1033 | TIDIGCompCnt = 2; |
1034 | else if (MFI->hasWorkItemIDY()) |
1035 | TIDIGCompCnt = 1; |
1036 | |
1037 | // The private segment wave byte offset is the last of the system SGPRs. We |
1038 | // initially assumed it was allocated, and may have used it. It shouldn't harm |
1039 | // anything to disable it if we know the stack isn't used here. We may still |
1040 | // have emitted code reading it to initialize scratch, but if that's unused |
1041 | // reading garbage should be OK. |
1042 | ProgInfo.ScratchEnable = MCBinaryExpr::createLOr( |
1043 | LHS: MCBinaryExpr::createGT(LHS: ProgInfo.ScratchBlocks, |
1044 | RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx), |
1045 | RHS: ProgInfo.DynamicCallStack, Ctx); |
1046 | |
1047 | ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); |
1048 | // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. |
1049 | ProgInfo.TrapHandlerEnable = |
1050 | STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); |
1051 | ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); |
1052 | ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); |
1053 | ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); |
1054 | ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo(); |
1055 | ProgInfo.TIdIGCompCount = TIDIGCompCnt; |
1056 | ProgInfo.EXCPEnMSB = 0; |
1057 | // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. |
1058 | ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks; |
1059 | ProgInfo.EXCPEnable = 0; |
1060 | |
1061 | if (STM.hasGFX90AInsts()) { |
1062 | // return ((Dst & ~Mask) | (Value << Shift)) |
1063 | auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, |
1064 | uint32_t Shift) { |
1065 | auto Shft = MCConstantExpr::create(Value: Shift, Ctx); |
1066 | auto Msk = MCConstantExpr::create(Value: Mask, Ctx); |
1067 | Dst = MCBinaryExpr::createAnd(LHS: Dst, RHS: MCUnaryExpr::createNot(Expr: Msk, Ctx), Ctx); |
1068 | Dst = MCBinaryExpr::createOr( |
1069 | LHS: Dst, RHS: MCBinaryExpr::createShl(LHS: Value, RHS: Shft, Ctx), Ctx); |
1070 | return Dst; |
1071 | }; |
1072 | |
1073 | ProgInfo.ComputePGMRSrc3GFX90A = |
1074 | SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset, |
1075 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, |
1076 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); |
1077 | ProgInfo.ComputePGMRSrc3GFX90A = |
1078 | SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit), |
1079 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, |
1080 | amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); |
1081 | } |
1082 | |
1083 | ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( |
1084 | InitOcc: STM.computeOccupancy(F, LDSSize: ProgInfo.LDSSize), NumSGPRs: ProgInfo.NumSGPRsForWavesPerEU, |
1085 | NumVGPRs: ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); |
1086 | |
1087 | const auto [MinWEU, MaxWEU] = |
1088 | AMDGPU::getIntegerPairAttribute(F, Name: "amdgpu-waves-per-eu" , Default: {0, 0}, OnlyFirstRequired: true); |
1089 | uint64_t Occupancy; |
1090 | if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) { |
1091 | DiagnosticInfoOptimizationFailure Diag( |
1092 | F, F.getSubprogram(), |
1093 | "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " |
1094 | "'" + |
1095 | F.getName() + "': desired occupancy was " + Twine(MinWEU) + |
1096 | ", final occupancy is " + Twine(Occupancy)); |
1097 | F.getContext().diagnose(DI: Diag); |
1098 | } |
1099 | } |
1100 | |
1101 | static unsigned getRsrcReg(CallingConv::ID CallConv) { |
1102 | switch (CallConv) { |
1103 | default: [[fallthrough]]; |
1104 | case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; |
1105 | case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; |
1106 | case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; |
1107 | case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; |
1108 | case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; |
1109 | case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; |
1110 | case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; |
1111 | } |
1112 | } |
1113 | |
1114 | void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, |
1115 | const SIProgramInfo &CurrentProgramInfo) { |
1116 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1117 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1118 | unsigned RsrcReg = getRsrcReg(CallConv: MF.getFunction().getCallingConv()); |
1119 | MCContext &Ctx = MF.getContext(); |
1120 | |
1121 | // (((Value) & Mask) << Shift) |
1122 | auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) { |
1123 | const MCExpr *msk = MCConstantExpr::create(Value: Mask, Ctx); |
1124 | const MCExpr *shft = MCConstantExpr::create(Value: Shift, Ctx); |
1125 | return MCBinaryExpr::createShl(LHS: MCBinaryExpr::createAnd(LHS: Value, RHS: msk, Ctx), |
1126 | RHS: shft, Ctx); |
1127 | }; |
1128 | |
1129 | auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) { |
1130 | int64_t Val; |
1131 | if (Value->evaluateAsAbsolute(Res&: Val)) |
1132 | OutStreamer->emitIntValue(Value: static_cast<uint64_t>(Val), Size); |
1133 | else |
1134 | OutStreamer->emitValue(Value, Size); |
1135 | }; |
1136 | |
1137 | if (AMDGPU::isCompute(CC: MF.getFunction().getCallingConv())) { |
1138 | OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); |
1139 | |
1140 | EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx), |
1141 | /*Size=*/4); |
1142 | |
1143 | OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); |
1144 | EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4); |
1145 | |
1146 | OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); |
1147 | |
1148 | // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the |
1149 | // appropriate generation. |
1150 | if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { |
1151 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1152 | /*Mask=*/0x3FFFF, /*Shift=*/12), |
1153 | /*Size=*/4); |
1154 | } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { |
1155 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1156 | /*Mask=*/0x7FFF, /*Shift=*/12), |
1157 | /*Size=*/4); |
1158 | } else { |
1159 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1160 | /*Mask=*/0x1FFF, /*Shift=*/12), |
1161 | /*Size=*/4); |
1162 | } |
1163 | |
1164 | // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = |
1165 | // 0" comment but I don't see a corresponding field in the register spec. |
1166 | } else { |
1167 | OutStreamer->emitInt32(Value: RsrcReg); |
1168 | |
1169 | const MCExpr *GPRBlocks = MCBinaryExpr::createOr( |
1170 | LHS: SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0), |
1171 | RHS: SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6), |
1172 | Ctx&: MF.getContext()); |
1173 | EmitResolvedOrExpr(GPRBlocks, /*Size=*/4); |
1174 | OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); |
1175 | |
1176 | // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the |
1177 | // appropriate generation. |
1178 | if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { |
1179 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1180 | /*Mask=*/0x3FFFF, /*Shift=*/12), |
1181 | /*Size=*/4); |
1182 | } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { |
1183 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1184 | /*Mask=*/0x7FFF, /*Shift=*/12), |
1185 | /*Size=*/4); |
1186 | } else { |
1187 | EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, |
1188 | /*Mask=*/0x1FFF, /*Shift=*/12), |
1189 | /*Size=*/4); |
1190 | } |
1191 | } |
1192 | |
1193 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1194 | OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); |
1195 | unsigned = STM.getGeneration() >= AMDGPUSubtarget::GFX11 |
1196 | ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2) |
1197 | : CurrentProgramInfo.LDSBlocks; |
1198 | OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); |
1199 | OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); |
1200 | OutStreamer->emitInt32(Value: MFI->getPSInputEnable()); |
1201 | OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); |
1202 | OutStreamer->emitInt32(Value: MFI->getPSInputAddr()); |
1203 | } |
1204 | |
1205 | OutStreamer->emitInt32(R_SPILLED_SGPRS); |
1206 | OutStreamer->emitInt32(Value: MFI->getNumSpilledSGPRs()); |
1207 | OutStreamer->emitInt32(R_SPILLED_VGPRS); |
1208 | OutStreamer->emitInt32(Value: MFI->getNumSpilledVGPRs()); |
1209 | } |
1210 | |
1211 | // Helper function to add common PAL Metadata 3.0+ |
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST) {
  // IEEE mode is only reported on targets that actually implement it.
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, field: ".ieee_mode" , Val: (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, field: ".wgp_mode" , Val: (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, field: ".mem_ordered" , Val: (bool)CurrentProgramInfo.MemOrdered);

  // Trap/exception-enable fields only apply to compute stages.
  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, field: ".trap_present" ,
                   Val: (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, field: ".excp_en" , Val: CurrentProgramInfo.EXCPEnable);
  }

  // LdsSize is in hardware allocation granules; convert to bytes using the
  // target's dword granularity.
  MD->setHwStage(CC, field: ".lds_size" ,
                 Val: (unsigned)(CurrentProgramInfo.LdsSize *
                                getLdsDwGranularity(ST) * sizeof(uint32_t)));
}
1231 | |
1232 | // This is the equivalent of EmitProgramInfoSI above, but for when the OS type |
1233 | // is AMDPAL. It stores each compute/SPI register setting and other PAL |
1234 | // metadata items into the PALMD::Metadata, combining with any provided by the |
1235 | // frontend as LLVM metadata. Once all functions are written, the PAL metadata |
1236 | // is then written as a single block in the .note section. |
1237 | void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, |
1238 | const SIProgramInfo &CurrentProgramInfo) { |
1239 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1240 | auto CC = MF.getFunction().getCallingConv(); |
1241 | auto MD = getTargetStreamer()->getPALMetadata(); |
1242 | auto &Ctx = MF.getContext(); |
1243 | |
1244 | MD->setEntryPoint(CC, Name: MF.getFunction().getName()); |
1245 | MD->setNumUsedVgprs(CC, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx); |
1246 | |
1247 | // Only set AGPRs for supported devices |
1248 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1249 | if (STM.hasMAIInsts()) { |
1250 | MD->setNumUsedAgprs(CC, Val: CurrentProgramInfo.NumAccVGPR); |
1251 | } |
1252 | |
1253 | MD->setNumUsedSgprs(CC, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx); |
1254 | if (MD->getPALMajorVersion() < 3) { |
1255 | MD->setRsrc1(CC, Val: CurrentProgramInfo.getPGMRSrc1(CC, ST: STM, Ctx), Ctx); |
1256 | if (AMDGPU::isCompute(CC)) { |
1257 | MD->setRsrc2(CC, Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); |
1258 | } else { |
1259 | const MCExpr *HasScratchBlocks = |
1260 | MCBinaryExpr::createGT(LHS: CurrentProgramInfo.ScratchBlocks, |
1261 | RHS: MCConstantExpr::create(Value: 0, Ctx), Ctx); |
1262 | auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN); |
1263 | MD->setRsrc2(CC, Val: maskShiftSet(Val: HasScratchBlocks, Mask, Shift, Ctx), Ctx); |
1264 | } |
1265 | } else { |
1266 | MD->setHwStage(CC, field: ".debug_mode" , Val: (bool)CurrentProgramInfo.DebugMode); |
1267 | MD->setHwStage(CC, field: ".scratch_en" , Type: msgpack::Type::Boolean, |
1268 | Val: CurrentProgramInfo.ScratchEnable); |
1269 | EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, ST: STM); |
1270 | } |
1271 | |
1272 | // ScratchSize is in bytes, 16 aligned. |
1273 | MD->setScratchSize( |
1274 | CC, |
1275 | Val: AMDGPUMCExpr::createAlignTo(Value: CurrentProgramInfo.ScratchSize, |
1276 | Align: MCConstantExpr::create(Value: 16, Ctx), Ctx), |
1277 | Ctx); |
1278 | |
1279 | if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { |
1280 | unsigned = STM.getGeneration() >= AMDGPUSubtarget::GFX11 |
1281 | ? divideCeil(Numerator: CurrentProgramInfo.LDSBlocks, Denominator: 2) |
1282 | : CurrentProgramInfo.LDSBlocks; |
1283 | if (MD->getPALMajorVersion() < 3) { |
1284 | MD->setRsrc2( |
1285 | CC, |
1286 | Val: MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx), |
1287 | Ctx); |
1288 | MD->setSpiPsInputEna(MFI->getPSInputEnable()); |
1289 | MD->setSpiPsInputAddr(MFI->getPSInputAddr()); |
1290 | } else { |
1291 | // Graphics registers |
1292 | const unsigned = |
1293 | STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128; |
1294 | MD->setGraphicsRegisters( |
1295 | field: ".ps_extra_lds_size" , |
1296 | Val: (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t))); |
1297 | |
1298 | // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr |
1299 | static StringLiteral const PsInputFields[] = { |
1300 | ".persp_sample_ena" , ".persp_center_ena" , |
1301 | ".persp_centroid_ena" , ".persp_pull_model_ena" , |
1302 | ".linear_sample_ena" , ".linear_center_ena" , |
1303 | ".linear_centroid_ena" , ".line_stipple_tex_ena" , |
1304 | ".pos_x_float_ena" , ".pos_y_float_ena" , |
1305 | ".pos_z_float_ena" , ".pos_w_float_ena" , |
1306 | ".front_face_ena" , ".ancillary_ena" , |
1307 | ".sample_coverage_ena" , ".pos_fixed_pt_ena" }; |
1308 | unsigned PSInputEna = MFI->getPSInputEnable(); |
1309 | unsigned PSInputAddr = MFI->getPSInputAddr(); |
1310 | for (auto [Idx, Field] : enumerate(First: PsInputFields)) { |
1311 | MD->setGraphicsRegisters(field1: ".spi_ps_input_ena" , field2: Field, |
1312 | Val: (bool)((PSInputEna >> Idx) & 1)); |
1313 | MD->setGraphicsRegisters(field1: ".spi_ps_input_addr" , field2: Field, |
1314 | Val: (bool)((PSInputAddr >> Idx) & 1)); |
1315 | } |
1316 | } |
1317 | } |
1318 | |
1319 | // For version 3 and above the wave front size is already set in the metadata |
1320 | if (MD->getPALMajorVersion() < 3 && STM.isWave32()) |
1321 | MD->setWave32(MF.getFunction().getCallingConv()); |
1322 | } |
1323 | |
// Record PAL metadata for a non-entry (callable) function, keyed by the
// function's name: stack size, compute RSRC state, LDS size and register
// counts.
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, Val: MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CC: CallingConv::AMDGPU_CS,
        Val: CurrentProgramInfo.getPGMRSrc1(CC: CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CC: CallingConv::AMDGPU_CS,
                 Val: CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    // PAL metadata 3.0+ uses named hardware-stage fields instead of raw
    // RSRC register values.
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC: CallingConv::AMDGPU_CS, ST);
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, Val: CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, Val: CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, Val: CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
1348 | |
1349 | // This is supposed to be log2(Size) |
1350 | static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { |
1351 | switch (Size) { |
1352 | case 4: |
1353 | return AMD_ELEMENT_4_BYTES; |
1354 | case 8: |
1355 | return AMD_ELEMENT_8_BYTES; |
1356 | case 16: |
1357 | return AMD_ELEMENT_16_BYTES; |
1358 | default: |
1359 | llvm_unreachable("invalid private_element_size" ); |
1360 | } |
1361 | } |
1362 | |
1363 | void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, |
1364 | const SIProgramInfo &CurrentProgramInfo, |
1365 | const MachineFunction &MF) const { |
1366 | const Function &F = MF.getFunction(); |
1367 | assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || |
1368 | F.getCallingConv() == CallingConv::SPIR_KERNEL); |
1369 | |
1370 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1371 | const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); |
1372 | MCContext &Ctx = MF.getContext(); |
1373 | |
1374 | Out.initDefault(STI: &STM, Ctx, /*InitMCExpr=*/false); |
1375 | |
1376 | Out.compute_pgm_resource1_registers = |
1377 | CurrentProgramInfo.getComputePGMRSrc1(ST: STM, Ctx); |
1378 | Out.compute_pgm_resource2_registers = |
1379 | CurrentProgramInfo.getComputePGMRSrc2(Ctx); |
1380 | Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; |
1381 | |
1382 | Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack; |
1383 | |
1384 | AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, |
1385 | getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); |
1386 | |
1387 | const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); |
1388 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
1389 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; |
1390 | } |
1391 | |
1392 | if (UserSGPRInfo.hasDispatchPtr()) |
1393 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1394 | |
1395 | if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) |
1396 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; |
1397 | |
1398 | if (UserSGPRInfo.hasKernargSegmentPtr()) |
1399 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; |
1400 | |
1401 | if (UserSGPRInfo.hasDispatchID()) |
1402 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; |
1403 | |
1404 | if (UserSGPRInfo.hasFlatScratchInit()) |
1405 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; |
1406 | |
1407 | if (UserSGPRInfo.hasPrivateSegmentSize()) |
1408 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; |
1409 | |
1410 | if (UserSGPRInfo.hasDispatchPtr()) |
1411 | Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; |
1412 | |
1413 | if (STM.isXNACKEnabled()) |
1414 | Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; |
1415 | |
1416 | Align MaxKernArgAlign; |
1417 | Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxAlign&: MaxKernArgAlign); |
1418 | Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; |
1419 | Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; |
1420 | Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; |
1421 | Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; |
1422 | |
1423 | // kernarg_segment_alignment is specified as log of the alignment. |
1424 | // The minimum alignment is 16. |
1425 | // FIXME: The metadata treats the minimum as 4? |
1426 | Out.kernarg_segment_alignment = Log2(A: std::max(a: Align(16), b: MaxKernArgAlign)); |
1427 | } |
1428 | |
1429 | bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, |
1430 | const char *, raw_ostream &O) { |
1431 | // First try the generic code, which knows about modifiers like 'c' and 'n'. |
1432 | if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, OS&: O)) |
1433 | return false; |
1434 | |
1435 | if (ExtraCode && ExtraCode[0]) { |
1436 | if (ExtraCode[1] != 0) |
1437 | return true; // Unknown modifier. |
1438 | |
1439 | switch (ExtraCode[0]) { |
1440 | case 'r': |
1441 | break; |
1442 | default: |
1443 | return true; |
1444 | } |
1445 | } |
1446 | |
1447 | // TODO: Should be able to support other operand types like globals. |
1448 | const MachineOperand &MO = MI->getOperand(i: OpNo); |
1449 | if (MO.isReg()) { |
1450 | AMDGPUInstPrinter::printRegOperand(RegNo: MO.getReg(), O, |
1451 | MRI: *MF->getSubtarget().getRegisterInfo()); |
1452 | return false; |
1453 | } |
1454 | if (MO.isImm()) { |
1455 | int64_t Val = MO.getImm(); |
1456 | if (AMDGPU::isInlinableIntLiteral(Literal: Val)) { |
1457 | O << Val; |
1458 | } else if (isUInt<16>(x: Val)) { |
1459 | O << format(Fmt: "0x%" PRIx16, Vals: static_cast<uint16_t>(Val)); |
1460 | } else if (isUInt<32>(x: Val)) { |
1461 | O << format(Fmt: "0x%" PRIx32, Vals: static_cast<uint32_t>(Val)); |
1462 | } else { |
1463 | O << format(Fmt: "0x%" PRIx64, Vals: static_cast<uint64_t>(Val)); |
1464 | } |
1465 | return false; |
1466 | } |
1467 | return true; |
1468 | } |
1469 | |
/// Declare this printer's pass dependencies: resource usage (register counts,
/// scratch size, etc.) is computed by AMDGPUResourceUsageAnalysis and must be
/// available before assembly emission.
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  // This pass does not invalidate the analysis results.
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  // Chain to the base class so common AsmPrinter dependencies are declared.
  AsmPrinter::getAnalysisUsage(AU);
}
1475 | |
1476 | void AMDGPUAsmPrinter::( |
1477 | const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, |
1478 | bool isModuleEntryFunction, bool hasMAIInsts) { |
1479 | if (!ORE) |
1480 | return; |
1481 | |
1482 | const char *Name = "kernel-resource-usage" ; |
1483 | const char *Indent = " " ; |
1484 | |
1485 | // If the remark is not specifically enabled, do not output to yaml |
1486 | LLVMContext &Ctx = MF.getFunction().getContext(); |
1487 | if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(PassName: Name)) |
1488 | return; |
1489 | |
1490 | // Currently non-kernel functions have no resources to emit. |
1491 | if (!isEntryFunctionCC(CC: MF.getFunction().getCallingConv())) |
1492 | return; |
1493 | |
1494 | auto = [&](StringRef , |
1495 | StringRef , auto Argument) { |
1496 | // Add an indent for every line besides the line with the kernel name. This |
1497 | // makes it easier to tell which resource usage go with which kernel since |
1498 | // the kernel name will always be displayed first. |
1499 | std::string LabelStr = RemarkLabel.str() + ": " ; |
1500 | if (RemarkName != "FunctionName" ) |
1501 | LabelStr = Indent + LabelStr; |
1502 | |
1503 | ORE->emit([&]() { |
1504 | return MachineOptimizationRemarkAnalysis(Name, RemarkName, |
1505 | MF.getFunction().getSubprogram(), |
1506 | &MF.front()) |
1507 | << LabelStr << ore::NV(RemarkName, Argument); |
1508 | }); |
1509 | }; |
1510 | |
1511 | // FIXME: Formatting here is pretty nasty because clang does not accept |
1512 | // newlines from diagnostics. This forces us to emit multiple diagnostic |
1513 | // remarks to simulate newlines. If and when clang does accept newlines, this |
1514 | // formatting should be aggregated into one remark with newlines to avoid |
1515 | // printing multiple diagnostic location and diag opts. |
1516 | EmitResourceUsageRemark("FunctionName" , "Function Name" , |
1517 | MF.getFunction().getName()); |
1518 | EmitResourceUsageRemark("NumSGPR" , "SGPRs" , |
1519 | getMCExprStr(Value: CurrentProgramInfo.NumSGPR)); |
1520 | EmitResourceUsageRemark("NumVGPR" , "VGPRs" , |
1521 | getMCExprStr(Value: CurrentProgramInfo.NumArchVGPR)); |
1522 | if (hasMAIInsts) { |
1523 | EmitResourceUsageRemark("NumAGPR" , "AGPRs" , |
1524 | getMCExprStr(Value: CurrentProgramInfo.NumAccVGPR)); |
1525 | } |
1526 | EmitResourceUsageRemark("ScratchSize" , "ScratchSize [bytes/lane]" , |
1527 | getMCExprStr(Value: CurrentProgramInfo.ScratchSize)); |
1528 | int64_t DynStack; |
1529 | bool DynStackEvaluatable = |
1530 | CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(Res&: DynStack); |
1531 | StringRef DynamicStackStr = |
1532 | DynStackEvaluatable && DynStack ? "True" : "False" ; |
1533 | EmitResourceUsageRemark("DynamicStack" , "Dynamic Stack" , DynamicStackStr); |
1534 | EmitResourceUsageRemark("Occupancy" , "Occupancy [waves/SIMD]" , |
1535 | getMCExprStr(Value: CurrentProgramInfo.Occupancy)); |
1536 | EmitResourceUsageRemark("SGPRSpill" , "SGPRs Spill" , |
1537 | CurrentProgramInfo.SGPRSpill); |
1538 | EmitResourceUsageRemark("VGPRSpill" , "VGPRs Spill" , |
1539 | CurrentProgramInfo.VGPRSpill); |
1540 | if (isModuleEntryFunction) |
1541 | EmitResourceUsageRemark("BytesLDS" , "LDS Size [bytes/block]" , |
1542 | CurrentProgramInfo.LDSSize); |
1543 | } |
1544 | |