1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIInstrInfo.h"
23#include "SIMachineFunctionInfo.h"
24#include "SIRegisterInfo.h"
25#include "Utils/AMDGPUBaseInfo.h"
26#include "llvm/ADT/ScopeExit.h"
27#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31#include "llvm/CodeGen/GlobalISel/Utils.h"
32#include "llvm/CodeGen/TargetOpcodes.h"
33#include "llvm/IR/DiagnosticInfo.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
46static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(Val: false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(Value: NElts);
59 return Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Value: Bits);
66 return LLT::scalar(SizeInBits: Pow2Bits);
67}
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(NumElements: Ty.getNumElements() + 1, ScalarTy: EltTy));
107 };
108}
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 EC: ElementCount::getFixed(MinVal: NewNumElts), ScalarTy: EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: NewNumElts * EltSize))
155 break;
156 }
157 return std::pair(TypeIdx,
158 LLT::fixed_vector(NumElements: NewNumElts, ScalarTy: Ty.getElementType()));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(SizeInBits: 128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(EC: NumElems, ScalarTy: LLT::scalar(SizeInBits: 128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(NumElements: 4, ScalarTy: LLT::scalar(SizeInBits: 32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElements: NumElems * 4, ScalarTy: LLT::scalar(SizeInBits: 32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(SizeInBits: Size);
183 }
184
185 return LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: Size / 32), ScalarSize: 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
227 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
228 Size <= MaxRegisterSize;
229}
230
231static bool isRegisterVectorElementType(LLT EltTy) {
232 const int EltSize = EltTy.getSizeInBits();
233 return EltSize == 16 || EltSize % 32 == 0;
234}
235
236static bool isRegisterVectorType(LLT Ty) {
237 const int EltSize = Ty.getElementType().getSizeInBits();
238 return EltSize == 32 || EltSize == 64 ||
239 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
240 EltSize == 128 || EltSize == 256;
241}
242
243// TODO: replace all uses of isRegisterType with isRegisterClassType
244static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
245 if (!isRegisterSize(ST, Size: Ty.getSizeInBits()))
246 return false;
247
248 if (Ty.isVector())
249 return isRegisterVectorType(Ty);
250
251 return true;
252}
253
254// Any combination of 32 or 64-bit elements up the maximum register size, and
255// multiples of v2s16.
256static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
257 unsigned TypeIdx) {
258 return [=, &ST](const LegalityQuery &Query) {
259 return isRegisterType(ST, Ty: Query.Types[TypeIdx]);
260 };
261}
262
263// RegisterType that doesn't have a corresponding RegClass.
264// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
265// should be removed.
266static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
267 unsigned TypeIdx) {
268 return [=, &ST](const LegalityQuery &Query) {
269 LLT Ty = Query.Types[TypeIdx];
270 return isRegisterType(ST, Ty) &&
271 !SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: Ty.getSizeInBits());
272 };
273}
274
275static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
276 return [=](const LegalityQuery &Query) {
277 const LLT QueryTy = Query.Types[TypeIdx];
278 if (!QueryTy.isVector())
279 return false;
280 const LLT EltTy = QueryTy.getElementType();
281 return EltTy == LLT::scalar(SizeInBits: 16) || EltTy.getSizeInBits() >= 32;
282 };
283}
284
285constexpr LLT S1 = LLT::scalar(SizeInBits: 1);
286constexpr LLT S8 = LLT::scalar(SizeInBits: 8);
287constexpr LLT S16 = LLT::scalar(SizeInBits: 16);
288constexpr LLT S32 = LLT::scalar(SizeInBits: 32);
289constexpr LLT F32 = LLT::float32();
290constexpr LLT S64 = LLT::scalar(SizeInBits: 64);
291constexpr LLT F64 = LLT::float64();
292constexpr LLT S96 = LLT::scalar(SizeInBits: 96);
293constexpr LLT S128 = LLT::scalar(SizeInBits: 128);
294constexpr LLT S160 = LLT::scalar(SizeInBits: 160);
295constexpr LLT S192 = LLT::scalar(SizeInBits: 192);
296constexpr LLT S224 = LLT::scalar(SizeInBits: 224);
297constexpr LLT S256 = LLT::scalar(SizeInBits: 256);
298constexpr LLT S512 = LLT::scalar(SizeInBits: 512);
299constexpr LLT S1024 = LLT::scalar(SizeInBits: 1024);
300constexpr LLT MaxScalar = LLT::scalar(SizeInBits: MaxRegisterSize);
301
302constexpr LLT V2S8 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 8);
303constexpr LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
304constexpr LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
305constexpr LLT V6S16 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 16);
306constexpr LLT V8S16 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16);
307constexpr LLT V10S16 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 16);
308constexpr LLT V12S16 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 16);
309constexpr LLT V16S16 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 16);
310
311constexpr LLT V2F16 = LLT::fixed_vector(NumElements: 2, ScalarTy: LLT::float16());
312constexpr LLT V2BF16 = V2F16; // FIXME
313
314constexpr LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
315constexpr LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
316constexpr LLT V4S32 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32);
317constexpr LLT V5S32 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 32);
318constexpr LLT V6S32 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 32);
319constexpr LLT V7S32 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 32);
320constexpr LLT V8S32 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 32);
321constexpr LLT V9S32 = LLT::fixed_vector(NumElements: 9, ScalarSizeInBits: 32);
322constexpr LLT V10S32 = LLT::fixed_vector(NumElements: 10, ScalarSizeInBits: 32);
323constexpr LLT V11S32 = LLT::fixed_vector(NumElements: 11, ScalarSizeInBits: 32);
324constexpr LLT V12S32 = LLT::fixed_vector(NumElements: 12, ScalarSizeInBits: 32);
325constexpr LLT V16S32 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 32);
326constexpr LLT V32S32 = LLT::fixed_vector(NumElements: 32, ScalarSizeInBits: 32);
327
328constexpr LLT V2S64 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
329constexpr LLT V3S64 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 64);
330constexpr LLT V4S64 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 64);
331constexpr LLT V5S64 = LLT::fixed_vector(NumElements: 5, ScalarSizeInBits: 64);
332constexpr LLT V6S64 = LLT::fixed_vector(NumElements: 6, ScalarSizeInBits: 64);
333constexpr LLT V7S64 = LLT::fixed_vector(NumElements: 7, ScalarSizeInBits: 64);
334constexpr LLT V8S64 = LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 64);
335constexpr LLT V16S64 = LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 64);
336
337constexpr LLT V2S128 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 128);
338constexpr LLT V4S128 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 128);
339
340constexpr std::initializer_list<LLT> AllScalarTypes = {
341 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
342
343constexpr std::initializer_list<LLT> AllS16Vectors{
344 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
345
346constexpr std::initializer_list<LLT> AllS32Vectors = {
347 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
348 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
349
350constexpr std::initializer_list<LLT> AllS64Vectors = {
351 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
352
353constexpr std::initializer_list<LLT> AllVectors{
354 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
355 V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
356 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
357 V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
358
359// Checks whether a type is in the list of legal register types.
360static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
361 if (Ty.isPointerOrPointerVector())
362 Ty = Ty.changeElementType(NewEltTy: LLT::scalar(SizeInBits: Ty.getScalarSizeInBits()));
363
364 return is_contained(Set: AllS32Vectors, Element: Ty) || is_contained(Set: AllS64Vectors, Element: Ty) ||
365 is_contained(Set: AllScalarTypes, Element: Ty) ||
366 (ST.useRealTrue16Insts() && Ty == S16) ||
367 is_contained(Set: AllS16Vectors, Element: Ty);
368}
369
370static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
371 unsigned TypeIdx) {
372 return [&ST, TypeIdx](const LegalityQuery &Query) {
373 return isRegisterClassType(ST, Ty: Query.Types[TypeIdx]);
374 };
375}
376
377// If we have a truncating store or an extending load with a data size larger
378// than 32-bits, we need to reduce to a 32-bit type.
379static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
380 return [=](const LegalityQuery &Query) {
381 const LLT Ty = Query.Types[TypeIdx];
382 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
383 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
384 };
385}
386
387// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
388// handle some operations by just promoting the register during
389// selection. There are also d16 loads on GFX9+ which preserve the high bits.
390static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
391 bool IsLoad, bool IsAtomic) {
392 switch (AS) {
393 case AMDGPUAS::PRIVATE_ADDRESS:
394 // FIXME: Private element size.
395 return ST.enableFlatScratch() ? 128 : 32;
396 case AMDGPUAS::LOCAL_ADDRESS:
397 return ST.useDS128() ? 128 : 64;
398 case AMDGPUAS::GLOBAL_ADDRESS:
399 case AMDGPUAS::CONSTANT_ADDRESS:
400 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
401 case AMDGPUAS::BUFFER_RESOURCE:
402 // Treat constant and global as identical. SMRD loads are sometimes usable for
403 // global loads (ideally constant address space should be eliminated)
404 // depending on the context. Legality cannot be context dependent, but
405 // RegBankSelect can split the load as necessary depending on the pointer
406 // register bank/uniformity and if the memory is invariant or not written in a
407 // kernel.
408 return IsLoad ? 512 : 128;
409 default:
410 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
411 // if they may alias scratch depending on the subtarget. This needs to be
412 // moved to custom handling to use addressMayBeAccessedAsPrivate
413 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
414 }
415}
416
417static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
418 const LegalityQuery &Query) {
419 const LLT Ty = Query.Types[0];
420
421 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
422 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
423
424 unsigned RegSize = Ty.getSizeInBits();
425 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
426 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
427 unsigned AS = Query.Types[1].getAddressSpace();
428
429 // All of these need to be custom lowered to cast the pointer operand.
430 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
431 return false;
432
433 // Do not handle extending vector loads.
434 if (Ty.isVector() && MemSize != RegSize)
435 return false;
436
437 // TODO: We should be able to widen loads if the alignment is high enough, but
438 // we also need to modify the memory access size.
439#if 0
440 // Accept widening loads based on alignment.
441 if (IsLoad && MemSize < Size)
442 MemSize = std::max(MemSize, Align);
443#endif
444
445 // Only 1-byte and 2-byte to 32-bit extloads are valid.
446 if (MemSize != RegSize && RegSize != 32)
447 return false;
448
449 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
450 IsAtomic: Query.MMODescrs[0].Ordering !=
451 AtomicOrdering::NotAtomic))
452 return false;
453
454 switch (MemSize) {
455 case 8:
456 case 16:
457 case 32:
458 case 64:
459 case 128:
460 break;
461 case 96:
462 if (!ST.hasDwordx3LoadStores())
463 return false;
464 break;
465 case 256:
466 case 512:
467 // These may contextually need to be broken down.
468 break;
469 default:
470 return false;
471 }
472
473 assert(RegSize >= MemSize);
474
475 if (AlignBits < MemSize) {
476 const SITargetLowering *TLI = ST.getTargetLowering();
477 if (!TLI->allowsMisalignedMemoryAccessesImpl(Size: MemSize, AddrSpace: AS,
478 Alignment: Align(AlignBits / 8)))
479 return false;
480 }
481
482 return true;
483}
484
485// The newer buffer intrinsic forms take their resource arguments as
486// pointers in address space 8, aka s128 values. However, in order to not break
487// SelectionDAG, the underlying operations have to continue to take v4i32
488// arguments. Therefore, we convert resource pointers - or vectors of them
489// to integer values here.
490static bool hasBufferRsrcWorkaround(const LLT Ty) {
491 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
492 return true;
493 if (Ty.isVector()) {
494 const LLT ElemTy = Ty.getElementType();
495 return hasBufferRsrcWorkaround(Ty: ElemTy);
496 }
497 return false;
498}
499
500// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
501// workaround this. Eventually it should ignore the type for loads and only care
502// about the size. Return true in cases where we will workaround this for now by
503// bitcasting.
504static bool loadStoreBitcastWorkaround(const LLT Ty) {
505 if (EnableNewLegality)
506 return false;
507
508 const unsigned Size = Ty.getSizeInBits();
509 if (Ty.isPointerVector())
510 return true;
511 if (Size <= 64)
512 return false;
513 // Address space 8 pointers get their own workaround.
514 if (hasBufferRsrcWorkaround(Ty))
515 return false;
516 if (!Ty.isVector())
517 return true;
518
519 unsigned EltSize = Ty.getScalarSizeInBits();
520 return EltSize != 32 && EltSize != 64;
521}
522
523static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
524 const LLT Ty = Query.Types[0];
525 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
526 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
527}
528
529/// Return true if a load or store of the type should be lowered with a bitcast
530/// to a different type.
531static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
532 const LLT MemTy) {
533 const unsigned MemSizeInBits = MemTy.getSizeInBits();
534 const unsigned Size = Ty.getSizeInBits();
535 if (Size != MemSizeInBits)
536 return Size <= 32 && Ty.isVector();
537
538 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
539 return true;
540
541 // Don't try to handle bitcasting vector ext loads for now.
542 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
543 (Size <= 32 || isRegisterSize(ST, Size)) &&
544 !isRegisterVectorElementType(EltTy: Ty.getElementType());
545}
546
547/// Return true if we should legalize a load by widening an odd sized memory
548/// access up to the alignment. Note this case when the memory access itself
549/// changes, not the size of the result register.
550static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
551 uint64_t AlignInBits, unsigned AddrSpace,
552 unsigned Opcode) {
553 unsigned SizeInBits = MemoryTy.getSizeInBits();
554 // We don't want to widen cases that are naturally legal.
555 if (isPowerOf2_32(Value: SizeInBits))
556 return false;
557
558 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
559 // end up widening these for a scalar load during RegBankSelect, if we don't
560 // have 96-bit scalar loads.
561 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
562 return false;
563
564 if (SizeInBits >= maxSizeForAddrSpace(ST, AS: AddrSpace, IsLoad: Opcode, IsAtomic: false))
565 return false;
566
567 // A load is known dereferenceable up to the alignment, so it's legal to widen
568 // to it.
569 //
570 // TODO: Could check dereferenceable for less aligned cases.
571 unsigned RoundedSize = NextPowerOf2(A: SizeInBits);
572 if (AlignInBits < RoundedSize)
573 return false;
574
575 // Do not widen if it would introduce a slow unaligned load.
576 const SITargetLowering *TLI = ST.getTargetLowering();
577 unsigned Fast = 0;
578 return TLI->allowsMisalignedMemoryAccessesImpl(
579 Size: RoundedSize, AddrSpace, Alignment: Align(AlignInBits / 8),
580 Flags: MachineMemOperand::MOLoad, IsFast: &Fast) &&
581 Fast;
582}
583
584static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
585 unsigned Opcode) {
586 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
587 return false;
588
589 return shouldWidenLoad(ST, MemoryTy: Query.MMODescrs[0].MemoryTy,
590 AlignInBits: Query.MMODescrs[0].AlignInBits,
591 AddrSpace: Query.Types[1].getAddressSpace(), Opcode);
592}
593
594/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
595/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
596/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
597static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
598 MachineRegisterInfo &MRI, unsigned Idx) {
599 MachineOperand &MO = MI.getOperand(i: Idx);
600
601 const LLT PointerTy = MRI.getType(Reg: MO.getReg());
602
603 // Paranoidly prevent us from doing this multiple times.
604 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
605 return PointerTy;
606
607 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
608 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
609 if (!PointerTy.isVector()) {
610 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
611 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
612 const LLT S32 = LLT::scalar(SizeInBits: 32);
613
614 Register VectorReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
615 std::array<Register, 4> VectorElems;
616 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
617 for (unsigned I = 0; I < NumParts; ++I)
618 VectorElems[I] =
619 B.buildExtractVectorElementConstant(Res: S32, Val: VectorReg, Idx: I).getReg(Idx: 0);
620 B.buildMergeValues(Res: MO, Ops: VectorElems);
621 MO.setReg(VectorReg);
622 return VectorTy;
623 }
624 Register BitcastReg = MRI.createGenericVirtualRegister(Ty: VectorTy);
625 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
626 auto Scalar = B.buildBitcast(Dst: ScalarTy, Src: BitcastReg);
627 B.buildIntToPtr(Dst: MO, Src: Scalar);
628 MO.setReg(BitcastReg);
629
630 return VectorTy;
631}
632
633/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
634/// the form in which the value must be in order to be passed to the low-level
635/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
636/// needed in order to account for the fact that we can't define a register
637/// class for s128 without breaking SelectionDAG.
638static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
639 MachineRegisterInfo &MRI = *B.getMRI();
640 const LLT PointerTy = MRI.getType(Reg: Pointer);
641 const LLT ScalarTy = getBufferRsrcScalarType(Ty: PointerTy);
642 const LLT VectorTy = getBufferRsrcRegisterType(Ty: PointerTy);
643
644 if (!PointerTy.isVector()) {
645 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
646 SmallVector<Register, 4> PointerParts;
647 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
648 auto Unmerged = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: Pointer);
649 for (unsigned I = 0; I < NumParts; ++I)
650 PointerParts.push_back(Elt: Unmerged.getReg(Idx: I));
651 return B.buildBuildVector(Res: VectorTy, Ops: PointerParts).getReg(Idx: 0);
652 }
653 Register Scalar = B.buildPtrToInt(Dst: ScalarTy, Src: Pointer).getReg(Idx: 0);
654 return B.buildBitcast(Dst: VectorTy, Src: Scalar).getReg(Idx: 0);
655}
656
657static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
658 unsigned Idx) {
659 MachineOperand &MO = MI.getOperand(i: Idx);
660
661 const LLT PointerTy = B.getMRI()->getType(Reg: MO.getReg());
662 // Paranoidly prevent us from doing this multiple times.
663 if (!hasBufferRsrcWorkaround(Ty: PointerTy))
664 return;
665 MO.setReg(castBufferRsrcToV4I32(Pointer: MO.getReg(), B));
666}
667
668AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
669 const GCNTargetMachine &TM)
670 : ST(ST_) {
671 using namespace TargetOpcode;
672
673 auto GetAddrSpacePtr = [&TM](unsigned AS) {
674 return LLT::pointer(AddressSpace: AS, SizeInBits: TM.getPointerSizeInBits(AS));
675 };
676
677 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
678 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
679 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
680 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
681 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
682 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
683 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
684 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
685 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
686 const LLT BufferStridedPtr =
687 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
688
689 const LLT CodePtr = FlatPtr;
690
691 const std::initializer_list<LLT> AddrSpaces64 = {
692 GlobalPtr, ConstantPtr, FlatPtr
693 };
694
695 const std::initializer_list<LLT> AddrSpaces32 = {
696 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
697 };
698
699 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
700
701 const std::initializer_list<LLT> FPTypesBase = {
702 S32, S64
703 };
704
705 const std::initializer_list<LLT> FPTypes16 = {
706 S32, S64, S16
707 };
708
709 const std::initializer_list<LLT> FPTypesPK16 = {
710 S32, S64, S16, V2S16
711 };
712
713 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
714
715 // s1 for VCC branches, s32 for SCC branches.
716 getActionDefinitionsBuilder(Opcode: G_BRCOND).legalFor(Types: {S1, S32});
717
718 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
719 // elements for v3s16
720 getActionDefinitionsBuilder(Opcode: G_PHI)
721 .legalFor(Types: {S32, S64, V2S16, S16, V4S16, S1, S128, S256})
722 .legalFor(Types: AllS32Vectors)
723 .legalFor(Types: AllS64Vectors)
724 .legalFor(Types: AddrSpaces64)
725 .legalFor(Types: AddrSpaces32)
726 .legalFor(Types: AddrSpaces128)
727 .legalIf(Predicate: isPointer(TypeIdx: 0))
728 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S256)
729 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
730 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16)
731 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
732 .scalarize(TypeIdx: 0);
733
734 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
735 // Full set of gfx9 features.
736 if (ST.hasScalarAddSub64()) {
737 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
738 .legalFor(Types: {S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
740 .scalarize(TypeIdx: 0)
741 .minScalar(TypeIdx: 0, Ty: S16)
742 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
743 .maxScalar(TypeIdx: 0, Ty: S32);
744 } else {
745 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
746 .legalFor(Types: {S32, S16, V2S16})
747 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
748 .scalarize(TypeIdx: 0)
749 .minScalar(TypeIdx: 0, Ty: S16)
750 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
751 .maxScalar(TypeIdx: 0, Ty: S32);
752 }
753
754 if (ST.hasScalarSMulU64()) {
755 getActionDefinitionsBuilder(Opcode: G_MUL)
756 .legalFor(Types: {S64, S32, S16, V2S16})
757 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
758 .scalarize(TypeIdx: 0)
759 .minScalar(TypeIdx: 0, Ty: S16)
760 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
761 .custom();
762 } else {
763 getActionDefinitionsBuilder(Opcode: G_MUL)
764 .legalFor(Types: {S32, S16, V2S16})
765 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
766 .scalarize(TypeIdx: 0)
767 .minScalar(TypeIdx: 0, Ty: S16)
768 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
769 .custom();
770 }
771 assert(ST.hasMad64_32());
772
773 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
774 .legalFor(Types: {S32, S16, V2S16}) // Clamp modifier
775 .minScalarOrElt(TypeIdx: 0, Ty: S16)
776 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
777 .scalarize(TypeIdx: 0)
778 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
779 .lower();
780 } else if (ST.has16BitInsts()) {
781 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
782 .legalFor(Types: {S32, S16})
783 .minScalar(TypeIdx: 0, Ty: S16)
784 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
785 .maxScalar(TypeIdx: 0, Ty: S32)
786 .scalarize(TypeIdx: 0);
787
788 getActionDefinitionsBuilder(Opcode: G_MUL)
789 .legalFor(Types: {S32, S16})
790 .scalarize(TypeIdx: 0)
791 .minScalar(TypeIdx: 0, Ty: S16)
792 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
793 .custom();
794 assert(ST.hasMad64_32());
795
796 // Technically the saturating operations require clamp bit support, but this
797 // was introduced at the same time as 16-bit operations.
798 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
799 .legalFor(Types: {S32, S16}) // Clamp modifier
800 .minScalar(TypeIdx: 0, Ty: S16)
801 .scalarize(TypeIdx: 0)
802 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 16)
803 .lower();
804
805 // We're just lowering this, but it helps get a better result to try to
806 // coerce to the desired type first.
807 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
808 .minScalar(TypeIdx: 0, Ty: S16)
809 .scalarize(TypeIdx: 0)
810 .lower();
811 } else {
812 getActionDefinitionsBuilder(Opcodes: {G_ADD, G_SUB})
813 .legalFor(Types: {S32})
814 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32)
815 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
816 .scalarize(TypeIdx: 0);
817
818 auto &Mul = getActionDefinitionsBuilder(Opcode: G_MUL)
819 .legalFor(Types: {S32})
820 .scalarize(TypeIdx: 0)
821 .minScalar(TypeIdx: 0, Ty: S32)
822 .widenScalarToNextMultipleOf(TypeIdx: 0, Size: 32);
823
824 if (ST.hasMad64_32())
825 Mul.custom();
826 else
827 Mul.maxScalar(TypeIdx: 0, Ty: S32);
828
829 if (ST.hasIntClamp()) {
830 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
831 .legalFor(Types: {S32}) // Clamp modifier.
832 .scalarize(TypeIdx: 0)
833 .minScalarOrElt(TypeIdx: 0, Ty: S32)
834 .lower();
835 } else {
836 // Clamp bit support was added in VI, along with 16-bit operations.
837 getActionDefinitionsBuilder(Opcodes: {G_UADDSAT, G_USUBSAT})
838 .minScalar(TypeIdx: 0, Ty: S32)
839 .scalarize(TypeIdx: 0)
840 .lower();
841 }
842
843 // FIXME: DAG expansion gets better results. The widening uses the smaller
844 // range values and goes for the min/max lowering directly.
845 getActionDefinitionsBuilder(Opcodes: {G_SADDSAT, G_SSUBSAT})
846 .minScalar(TypeIdx: 0, Ty: S32)
847 .scalarize(TypeIdx: 0)
848 .lower();
849 }
850
851 getActionDefinitionsBuilder(
852 Opcodes: {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
853 .customFor(Types: {S32, S64})
854 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
855 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
856 .scalarize(TypeIdx: 0);
857
858 auto &Mulh = getActionDefinitionsBuilder(Opcodes: {G_UMULH, G_SMULH})
859 .legalFor(Types: {S32})
860 .maxScalar(TypeIdx: 0, Ty: S32);
861
862 if (ST.hasVOP3PInsts()) {
863 Mulh
864 .clampMaxNumElements(TypeIdx: 0, EltTy: S8, MaxElements: 2)
865 .lowerFor(Types: {V2S8});
866 }
867
868 Mulh
869 .scalarize(TypeIdx: 0)
870 .lower();
871
872 // Report legal for any types we can handle anywhere. For the cases only legal
873 // on the SALU, RegBankSelect will be able to re-legalize.
874 getActionDefinitionsBuilder(Opcodes: {G_AND, G_OR, G_XOR})
875 .legalFor(Types: {S32, S1, S64, V2S32, S16, V2S16, V4S16})
876 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
877 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
878 .fewerElementsIf(
879 Predicate: all(P0: vectorWiderThan(TypeIdx: 0, Size: 64), P1: scalarOrEltNarrowerThan(TypeIdx: 0, Size: 64)),
880 Mutation: fewerEltsToSize64Vector(TypeIdx: 0))
881 .widenScalarToNextPow2(TypeIdx: 0)
882 .scalarize(TypeIdx: 0);
883
884 getActionDefinitionsBuilder(
885 Opcodes: {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
886 .legalFor(Types: {{S32, S1}, {S32, S32}})
887 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
888 .scalarize(TypeIdx: 0);
889
890 getActionDefinitionsBuilder(Opcode: G_BITCAST)
891 // Don't worry about the size constraint.
892 .legalIf(Predicate: all(P0: isRegisterClassType(ST, TypeIdx: 0), P1: isRegisterClassType(ST, TypeIdx: 1)))
893 .lower();
894
895 getActionDefinitionsBuilder(Opcode: G_CONSTANT)
896 .legalFor(Types: {S1, S32, S64, S16, GlobalPtr,
897 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
898 .legalIf(Predicate: isPointer(TypeIdx: 0))
899 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
900 .widenScalarToNextPow2(TypeIdx: 0);
901
902 getActionDefinitionsBuilder(Opcode: G_FCONSTANT)
903 .legalFor(Types: {S32, S64, S16})
904 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
905
906 getActionDefinitionsBuilder(Opcodes: {G_IMPLICIT_DEF, G_FREEZE})
907 .legalIf(Predicate: isRegisterClassType(ST, TypeIdx: 0))
908 // s1 and s16 are special cases because they have legal operations on
909 // them, but don't really occupy registers in the normal way.
910 .legalFor(Types: {S1, S16})
911 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
912 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
913 .clampScalarOrElt(TypeIdx: 0, MinTy: S32, MaxTy: MaxScalar)
914 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
915 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 16);
916
917 getActionDefinitionsBuilder(Opcode: G_FRAME_INDEX).legalFor(Types: {PrivatePtr});
918
919 // If the amount is divergent, we have to do a wave reduction to get the
920 // maximum value, so this is expanded during RegBankSelect.
921 getActionDefinitionsBuilder(Opcode: G_DYN_STACKALLOC)
922 .legalFor(Types: {{PrivatePtr, S32}});
923
924 getActionDefinitionsBuilder(Opcode: G_STACKSAVE)
925 .customFor(Types: {PrivatePtr});
926 getActionDefinitionsBuilder(Opcode: G_STACKRESTORE)
927 .legalFor(Types: {PrivatePtr});
928
929 getActionDefinitionsBuilder(Opcodes: {G_GET_FPENV, G_SET_FPENV}).customFor(Types: {S64});
930
931 getActionDefinitionsBuilder(Opcode: G_GLOBAL_VALUE)
932 .customIf(Predicate: typeIsNot(TypeIdx: 0, Type: PrivatePtr));
933
934 getActionDefinitionsBuilder(Opcode: G_BLOCK_ADDR).legalFor(Types: {CodePtr});
935
936 auto &FPOpActions = getActionDefinitionsBuilder(
937 Opcodes: { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
938 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
939 .legalFor(Types: {S32, S64});
940 auto &TrigActions = getActionDefinitionsBuilder(Opcodes: {G_FSIN, G_FCOS})
941 .customFor(Types: {S32, S64});
942 auto &FDIVActions = getActionDefinitionsBuilder(Opcode: G_FDIV)
943 .customFor(Types: {S32, S64});
944
945 if (ST.has16BitInsts()) {
946 if (ST.hasVOP3PInsts())
947 FPOpActions.legalFor(Types: {S16, V2S16});
948 else
949 FPOpActions.legalFor(Types: {S16});
950
951 TrigActions.customFor(Types: {S16});
952 FDIVActions.customFor(Types: {S16});
953 }
954
955 if (ST.hasPackedFP32Ops()) {
956 FPOpActions.legalFor(Types: {V2S32});
957 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S32, NumElts: 2);
958 }
959
960 auto &MinNumMaxNum = getActionDefinitionsBuilder(
961 Opcodes: {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
962 G_FMAXNUM_IEEE});
963
964 if (ST.hasVOP3PInsts()) {
965 MinNumMaxNum.customFor(Types: FPTypesPK16)
966 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
967 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
968 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
969 .scalarize(TypeIdx: 0);
970 } else if (ST.has16BitInsts()) {
971 MinNumMaxNum.customFor(Types: FPTypes16)
972 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
973 .scalarize(TypeIdx: 0);
974 } else {
975 MinNumMaxNum.customFor(Types: FPTypesBase)
976 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
977 .scalarize(TypeIdx: 0);
978 }
979
980 if (ST.hasVOP3PInsts())
981 FPOpActions.clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
982
983 FPOpActions
984 .scalarize(TypeIdx: 0)
985 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
986
987 TrigActions
988 .scalarize(TypeIdx: 0)
989 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
990
991 FDIVActions
992 .scalarize(TypeIdx: 0)
993 .clampScalar(TypeIdx: 0, MinTy: ST.has16BitInsts() ? S16 : S32, MaxTy: S64);
994
995 getActionDefinitionsBuilder(Opcodes: {G_FNEG, G_FABS})
996 .legalFor(Types: FPTypesPK16)
997 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
998 .scalarize(TypeIdx: 0)
999 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1000
1001 if (ST.has16BitInsts()) {
1002 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1003 .legalFor(Types: {S16})
1004 .customFor(Types: {S32, S64})
1005 .scalarize(TypeIdx: 0)
1006 .unsupported();
1007 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1008 .legalFor(Types: {S32, S64, S16})
1009 .scalarize(TypeIdx: 0)
1010 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1011
1012 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1013 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}})
1014 .scalarize(TypeIdx: 0)
1015 .maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16)
1016 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1017 .lower();
1018
1019 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1020 .customFor(Types: {{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1021 .scalarize(TypeIdx: 0)
1022 .lower();
1023 } else {
1024 getActionDefinitionsBuilder(Opcode: G_FSQRT)
1025 .customFor(Types: {S32, S64, S16})
1026 .scalarize(TypeIdx: 0)
1027 .unsupported();
1028
1029
1030 if (ST.hasFractBug()) {
1031 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1032 .customFor(Types: {S64})
1033 .legalFor(Types: {S32, S64})
1034 .scalarize(TypeIdx: 0)
1035 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1036 } else {
1037 getActionDefinitionsBuilder(Opcode: G_FFLOOR)
1038 .legalFor(Types: {S32, S64})
1039 .scalarize(TypeIdx: 0)
1040 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1041 }
1042
1043 getActionDefinitionsBuilder(Opcodes: {G_FLDEXP, G_STRICT_FLDEXP})
1044 .legalFor(Types: {{S32, S32}, {S64, S32}})
1045 .scalarize(TypeIdx: 0)
1046 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1047 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1048 .lower();
1049
1050 getActionDefinitionsBuilder(Opcode: G_FFREXP)
1051 .customFor(Types: {{S32, S32}, {S64, S32}})
1052 .scalarize(TypeIdx: 0)
1053 .minScalar(TypeIdx: 0, Ty: S32)
1054 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
1055 .lower();
1056 }
1057
1058 auto &FPTruncActions = getActionDefinitionsBuilder(Opcode: G_FPTRUNC);
1059 if (ST.hasCvtPkF16F32Inst()) {
1060 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1061 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1062 } else {
1063 FPTruncActions.legalFor(Types: {{S32, S64}, {S16, S32}});
1064 }
1065 FPTruncActions.scalarize(TypeIdx: 0).lower();
1066
1067 getActionDefinitionsBuilder(Opcode: G_FPEXT)
1068 .legalFor(Types: {{S64, S32}, {S32, S16}})
1069 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32))
1070 .scalarize(TypeIdx: 0);
1071
1072 auto &FSubActions = getActionDefinitionsBuilder(Opcodes: {G_FSUB, G_STRICT_FSUB});
1073 if (ST.has16BitInsts()) {
1074 FSubActions
1075 // Use actual fsub instruction
1076 .legalFor(Types: {S32, S16})
1077 // Must use fadd + fneg
1078 .lowerFor(Types: {S64, V2S16});
1079 } else {
1080 FSubActions
1081 // Use actual fsub instruction
1082 .legalFor(Types: {S32})
1083 // Must use fadd + fneg
1084 .lowerFor(Types: {S64, S16, V2S16});
1085 }
1086
1087 FSubActions
1088 .scalarize(TypeIdx: 0)
1089 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1090
1091 // Whether this is legal depends on the floating point mode for the function.
1092 auto &FMad = getActionDefinitionsBuilder(Opcode: G_FMAD);
1093 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1094 FMad.customFor(Types: {S32, S16});
1095 else if (ST.hasMadMacF32Insts())
1096 FMad.customFor(Types: {S32});
1097 else if (ST.hasMadF16())
1098 FMad.customFor(Types: {S16});
1099 FMad.scalarize(TypeIdx: 0)
1100 .lower();
1101
1102 auto &FRem = getActionDefinitionsBuilder(Opcode: G_FREM);
1103 if (ST.has16BitInsts()) {
1104 FRem.customFor(Types: {S16, S32, S64});
1105 } else {
1106 FRem.minScalar(TypeIdx: 0, Ty: S32)
1107 .customFor(Types: {S32, S64});
1108 }
1109 FRem.scalarize(TypeIdx: 0);
1110
1111 // TODO: Do we need to clamp maximum bitwidth?
1112 getActionDefinitionsBuilder(Opcode: G_TRUNC)
1113 .legalIf(Predicate: isScalar(TypeIdx: 0))
1114 .legalFor(Types: {{V2S16, V2S32}})
1115 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1116 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1117 // situations (like an invalid implicit use), we don't want to infinite loop
1118 // in the legalizer.
1119 .fewerElementsIf(Predicate: elementTypeIsLegal(TypeIdx: 0), Mutation: LegalizeMutations::scalarize(TypeIdx: 0))
1120 .alwaysLegal();
1121
1122 getActionDefinitionsBuilder(Opcodes: {G_SEXT, G_ZEXT, G_ANYEXT})
1123 .legalFor(Types: {{S64, S32}, {S32, S16}, {S64, S16},
1124 {S32, S1}, {S64, S1}, {S16, S1}})
1125 .scalarize(TypeIdx: 0)
1126 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1127 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1128
1129 // TODO: Split s1->s64 during regbankselect for VALU.
1130 auto &IToFP = getActionDefinitionsBuilder(Opcodes: {G_SITOFP, G_UITOFP})
1131 .legalFor(Types: {{S32, S32}, {S64, S32}, {S16, S32}})
1132 .lowerIf(Predicate: typeIs(TypeIdx: 1, TypesInit: S1))
1133 .customFor(Types: {{S32, S64}, {S64, S64}});
1134 if (ST.has16BitInsts())
1135 IToFP.legalFor(Types: {{S16, S16}});
1136 IToFP.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1137 .minScalar(TypeIdx: 0, Ty: S32)
1138 .scalarize(TypeIdx: 0)
1139 .widenScalarToNextPow2(TypeIdx: 1);
1140
1141 auto &FPToI = getActionDefinitionsBuilder(Opcodes: {G_FPTOSI, G_FPTOUI})
1142 .legalFor(Types: {{S32, S32}, {S32, S64}, {S32, S16}})
1143 .customFor(Types: {{S64, S32}, {S64, S64}})
1144 .narrowScalarFor(Types: {{S64, S16}}, Mutation: changeTo(TypeIdx: 0, Ty: S32));
1145 if (ST.has16BitInsts())
1146 FPToI.legalFor(Types: {{S16, S16}});
1147 else
1148 FPToI.minScalar(TypeIdx: 1, Ty: S32);
1149
1150 FPToI.minScalar(TypeIdx: 0, Ty: S32)
1151 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1152 .scalarize(TypeIdx: 0)
1153 .lower();
1154
1155 getActionDefinitionsBuilder(Opcodes: {G_LROUND, G_LLROUND})
1156 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1157 .scalarize(TypeIdx: 0)
1158 .lower();
1159
1160 getActionDefinitionsBuilder(Opcode: G_INTRINSIC_FPTRUNC_ROUND)
1161 .legalFor(Types: {S16, S32})
1162 .scalarize(TypeIdx: 0)
1163 .lower();
1164
1165 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1166 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1167 .scalarize(TypeIdx: 0)
1168 .lower();
1169
1170 getActionDefinitionsBuilder(Opcodes: {G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1171 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1172 .scalarize(TypeIdx: 0)
1173 .lower();
1174
1175 if (ST.has16BitInsts()) {
1176 getActionDefinitionsBuilder(
1177 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1178 .legalFor(Types: {S16, S32, S64})
1179 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1180 .scalarize(TypeIdx: 0);
1181 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1182 getActionDefinitionsBuilder(
1183 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1184 .legalFor(Types: {S32, S64})
1185 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1186 .scalarize(TypeIdx: 0);
1187 } else {
1188 getActionDefinitionsBuilder(
1189 Opcodes: {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1190 .legalFor(Types: {S32})
1191 .customFor(Types: {S64})
1192 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1193 .scalarize(TypeIdx: 0);
1194 }
1195
1196 getActionDefinitionsBuilder(Opcode: G_PTR_ADD)
1197 .unsupportedFor(Types: {BufferFatPtr, BufferStridedPtr, RsrcPtr})
1198 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: sameSize(TypeIdx0: 0, TypeIdx1: 1)))
1199 .scalarize(TypeIdx: 0)
1200 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0);
1201
1202 getActionDefinitionsBuilder(Opcode: G_PTRMASK)
1203 .legalIf(Predicate: all(P0: sameSize(TypeIdx0: 0, TypeIdx1: 1), P1: typeInSet(TypeIdx: 1, TypesInit: {S64, S32})))
1204 .scalarSameSizeAs(TypeIdx: 1, SameSizeIdx: 0)
1205 .scalarize(TypeIdx: 0);
1206
1207 auto &CmpBuilder =
1208 getActionDefinitionsBuilder(Opcode: G_ICMP)
1209 // The compare output type differs based on the register bank of the output,
1210 // so make both s1 and s32 legal.
1211 //
1212 // Scalar compares producing output in scc will be promoted to s32, as that
1213 // is the allocatable register type that will be needed for the copy from
1214 // scc. This will be promoted during RegBankSelect, and we assume something
1215 // before that won't try to use s32 result types.
1216 //
1217 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1218 // bank.
1219 .legalForCartesianProduct(
1220 Types0: {S1}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1221 .legalForCartesianProduct(
1222 Types0: {S32}, Types1: {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1223 if (ST.has16BitInsts()) {
1224 CmpBuilder.legalFor(Types: {{S1, S16}});
1225 }
1226
1227 CmpBuilder
1228 .widenScalarToNextPow2(TypeIdx: 1)
1229 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1230 .scalarize(TypeIdx: 0)
1231 .legalIf(Predicate: all(P0: typeInSet(TypeIdx: 0, TypesInit: {S1, S32}), P1: isPointer(TypeIdx: 1)));
1232
1233 auto &FCmpBuilder =
1234 getActionDefinitionsBuilder(Opcode: G_FCMP).legalForCartesianProduct(
1235 Types0: {S1}, Types1: ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1236
1237 if (ST.hasSALUFloatInsts())
1238 FCmpBuilder.legalForCartesianProduct(Types0: {S32}, Types1: {S16, S32});
1239
1240 FCmpBuilder
1241 .widenScalarToNextPow2(TypeIdx: 1)
1242 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1243 .scalarize(TypeIdx: 0);
1244
1245 // FIXME: fpow has a selection pattern that should move to custom lowering.
1246 auto &ExpOps = getActionDefinitionsBuilder(Opcode: G_FPOW);
1247 if (ST.has16BitInsts())
1248 ExpOps.customFor(Types: {{S32}, {S16}});
1249 else
1250 ExpOps.customFor(Types: {S32});
1251 ExpOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1252 .scalarize(TypeIdx: 0);
1253
1254 getActionDefinitionsBuilder(Opcode: G_FPOWI)
1255 .clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1256 .lower();
1257
1258 auto &Log2Ops = getActionDefinitionsBuilder(Opcodes: {G_FLOG2, G_FEXP2});
1259 Log2Ops.customFor(Types: {S32});
1260 if (ST.has16BitInsts())
1261 Log2Ops.legalFor(Types: {S16});
1262 else
1263 Log2Ops.customFor(Types: {S16});
1264 Log2Ops.scalarize(TypeIdx: 0)
1265 .lower();
1266
1267 auto &LogOps =
1268 getActionDefinitionsBuilder(Opcodes: {G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1269 LogOps.customFor(Types: {S32, S16});
1270 LogOps.clampScalar(TypeIdx: 0, MinTy: MinScalarFPTy, MaxTy: S32)
1271 .scalarize(TypeIdx: 0);
1272
1273 // The 64-bit versions produce 32-bit results, but only on the SALU.
1274 getActionDefinitionsBuilder(Opcode: G_CTPOP)
1275 .legalFor(Types: {{S32, S32}, {S32, S64}})
1276 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1277 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1278 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1279 .scalarize(TypeIdx: 0)
1280 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1281
1282 // If no 16 bit instr is available, lower into different instructions.
1283 if (ST.has16BitInsts())
1284 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1285 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypes16)
1286 .widenScalarToNextPow2(TypeIdx: 1)
1287 .scalarize(TypeIdx: 0)
1288 .lower();
1289 else
1290 getActionDefinitionsBuilder(Opcode: G_IS_FPCLASS)
1291 .legalForCartesianProduct(Types0: {S1}, Types1: FPTypesBase)
1292 .lowerFor(Types: {S1, S16})
1293 .widenScalarToNextPow2(TypeIdx: 1)
1294 .scalarize(TypeIdx: 0)
1295 .lower();
1296
1297 // The hardware instructions return a different result on 0 than the generic
1298 // instructions expect. The hardware produces -1, but these produce the
1299 // bitwidth.
1300 getActionDefinitionsBuilder(Opcodes: {G_CTLZ, G_CTTZ})
1301 .scalarize(TypeIdx: 0)
1302 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1303 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1304 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1305 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32)
1306 .custom();
1307
1308 // The 64-bit versions produce 32-bit results, but only on the SALU.
1309 getActionDefinitionsBuilder(Opcode: G_CTLZ_ZERO_UNDEF)
1310 .legalFor(Types: {{S32, S32}, {S32, S64}})
1311 .customIf(Predicate: scalarNarrowerThan(TypeIdx: 1, Size: 32))
1312 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1313 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1314 .scalarize(TypeIdx: 0)
1315 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1316 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1317
1318 getActionDefinitionsBuilder(Opcode: G_CTTZ_ZERO_UNDEF)
1319 .legalFor(Types: {{S32, S32}, {S32, S64}})
1320 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1321 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S64)
1322 .scalarize(TypeIdx: 0)
1323 .widenScalarToNextPow2(TypeIdx: 0, MinSize: 32)
1324 .widenScalarToNextPow2(TypeIdx: 1, MinSize: 32);
1325
1326 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1327 // RegBankSelect.
1328 getActionDefinitionsBuilder(Opcode: G_BITREVERSE)
1329 .legalFor(Types: {S32, S64})
1330 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
1331 .scalarize(TypeIdx: 0)
1332 .widenScalarToNextPow2(TypeIdx: 0);
1333
1334 if (ST.has16BitInsts()) {
1335 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1336 .legalFor(Types: {S16, S32, V2S16})
1337 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
1338 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1339 // narrowScalar limitation.
1340 .widenScalarToNextPow2(TypeIdx: 0)
1341 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S32)
1342 .scalarize(TypeIdx: 0);
1343
1344 if (ST.hasVOP3PInsts()) {
1345 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1346 .legalFor(Types: {S32, S16, V2S16})
1347 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
1348 .minScalar(TypeIdx: 0, Ty: S16)
1349 .widenScalarToNextPow2(TypeIdx: 0)
1350 .scalarize(TypeIdx: 0)
1351 .lower();
1352 } else {
1353 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1354 .legalFor(Types: {S32, S16})
1355 .widenScalarToNextPow2(TypeIdx: 0)
1356 .minScalar(TypeIdx: 0, Ty: S16)
1357 .scalarize(TypeIdx: 0)
1358 .lower();
1359 }
1360 } else {
1361 // TODO: Should have same legality without v_perm_b32
1362 getActionDefinitionsBuilder(Opcode: G_BSWAP)
1363 .legalFor(Types: {S32})
1364 .lowerIf(Predicate: scalarNarrowerThan(TypeIdx: 0, Size: 32))
1365 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1366 // narrowScalar limitation.
1367 .widenScalarToNextPow2(TypeIdx: 0)
1368 .maxScalar(TypeIdx: 0, Ty: S32)
1369 .scalarize(TypeIdx: 0)
1370 .lower();
1371
1372 getActionDefinitionsBuilder(Opcodes: {G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1373 .legalFor(Types: {S32})
1374 .minScalar(TypeIdx: 0, Ty: S32)
1375 .widenScalarToNextPow2(TypeIdx: 0)
1376 .scalarize(TypeIdx: 0)
1377 .lower();
1378 }
1379
1380 getActionDefinitionsBuilder(Opcode: G_INTTOPTR)
1381 // List the common cases
1382 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1383 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1384 .scalarize(TypeIdx: 0)
1385 // Accept any address space as long as the size matches
1386 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1387 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 1, TypeIdx1: 0),
1388 Mutation: [](const LegalityQuery &Query) {
1389 return std::pair(
1390 1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1391 })
1392 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 1, TypeIdx1: 0), Mutation: [](const LegalityQuery &Query) {
1393 return std::pair(1, LLT::scalar(SizeInBits: Query.Types[0].getSizeInBits()));
1394 });
1395
1396 getActionDefinitionsBuilder(Opcode: G_PTRTOINT)
1397 // List the common cases
1398 .legalForCartesianProduct(Types0: AddrSpaces64, Types1: {S64})
1399 .legalForCartesianProduct(Types0: AddrSpaces32, Types1: {S32})
1400 .scalarize(TypeIdx: 0)
1401 // Accept any address space as long as the size matches
1402 .legalIf(Predicate: sameSize(TypeIdx0: 0, TypeIdx1: 1))
1403 .widenScalarIf(Predicate: smallerThan(TypeIdx0: 0, TypeIdx1: 1),
1404 Mutation: [](const LegalityQuery &Query) {
1405 return std::pair(
1406 0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1407 })
1408 .narrowScalarIf(Predicate: largerThan(TypeIdx0: 0, TypeIdx1: 1), Mutation: [](const LegalityQuery &Query) {
1409 return std::pair(0, LLT::scalar(SizeInBits: Query.Types[1].getSizeInBits()));
1410 });
1411
1412 getActionDefinitionsBuilder(Opcode: G_ADDRSPACE_CAST)
1413 .scalarize(TypeIdx: 0)
1414 .custom();
1415
1416 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1417 bool IsLoad) -> bool {
1418 const LLT DstTy = Query.Types[0];
1419
1420 // Split vector extloads.
1421 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1422
1423 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1424 return true;
1425
1426 const LLT PtrTy = Query.Types[1];
1427 unsigned AS = PtrTy.getAddressSpace();
1428 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1429 IsAtomic: Query.MMODescrs[0].Ordering !=
1430 AtomicOrdering::NotAtomic))
1431 return true;
1432
1433 // Catch weird sized loads that don't evenly divide into the access sizes
1434 // TODO: May be able to widen depending on alignment etc.
1435 unsigned NumRegs = (MemSize + 31) / 32;
1436 if (NumRegs == 3) {
1437 if (!ST.hasDwordx3LoadStores())
1438 return true;
1439 } else {
1440 // If the alignment allows, these should have been widened.
1441 if (!isPowerOf2_32(Value: NumRegs))
1442 return true;
1443 }
1444
1445 return false;
1446 };
1447
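  // Minimum alignment, in bits, required by the explicit global/constant load
  // and store rules below; 0 (no requirement) when unaligned buffer accesses
  // are enabled.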
1448 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1449 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1450 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1451
1452 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1453 // LDS
1454 // TODO: Unsupported flat for SI.
1455
1456 for (unsigned Op : {G_LOAD, G_STORE}) {
1457 const bool IsStore = Op == G_STORE;
1458
1459 auto &Actions = getActionDefinitionsBuilder(Opcode: Op);
1460 // Explicitly list some common cases.
1461 // TODO: Does this help compile time at all?
1462 Actions.legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S32, .Align: GlobalAlign32},
1463 {.Type0: V2S32, .Type1: GlobalPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1464 {.Type0: V4S32, .Type1: GlobalPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1465 {.Type0: S64, .Type1: GlobalPtr, .MemTy: S64, .Align: GlobalAlign32},
1466 {.Type0: V2S64, .Type1: GlobalPtr, .MemTy: V2S64, .Align: GlobalAlign32},
1467 {.Type0: V2S16, .Type1: GlobalPtr, .MemTy: V2S16, .Align: GlobalAlign32},
1468 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: GlobalAlign8},
1469 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: GlobalAlign16},
1470
1471 {.Type0: S32, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1472 {.Type0: S64, .Type1: LocalPtr, .MemTy: S64, .Align: 32},
1473 {.Type0: V2S32, .Type1: LocalPtr, .MemTy: V2S32, .Align: 32},
1474 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1475 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1476 {.Type0: V2S16, .Type1: LocalPtr, .MemTy: S32, .Align: 32},
1477
1478 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1479 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1480 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1481 {.Type0: V2S16, .Type1: PrivatePtr, .MemTy: S32, .Align: 32},
1482
1483 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S32, .Align: GlobalAlign32},
1484 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32},
1485 {.Type0: V4S32, .Type1: ConstantPtr, .MemTy: V4S32, .Align: GlobalAlign32},
1486 {.Type0: S64, .Type1: ConstantPtr, .MemTy: S64, .Align: GlobalAlign32},
1487 {.Type0: V2S32, .Type1: ConstantPtr, .MemTy: V2S32, .Align: GlobalAlign32}});
1488 Actions.legalIf(
1489 Predicate: [=](const LegalityQuery &Query) -> bool {
1490 return isLoadStoreLegal(ST, Query);
1491 });
1492
1493 // The custom pointers (fat pointers, buffer resources) don't work with load
1494 // and store at this level. Fat pointers should have been lowered to
1495 // intrinsics before the translation to MIR.
1496 Actions.unsupportedIf(
1497 Predicate: typeInSet(TypeIdx: 1, TypesInit: {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1498
1499 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1500 // ptrtoint. This is needed to account for the fact that we can't have i128
1501 // as a register class for SelectionDAG reasons.
1502 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1503 return hasBufferRsrcWorkaround(Ty: Query.Types[0]);
1504 });
1505
1506 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1507 // 64-bits.
1508 //
1509 // TODO: Should generalize bitcast action into coerce, which will also cover
1510 // inserting addrspacecasts.
1511 Actions.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1512
1513 // Turn any illegal element vectors into something easier to deal
1514 // with. These will ultimately produce 32-bit scalar shifts to extract the
1515 // parts anyway.
1516 //
1517 // For odd 16-bit element vectors, prefer to split those into pieces with
1518 // 16-bit vector parts.
1519 Actions.bitcastIf(
1520 Predicate: [=](const LegalityQuery &Query) -> bool {
1521 return shouldBitcastLoadStoreType(ST, Ty: Query.Types[0],
1522 MemTy: Query.MMODescrs[0].MemoryTy);
1523 }, Mutation: bitcastToRegisterType(TypeIdx: 0));
1524
1525 if (!IsStore) {
1526 // Widen suitably aligned loads by loading extra bytes. The standard
1527 // legalization actions can't properly express widening memory operands.
1528 Actions.customIf(Predicate: [=](const LegalityQuery &Query) -> bool {
1529 return shouldWidenLoad(ST, Query, Opcode: G_LOAD);
1530 });
1531 }
1532
1533 // FIXME: load/store narrowing should be moved to lower action
1534 Actions
1535 .narrowScalarIf(
1536 Predicate: [=](const LegalityQuery &Query) -> bool {
1537 return !Query.Types[0].isVector() &&
1538 needToSplitMemOp(Query, Op == G_LOAD);
1539 },
1540 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1541 const LLT DstTy = Query.Types[0];
1542 const LLT PtrTy = Query.Types[1];
1543
1544 const unsigned DstSize = DstTy.getSizeInBits();
1545 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1546
1547 // Split extloads.
1548 if (DstSize > MemSize)
1549 return std::pair(0, LLT::scalar(SizeInBits: MemSize));
1550
1551 unsigned MaxSize = maxSizeForAddrSpace(
1552 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1553 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1554 if (MemSize > MaxSize)
1555 return std::pair(0, LLT::scalar(SizeInBits: MaxSize));
1556
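            // Otherwise, narrow to a scalar no wider than the known alignment
            // (in bits).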
1557 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1558 return std::pair(0, LLT::scalar(SizeInBits: Align));
1559 })
1560 .fewerElementsIf(
1561 Predicate: [=](const LegalityQuery &Query) -> bool {
1562 return Query.Types[0].isVector() &&
1563 needToSplitMemOp(Query, Op == G_LOAD);
1564 },
1565 Mutation: [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1566 const LLT DstTy = Query.Types[0];
1567 const LLT PtrTy = Query.Types[1];
1568
1569 LLT EltTy = DstTy.getElementType();
1570 unsigned MaxSize = maxSizeForAddrSpace(
1571 ST, AS: PtrTy.getAddressSpace(), IsLoad: Op == G_LOAD,
1572 IsAtomic: Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1573
1574           // FIXME: Handle results widened to a power of 2 better. This ends
1575           // up scalarizing.
1576 // FIXME: 3 element stores scalarized on SI
1577
1578 // Split if it's too large for the address space.
1579 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1580 if (MemSize > MaxSize) {
1581 unsigned NumElts = DstTy.getNumElements();
1582 unsigned EltSize = EltTy.getSizeInBits();
1583
1584 if (MaxSize % EltSize == 0) {
1585 return std::pair(
1586 0, LLT::scalarOrVector(
1587 EC: ElementCount::getFixed(MinVal: MaxSize / EltSize), ScalarTy: EltTy));
1588 }
1589
1590 unsigned NumPieces = MemSize / MaxSize;
1591
1592 // FIXME: Refine when odd breakdowns handled
1593 // The scalars will need to be re-legalized.
1594 if (NumPieces == 1 || NumPieces >= NumElts ||
1595 NumElts % NumPieces != 0)
1596 return std::pair(0, EltTy);
1597
1598 return std::pair(0,
1599 LLT::fixed_vector(NumElements: NumElts / NumPieces, ScalarTy: EltTy));
1600 }
1601
1602 // FIXME: We could probably handle weird extending loads better.
1603 if (DstTy.getSizeInBits() > MemSize)
1604 return std::pair(0, EltTy);
1605
1606 unsigned EltSize = EltTy.getSizeInBits();
1607 unsigned DstSize = DstTy.getSizeInBits();
1608 if (!isPowerOf2_32(Value: DstSize)) {
1609 // We're probably decomposing an odd sized store. Try to split
1610 // to the widest type. TODO: Account for alignment. As-is it
1611 // should be OK, since the new parts will be further legalized.
1612 unsigned FloorSize = llvm::bit_floor(Value: DstSize);
1613 return std::pair(
1614 0, LLT::scalarOrVector(
1615 EC: ElementCount::getFixed(MinVal: FloorSize / EltSize), ScalarTy: EltTy));
1616 }
1617
1618 // May need relegalization for the scalars.
1619 return std::pair(0, EltTy);
1620 })
1621 .minScalar(TypeIdx: 0, Ty: S32)
1622 .narrowScalarIf(Predicate: isWideScalarExtLoadTruncStore(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: S32))
1623 .widenScalarToNextPow2(TypeIdx: 0)
1624 .moreElementsIf(Predicate: vectorSmallerThan(TypeIdx: 0, Size: 32), Mutation: moreEltsToNext32Bit(TypeIdx: 0))
1625 .lower();
1626 }
1627
1628 // FIXME: Unaligned accesses not lowered.
1629 auto &ExtLoads = getActionDefinitionsBuilder(Opcodes: {G_SEXTLOAD, G_ZEXTLOAD})
1630 .legalForTypesWithMemDesc(TypesAndMemDesc: {{.Type0: S32, .Type1: GlobalPtr, .MemTy: S8, .Align: 8},
1631 {.Type0: S32, .Type1: GlobalPtr, .MemTy: S16, .Align: 2 * 8},
1632 {.Type0: S32, .Type1: LocalPtr, .MemTy: S8, .Align: 8},
1633 {.Type0: S32, .Type1: LocalPtr, .MemTy: S16, .Align: 16},
1634 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S8, .Align: 8},
1635 {.Type0: S32, .Type1: PrivatePtr, .MemTy: S16, .Align: 16},
1636 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S8, .Align: 8},
1637 {.Type0: S32, .Type1: ConstantPtr, .MemTy: S16, .Align: 2 * 8}})
1638 .legalIf(
1639 Predicate: [=](const LegalityQuery &Query) -> bool {
1640 return isLoadStoreLegal(ST, Query);
1641 });
1642
1643 if (ST.hasFlatAddressSpace()) {
1644 ExtLoads.legalForTypesWithMemDesc(
1645 TypesAndMemDesc: {{.Type0: S32, .Type1: FlatPtr, .MemTy: S8, .Align: 8}, {.Type0: S32, .Type1: FlatPtr, .MemTy: S16, .Align: 16}});
1646 }
1647
1648 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1649 // 64-bits.
1650 //
1651 // TODO: Should generalize bitcast action into coerce, which will also cover
1652 // inserting addrspacecasts.
1653 ExtLoads.customIf(Predicate: typeIs(TypeIdx: 1, TypesInit: Constant32Ptr));
1654
1655 ExtLoads.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S32)
1656 .widenScalarToNextPow2(TypeIdx: 0)
1657 .lower();
1658
1659 auto &Atomics = getActionDefinitionsBuilder(
1660 Opcodes: {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1661 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1662 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1663 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1664 .legalFor(Types: {{S32, GlobalPtr}, {S32, LocalPtr},
1665 {S64, GlobalPtr}, {S64, LocalPtr},
1666 {S32, RegionPtr}, {S64, RegionPtr}});
1667 if (ST.hasFlatAddressSpace()) {
1668 Atomics.legalFor(Types: {{S32, FlatPtr}, {S64, FlatPtr}});
1669 }
1670
1671 // TODO: v2bf16 operations, and fat buffer pointer support.
1672 auto &Atomic = getActionDefinitionsBuilder(Opcode: G_ATOMICRMW_FADD);
1673 if (ST.hasLDSFPAtomicAddF32()) {
1674 Atomic.legalFor(Types: {{S32, LocalPtr}, {S32, RegionPtr}});
1675 if (ST.hasLdsAtomicAddF64())
1676 Atomic.legalFor(Types: {{S64, LocalPtr}});
1677 if (ST.hasAtomicDsPkAdd16Insts())
1678 Atomic.legalFor(Types: {{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1679 }
1680 if (ST.hasAtomicFaddInsts())
1681 Atomic.legalFor(Types: {{S32, GlobalPtr}});
1682 if (ST.hasFlatAtomicFaddF32Inst())
1683 Atomic.legalFor(Types: {{S32, FlatPtr}});
1684
1685 if (ST.hasGFX90AInsts()) {
1686 // These are legal with some caveats, and should have undergone expansion in
1687     // the IR in most situations.
1688 // TODO: Move atomic expansion into legalizer
1689 Atomic.legalFor(Types: {
1690 {S32, GlobalPtr},
1691 {S64, GlobalPtr},
1692 {S64, FlatPtr}
1693 });
1694 }
1695
1696 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1697 ST.hasAtomicBufferGlobalPkAddF16Insts())
1698 Atomic.legalFor(Types: {{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1699 if (ST.hasAtomicGlobalPkAddBF16Inst())
1700 Atomic.legalFor(Types: {{V2BF16, GlobalPtr}});
1701 if (ST.hasAtomicFlatPkAdd16Insts())
1702 Atomic.legalFor(Types: {{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1703
1705 // Most of the legalization work here is done by AtomicExpand. We could
1706 // probably use a simpler legality rule that just assumes anything is OK.
1707 auto &AtomicFMinFMax =
1708 getActionDefinitionsBuilder(Opcodes: {G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1709 .legalFor(Types: {{F32, LocalPtr}, {F64, LocalPtr}});
1710
1711 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1712 AtomicFMinFMax.legalFor(Types: {{F32, GlobalPtr},{F32, BufferFatPtr}});
1713 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1714 AtomicFMinFMax.legalFor(Types: {{F64, GlobalPtr}, {F64, BufferFatPtr}});
1715 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1716 AtomicFMinFMax.legalFor(Types: {F32, FlatPtr});
1717 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1718 AtomicFMinFMax.legalFor(Types: {F64, FlatPtr});
1719
1720 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1721 // demarshalling
1722 getActionDefinitionsBuilder(Opcode: G_ATOMIC_CMPXCHG)
1723 .customFor(Types: {{S32, GlobalPtr}, {S64, GlobalPtr},
1724 {S32, FlatPtr}, {S64, FlatPtr}})
1725 .legalFor(Types: {{S32, LocalPtr}, {S64, LocalPtr},
1726 {S32, RegionPtr}, {S64, RegionPtr}});
1727 // TODO: Pointer types, any 32-bit or 64-bit vector
1728
1729 // Condition should be s32 for scalar, s1 for vector.
1730 getActionDefinitionsBuilder(Opcode: G_SELECT)
1731 .legalForCartesianProduct(Types0: {S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1732 LocalPtr, FlatPtr, PrivatePtr,
1733 LLT::fixed_vector(NumElements: 2, ScalarTy: LocalPtr),
1734 LLT::fixed_vector(NumElements: 2, ScalarTy: PrivatePtr)},
1735 Types1: {S1, S32})
1736 .clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64)
1737 .scalarize(TypeIdx: 1)
1738 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: 0), Mutation: oneMoreElement(TypeIdx: 0))
1739 .fewerElementsIf(Predicate: numElementsNotEven(TypeIdx: 0), Mutation: scalarize(TypeIdx: 0))
1740 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 2)
1741 .clampMaxNumElements(TypeIdx: 0, EltTy: LocalPtr, MaxElements: 2)
1742 .clampMaxNumElements(TypeIdx: 0, EltTy: PrivatePtr, MaxElements: 2)
1743 .scalarize(TypeIdx: 0)
1744 .widenScalarToNextPow2(TypeIdx: 0)
1745 .legalIf(Predicate: all(P0: isPointer(TypeIdx: 0), P1: typeInSet(TypeIdx: 1, TypesInit: {S1, S32})));
1746
1747 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1748 // be more flexible with the shift amount type.
1749 auto &Shifts = getActionDefinitionsBuilder(Opcodes: {G_SHL, G_LSHR, G_ASHR})
1750 .legalFor(Types: {{S32, S32}, {S64, S32}});
1751 if (ST.has16BitInsts()) {
1752 if (ST.hasVOP3PInsts()) {
1753 Shifts.legalFor(Types: {{S16, S16}, {V2S16, V2S16}})
1754 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2);
1755 } else
1756 Shifts.legalFor(Types: {{S16, S16}});
1757
1758 // TODO: Support 16-bit shift amounts for all types
1759 Shifts.widenScalarIf(
1760 Predicate: [=](const LegalityQuery &Query) {
1761 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1762 // 32-bit amount.
1763 const LLT ValTy = Query.Types[0];
1764 const LLT AmountTy = Query.Types[1];
1765 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1766 AmountTy.getSizeInBits() < 16;
1767 }, Mutation: changeTo(TypeIdx: 1, Ty: S16));
1768 Shifts.maxScalarIf(Predicate: typeIs(TypeIdx: 0, TypesInit: S16), TypeIdx: 1, Ty: S16);
1769 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1770 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 16);
1771 Shifts.clampScalar(TypeIdx: 0, MinTy: S16, MaxTy: S64);
1772
1773 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1774 .minScalar(TypeIdx: 0, Ty: S16)
1775 .scalarize(TypeIdx: 0)
1776 .lower();
1777 } else {
1778 // Make sure we legalize the shift amount type first, as the general
1779 // expansion for the shifted type will produce much worse code if it hasn't
1780 // been truncated already.
1781 Shifts.clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32);
1782 Shifts.widenScalarToNextPow2(TypeIdx: 0, MinSize: 32);
1783 Shifts.clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
1784
1785 getActionDefinitionsBuilder(Opcodes: {G_SSHLSAT, G_USHLSAT})
1786 .minScalar(TypeIdx: 0, Ty: S32)
1787 .scalarize(TypeIdx: 0)
1788 .lower();
1789 }
1790 Shifts.scalarize(TypeIdx: 0);
1791
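  // Dynamic vector element insert/extract: keep 32-bit and 64-bit elements in
  // register-sized vectors, bitcast other element sizes to a 32-bit or 64-bit
  // element form, and fall back to stack lowering as a last resort.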
1792 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1793 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1794 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1795 unsigned IdxTypeIdx = 2;
1796
1797 getActionDefinitionsBuilder(Opcode: Op)
1798 .customIf(Predicate: [=](const LegalityQuery &Query) {
1799 const LLT EltTy = Query.Types[EltTypeIdx];
1800 const LLT VecTy = Query.Types[VecTypeIdx];
1801 const LLT IdxTy = Query.Types[IdxTypeIdx];
1802 const unsigned EltSize = EltTy.getSizeInBits();
1803 const bool isLegalVecType =
1804 !!SIRegisterInfo::getSGPRClassForBitWidth(BitWidth: VecTy.getSizeInBits());
1805 // Address space 8 pointers are 128-bit wide values, but the logic
1806 // below will try to bitcast them to 2N x s64, which will fail.
1807           // Therefore, as an intermediate step, wrap the extract/insert by
1808           // ptrtoint-ing the vector and scalar arguments (and inttoptr-ing the
1809           // extraction result) in order to produce a vector operation that can
1810           // be handled by the logic below.
1811 if (EltTy.isPointer() && EltSize > 64)
1812 return true;
1813 return (EltSize == 32 || EltSize == 64) &&
1814 VecTy.getSizeInBits() % 32 == 0 &&
1815 VecTy.getSizeInBits() <= MaxRegisterSize &&
1816 IdxTy.getSizeInBits() == 32 &&
1817 isLegalVecType;
1818 })
1819 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1820 P1: scalarOrEltNarrowerThan(TypeIdx: VecTypeIdx, Size: 32)),
1821 Mutation: bitcastToVectorElement32(TypeIdx: VecTypeIdx))
1822 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1823 .bitcastIf(Predicate: all(P0: sizeIsMultipleOf32(TypeIdx: VecTypeIdx),
1824 P1: scalarOrEltWiderThan(TypeIdx: VecTypeIdx, Size: 64)),
1825 Mutation: [=](const LegalityQuery &Query) {
1826 // For > 64-bit element types, try to turn this into a
1827 // 64-bit element vector since we may be able to do better
1828 // indexing if this is scalar. If not, fall back to 32.
1829 const LLT EltTy = Query.Types[EltTypeIdx];
1830 const LLT VecTy = Query.Types[VecTypeIdx];
1831 const unsigned DstEltSize = EltTy.getSizeInBits();
1832 const unsigned VecSize = VecTy.getSizeInBits();
1833
1834 const unsigned TargetEltSize =
1835 DstEltSize % 64 == 0 ? 64 : 32;
1836 return std::pair(VecTypeIdx,
1837 LLT::fixed_vector(NumElements: VecSize / TargetEltSize,
1838 ScalarSizeInBits: TargetEltSize));
1839 })
1840 .clampScalar(TypeIdx: EltTypeIdx, MinTy: S32, MaxTy: S64)
1841 .clampScalar(TypeIdx: VecTypeIdx, MinTy: S32, MaxTy: S64)
1842 .clampScalar(TypeIdx: IdxTypeIdx, MinTy: S32, MaxTy: S32)
1843 .clampMaxNumElements(TypeIdx: VecTypeIdx, EltTy: S32, MaxElements: 32)
1844 // TODO: Clamp elements for 64-bit vectors?
1845 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: VecTypeIdx),
1846 Mutation: moreElementsToNextExistingRegClass(TypeIdx: VecTypeIdx))
1847 // It should only be necessary with variable indexes.
1848 // As a last resort, lower to the stack
1849 .lower();
1850 }
1851
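  // An extract must produce exactly the vector's element type; implicitly
  // extending or truncating extracts are not supported.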
1852 getActionDefinitionsBuilder(Opcode: G_EXTRACT_VECTOR_ELT)
1853 .unsupportedIf(Predicate: [=](const LegalityQuery &Query) {
1854 const LLT &EltTy = Query.Types[1].getElementType();
1855 return Query.Types[0] != EltTy;
1856 });
1857
1858 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1859 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1860 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1861
1862 // FIXME: Doesn't handle extract of illegal sizes.
1863 getActionDefinitionsBuilder(Opcode: Op)
1864 .lowerIf(Predicate: all(P0: typeIs(TypeIdx: LitTyIdx, TypesInit: S16), P1: sizeIs(TypeIdx: BigTyIdx, Size: 32)))
1865 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1866           // Sub-vector (or single-element) insert and extract.
1867 // TODO: verify immediate offset here since lower only works with
1868 // whole elements.
1869 const LLT BigTy = Query.Types[BigTyIdx];
1870 return BigTy.isVector();
1871 })
1872 // FIXME: Multiples of 16 should not be legal.
1873 .legalIf(Predicate: [=](const LegalityQuery &Query) {
1874 const LLT BigTy = Query.Types[BigTyIdx];
1875 const LLT LitTy = Query.Types[LitTyIdx];
1876 return (BigTy.getSizeInBits() % 32 == 0) &&
1877 (LitTy.getSizeInBits() % 16 == 0);
1878 })
1879 .widenScalarIf(
1880 Predicate: [=](const LegalityQuery &Query) {
1881 const LLT BigTy = Query.Types[BigTyIdx];
1882 return (BigTy.getScalarSizeInBits() < 16);
1883 },
1884 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: BigTyIdx, Min: 16))
1885 .widenScalarIf(
1886 Predicate: [=](const LegalityQuery &Query) {
1887 const LLT LitTy = Query.Types[LitTyIdx];
1888 return (LitTy.getScalarSizeInBits() < 16);
1889 },
1890 Mutation: LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx: LitTyIdx, Min: 16))
1891 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx), Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1892 .widenScalarToNextPow2(TypeIdx: BigTyIdx, MinSize: 32);
1893
1894 }
1895
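  // Build vectors are legal for any register-sized type. Without scalar pack
  // instructions, v2s16 G_BUILD_VECTOR (and G_BUILD_VECTOR_TRUNC) is handled
  // by custom lowering.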
1896 auto &BuildVector =
1897 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR)
1898 .legalForCartesianProduct(Types0: AllS32Vectors, Types1: {S32})
1899 .legalForCartesianProduct(Types0: AllS64Vectors, Types1: {S64})
1900 .clampNumElements(TypeIdx: 0, MinTy: V16S32, MaxTy: V32S32)
1901 .clampNumElements(TypeIdx: 0, MinTy: V2S64, MaxTy: V16S64)
1902 .fewerElementsIf(Predicate: isWideVec16(TypeIdx: 0), Mutation: changeTo(TypeIdx: 0, Ty: V2S16))
1903 .moreElementsIf(Predicate: isIllegalRegisterType(ST, TypeIdx: 0),
1904 Mutation: moreElementsToNextExistingRegClass(TypeIdx: 0));
1905
1906 if (ST.hasScalarPackInsts()) {
1907 BuildVector
1908 // FIXME: Should probably widen s1 vectors straight to s32
1909 .minScalarOrElt(TypeIdx: 0, Ty: S16)
1910 .minScalar(TypeIdx: 1, Ty: S16);
1911
1912 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1913 .legalFor(Types: {V2S16, S32})
1914 .lower();
1915 } else {
1916 BuildVector.customFor(Types: {V2S16, S16});
1917 BuildVector.minScalarOrElt(TypeIdx: 0, Ty: S32);
1918
1919 getActionDefinitionsBuilder(Opcode: G_BUILD_VECTOR_TRUNC)
1920 .customFor(Types: {V2S16, S32})
1921 .lower();
1922 }
1923
1924 BuildVector.legalIf(Predicate: isRegisterType(ST, TypeIdx: 0));
1925
1926 // FIXME: Clamp maximum size
1927 getActionDefinitionsBuilder(Opcode: G_CONCAT_VECTORS)
1928 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
1929 .clampMaxNumElements(TypeIdx: 0, EltTy: S32, MaxElements: 32)
1930 .clampMaxNumElements(TypeIdx: 1, EltTy: S16, MaxElements: 2) // TODO: Make 4?
1931 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 64);
1932
1933 getActionDefinitionsBuilder(Opcode: G_SHUFFLE_VECTOR).lower();
1934
1935 // Merge/Unmerge
1936 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1937 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1938 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1939
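    // An element type is only handled directly if it is a power-of-2 size
    // between 8 and 512 bits; anything else gets scalarized below.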
1940 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1941 const LLT Ty = Query.Types[TypeIdx];
1942 if (Ty.isVector()) {
1943 const LLT &EltTy = Ty.getElementType();
1944 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1945 return true;
1946 if (!llvm::has_single_bit<uint32_t>(Value: EltTy.getSizeInBits()))
1947 return true;
1948 }
1949 return false;
1950 };
1951
1952 auto &Builder =
1953 getActionDefinitionsBuilder(Opcode: Op)
1954 .legalIf(Predicate: all(P0: isRegisterType(ST, TypeIdx: 0), P1: isRegisterType(ST, TypeIdx: 1)))
1955 .lowerFor(Types: {{S16, V2S16}})
1956 .lowerIf(Predicate: [=](const LegalityQuery &Query) {
1957 const LLT BigTy = Query.Types[BigTyIdx];
1958 return BigTy.getSizeInBits() == 32;
1959 })
1960 // Try to widen to s16 first for small types.
1961 // TODO: Only do this on targets with legal s16 shifts
1962 .minScalarOrEltIf(Predicate: scalarNarrowerThan(TypeIdx: LitTyIdx, Size: 16), TypeIdx: LitTyIdx, Ty: S16)
1963 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 16)
1964 .moreElementsIf(Predicate: isSmallOddVector(TypeIdx: BigTyIdx),
1965 Mutation: oneMoreElement(TypeIdx: BigTyIdx))
1966 .fewerElementsIf(Predicate: all(P0: typeIs(TypeIdx: 0, TypesInit: S16), P1: vectorWiderThan(TypeIdx: 1, Size: 32),
1967 args: elementTypeIs(TypeIdx: 1, EltTy: S16)),
1968 Mutation: changeTo(TypeIdx: 1, Ty: V2S16))
1969               // Clamp the little scalar to s32-s512 and make it a power of 2. It's
1970               // not worth considering the multiples of 64 since 2*192 and 2*384
1971               // are not valid.
1972 .clampScalar(TypeIdx: LitTyIdx, MinTy: S32, MaxTy: S512)
1973 .widenScalarToNextPow2(TypeIdx: LitTyIdx, /*Min*/ MinSize: 32)
1974 // Break up vectors with weird elements into scalars
1975 .fewerElementsIf(
1976 Predicate: [=](const LegalityQuery &Query) {
1977 return notValidElt(Query, LitTyIdx);
1978 },
1979 Mutation: scalarize(TypeIdx: 0))
1980 .fewerElementsIf(
1981 Predicate: [=](const LegalityQuery &Query) {
1982 return notValidElt(Query, BigTyIdx);
1983 },
1984 Mutation: scalarize(TypeIdx: 1))
1985 .clampScalar(TypeIdx: BigTyIdx, MinTy: S32, MaxTy: MaxScalar);
1986
1987 if (Op == G_MERGE_VALUES) {
1988 Builder.widenScalarIf(
1989 // TODO: Use 16-bit shifts if legal for 8-bit values?
1990 Predicate: [=](const LegalityQuery &Query) {
1991 const LLT Ty = Query.Types[LitTyIdx];
1992 return Ty.getSizeInBits() < 32;
1993 },
1994 Mutation: changeTo(TypeIdx: LitTyIdx, Ty: S32));
1995 }
1996
1997 Builder.widenScalarIf(
1998 Predicate: [=](const LegalityQuery &Query) {
1999 const LLT Ty = Query.Types[BigTyIdx];
2000 return Ty.getSizeInBits() % 16 != 0;
2001 },
2002 Mutation: [=](const LegalityQuery &Query) {
2003           // Pick the next power of 2 or, for sizes above 128, a multiple of 64,
2004           // whichever is smaller.
2005 const LLT &Ty = Query.Types[BigTyIdx];
2006 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Value: Ty.getSizeInBits() + 1);
2007 if (NewSizeInBits >= 256) {
2008 unsigned RoundedTo = alignTo<64>(Value: Ty.getSizeInBits() + 1);
2009 if (RoundedTo < NewSizeInBits)
2010 NewSizeInBits = RoundedTo;
2011 }
2012 return std::pair(BigTyIdx, LLT::scalar(SizeInBits: NewSizeInBits));
2013 })
2014 // Any vectors left are the wrong size. Scalarize them.
2015 .scalarize(TypeIdx: 0)
2016 .scalarize(TypeIdx: 1);
2017 }
2018
2019 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2020 // RegBankSelect.
2021 auto &SextInReg = getActionDefinitionsBuilder(Opcode: G_SEXT_INREG)
2022 .legalFor(Types: {{S32}, {S64}})
2023 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64);
2024
2025 if (ST.hasVOP3PInsts()) {
2026 SextInReg.lowerFor(Types: {{V2S16}})
2027 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2028 // get more vector shift opportunities, since we'll get those when
2029 // expanded.
2030 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2);
2031 } else if (ST.has16BitInsts()) {
2032 SextInReg.lowerFor(Types: {{S32}, {S64}, {S16}});
2033 } else {
2034 // Prefer to promote to s32 before lowering if we don't have 16-bit
2035     // shifts. This avoids a lot of intermediate truncate and extend operations.
2036 SextInReg.lowerFor(Types: {{S32}, {S64}});
2037 }
2038
2039 SextInReg
2040 .scalarize(TypeIdx: 0)
2041 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2042 .lower();
2043
2044 getActionDefinitionsBuilder(Opcodes: {G_ROTR, G_ROTL})
2045 .scalarize(TypeIdx: 0)
2046 .lower();
2047
2048   // TODO: Only try to form v2s16 with legal packed instructions.
2049 getActionDefinitionsBuilder(Opcode: G_FSHR)
2050 .legalFor(Types: {{S32, S32}})
2051 .lowerFor(Types: {{V2S16, V2S16}})
2052 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2053 .scalarize(TypeIdx: 0)
2054 .lower();
2055
2056 if (ST.hasVOP3PInsts()) {
2057 getActionDefinitionsBuilder(Opcode: G_FSHL)
2058 .lowerFor(Types: {{V2S16, V2S16}})
2059 .clampMaxNumElementsStrict(TypeIdx: 0, EltTy: S16, NumElts: 2)
2060 .scalarize(TypeIdx: 0)
2061 .lower();
2062 } else {
2063 getActionDefinitionsBuilder(Opcode: G_FSHL)
2064 .scalarize(TypeIdx: 0)
2065 .lower();
2066 }
2067
2068 getActionDefinitionsBuilder(Opcode: G_READCYCLECOUNTER)
2069 .legalFor(Types: {S64});
2070
2071 getActionDefinitionsBuilder(Opcode: G_READSTEADYCOUNTER).legalFor(Types: {S64});
2072
2073 getActionDefinitionsBuilder(Opcode: G_FENCE)
2074 .alwaysLegal();
2075
2076 getActionDefinitionsBuilder(Opcodes: {G_SMULO, G_UMULO})
2077 .scalarize(TypeIdx: 0)
2078 .minScalar(TypeIdx: 0, Ty: S32)
2079 .lower();
2080
2081 getActionDefinitionsBuilder(Opcodes: {G_SBFX, G_UBFX})
2082 .legalFor(Types: {{S32, S32}, {S64, S32}})
2083 .clampScalar(TypeIdx: 1, MinTy: S32, MaxTy: S32)
2084 .clampScalar(TypeIdx: 0, MinTy: S32, MaxTy: S64)
2085 .widenScalarToNextPow2(TypeIdx: 0)
2086 .scalarize(TypeIdx: 0);
2087
2088 getActionDefinitionsBuilder(
2089 Opcodes: {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2090 G_FCOPYSIGN,
2091
2092 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2093 G_READ_REGISTER, G_WRITE_REGISTER,
2094
2095 G_SADDO, G_SSUBO})
2096 .lower();
2097
2098 if (ST.hasIEEEMinMax()) {
2099 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM})
2100 .legalFor(Types: FPTypesPK16)
2101 .clampMaxNumElements(TypeIdx: 0, EltTy: S16, MaxElements: 2)
2102 .scalarize(TypeIdx: 0);
2103 } else {
2104 // TODO: Implement
2105 getActionDefinitionsBuilder(Opcodes: {G_FMINIMUM, G_FMAXIMUM}).lower();
2106 }
2107
2108 getActionDefinitionsBuilder(Opcodes: {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2109 .lower();
2110
2111 getActionDefinitionsBuilder(Opcodes: {G_TRAP, G_DEBUGTRAP}).custom();
2112
2113 getActionDefinitionsBuilder(Opcodes: {G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2114 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2115 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2116 .unsupported();
2117
2118 getActionDefinitionsBuilder(Opcode: G_PREFETCH).alwaysLegal();
2119
2120 getActionDefinitionsBuilder(
2121 Opcodes: {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2122 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2123 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2124 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2125 .legalFor(Types: AllVectors)
2126 .scalarize(TypeIdx: 1)
2127 .lower();
2128
2129 getLegacyLegalizerInfo().computeTables();
2130 verify(MII: *ST.getInstrInfo());
2131}
2132
2133bool AMDGPULegalizerInfo::legalizeCustom(
2134 LegalizerHelper &Helper, MachineInstr &MI,
2135 LostDebugLocObserver &LocObserver) const {
2136 MachineIRBuilder &B = Helper.MIRBuilder;
2137 MachineRegisterInfo &MRI = *B.getMRI();
2138
2139 switch (MI.getOpcode()) {
2140 case TargetOpcode::G_ADDRSPACE_CAST:
2141 return legalizeAddrSpaceCast(MI, MRI, B);
2142 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2143 return legalizeFroundeven(MI, MRI, B);
2144 case TargetOpcode::G_FCEIL:
2145 return legalizeFceil(MI, MRI, B);
2146 case TargetOpcode::G_FREM:
2147 return legalizeFrem(MI, MRI, B);
2148 case TargetOpcode::G_INTRINSIC_TRUNC:
2149 return legalizeIntrinsicTrunc(MI, MRI, B);
2150 case TargetOpcode::G_SITOFP:
2151 return legalizeITOFP(MI, MRI, B, Signed: true);
2152 case TargetOpcode::G_UITOFP:
2153 return legalizeITOFP(MI, MRI, B, Signed: false);
2154 case TargetOpcode::G_FPTOSI:
2155 return legalizeFPTOI(MI, MRI, B, Signed: true);
2156 case TargetOpcode::G_FPTOUI:
2157 return legalizeFPTOI(MI, MRI, B, Signed: false);
2158 case TargetOpcode::G_FMINNUM:
2159 case TargetOpcode::G_FMAXNUM:
2160 case TargetOpcode::G_FMINIMUMNUM:
2161 case TargetOpcode::G_FMAXIMUMNUM:
2162 case TargetOpcode::G_FMINNUM_IEEE:
2163 case TargetOpcode::G_FMAXNUM_IEEE:
2164 return legalizeMinNumMaxNum(Helper, MI);
2165 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2166 return legalizeExtractVectorElt(MI, MRI, B);
2167 case TargetOpcode::G_INSERT_VECTOR_ELT:
2168 return legalizeInsertVectorElt(MI, MRI, B);
2169 case TargetOpcode::G_FSIN:
2170 case TargetOpcode::G_FCOS:
2171 return legalizeSinCos(MI, MRI, B);
2172 case TargetOpcode::G_GLOBAL_VALUE:
2173 return legalizeGlobalValue(MI, MRI, B);
2174 case TargetOpcode::G_LOAD:
2175 case TargetOpcode::G_SEXTLOAD:
2176 case TargetOpcode::G_ZEXTLOAD:
2177 return legalizeLoad(Helper, MI);
2178 case TargetOpcode::G_STORE:
2179 return legalizeStore(Helper, MI);
2180 case TargetOpcode::G_FMAD:
2181 return legalizeFMad(MI, MRI, B);
2182 case TargetOpcode::G_FDIV:
2183 return legalizeFDIV(MI, MRI, B);
2184 case TargetOpcode::G_FFREXP:
2185 return legalizeFFREXP(MI, MRI, B);
2186 case TargetOpcode::G_FSQRT:
2187 return legalizeFSQRT(MI, MRI, B);
2188 case TargetOpcode::G_UDIV:
2189 case TargetOpcode::G_UREM:
2190 case TargetOpcode::G_UDIVREM:
2191 return legalizeUnsignedDIV_REM(MI, MRI, B);
2192 case TargetOpcode::G_SDIV:
2193 case TargetOpcode::G_SREM:
2194 case TargetOpcode::G_SDIVREM:
2195 return legalizeSignedDIV_REM(MI, MRI, B);
2196 case TargetOpcode::G_ATOMIC_CMPXCHG:
2197 return legalizeAtomicCmpXChg(MI, MRI, B);
2198 case TargetOpcode::G_FLOG2:
2199 return legalizeFlog2(MI, B);
2200 case TargetOpcode::G_FLOG:
2201 case TargetOpcode::G_FLOG10:
2202 return legalizeFlogCommon(MI, B);
2203 case TargetOpcode::G_FEXP2:
2204 return legalizeFExp2(MI, B);
2205 case TargetOpcode::G_FEXP:
2206 case TargetOpcode::G_FEXP10:
2207 return legalizeFExp(MI, B);
2208 case TargetOpcode::G_FPOW:
2209 return legalizeFPow(MI, B);
2210 case TargetOpcode::G_FFLOOR:
2211 return legalizeFFloor(MI, MRI, B);
2212 case TargetOpcode::G_BUILD_VECTOR:
2213 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2214 return legalizeBuildVector(MI, MRI, B);
2215 case TargetOpcode::G_MUL:
2216 return legalizeMul(Helper, MI);
2217 case TargetOpcode::G_CTLZ:
2218 case TargetOpcode::G_CTTZ:
2219 return legalizeCTLZ_CTTZ(MI, MRI, B);
2220 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2221 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2222 case TargetOpcode::G_STACKSAVE:
2223 return legalizeStackSave(MI, B);
2224 case TargetOpcode::G_GET_FPENV:
2225 return legalizeGetFPEnv(MI, MRI, B);
2226 case TargetOpcode::G_SET_FPENV:
2227 return legalizeSetFPEnv(MI, MRI, B);
2228 case TargetOpcode::G_TRAP:
2229 return legalizeTrap(MI, MRI, B);
2230 case TargetOpcode::G_DEBUGTRAP:
2231 return legalizeDebugTrap(MI, MRI, B);
2232 default:
2233 return false;
2234 }
2235
2236 llvm_unreachable("expected switch to return");
2237}
2238
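// Return a 32-bit value holding the high half of the flat aperture for the
// given segment (LOCAL or PRIVATE address space). Depending on the subtarget
// and code object version, this comes from the aperture registers, from the
// implicit kernel arguments, or from the amd_queue_t reached through the queue
// pointer. Returns an invalid register on failure.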
2239Register AMDGPULegalizerInfo::getSegmentAperture(
2240 unsigned AS,
2241 MachineRegisterInfo &MRI,
2242 MachineIRBuilder &B) const {
2243 MachineFunction &MF = B.getMF();
2244 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2245 const LLT S32 = LLT::scalar(SizeInBits: 32);
2246 const LLT S64 = LLT::scalar(SizeInBits: 64);
2247
2248 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2249
2250 if (ST.hasApertureRegs()) {
2251 // Note: this register is somewhat broken. When used as a 32-bit operand,
2252 // it only returns zeroes. The real value is in the upper 32 bits.
2253     // Thus, we must emit an extract of the high 32 bits.
2254 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2255 ? AMDGPU::SRC_SHARED_BASE
2256 : AMDGPU::SRC_PRIVATE_BASE;
2257 // FIXME: It would be more natural to emit a COPY here, but then copy
2258 // coalescing would kick in and it would think it's okay to use the "HI"
2259 // subregister (instead of extracting the HI 32 bits) which is an artificial
2260 // (unusable) register.
2261 // Register TableGen definitions would need an overhaul to get rid of the
2262 // artificial "HI" aperture registers and prevent this kind of issue from
2263 // happening.
2264 Register Dst = MRI.createGenericVirtualRegister(Ty: S64);
2265 MRI.setRegClass(Reg: Dst, RC: &AMDGPU::SReg_64RegClass);
2266 B.buildInstr(Opc: AMDGPU::S_MOV_B64, DstOps: {Dst}, SrcOps: {Register(ApertureRegNo)});
2267 return B.buildUnmerge(Res: S32, Op: Dst).getReg(Idx: 1);
2268 }
2269
2270 // TODO: can we be smarter about machine pointer info?
2271 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2272 Register LoadAddr = MRI.createGenericVirtualRegister(
2273 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2274 // For code object version 5, private_base and shared_base are passed through
2275 // implicit kernargs.
2276 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
2277 AMDGPU::AMDHSA_COV5) {
2278 AMDGPUTargetLowering::ImplicitParameter Param =
2279 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2280 : AMDGPUTargetLowering::PRIVATE_BASE;
2281 uint64_t Offset =
2282 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
2283
2284 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2285 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2286
2287 if (!loadInputValue(DstReg: KernargPtrReg, B,
2288 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2289 return Register();
2290
2291 MachineMemOperand *MMO = MF.getMachineMemOperand(
2292 PtrInfo,
2293 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2294 MachineMemOperand::MOInvariant,
2295 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset));
2296
2297 // Pointer address
2298 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
2299 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
2300 // Load address
2301 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2302 }
2303
2304 Register QueuePtr = MRI.createGenericVirtualRegister(
2305 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2306
2307 if (!loadInputValue(DstReg: QueuePtr, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
2308 return Register();
2309
2310 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2311 // private_segment_aperture_base_hi.
2312 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2313
2314 MachineMemOperand *MMO = MF.getMachineMemOperand(
2315 PtrInfo,
2316 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2317 MachineMemOperand::MOInvariant,
2318 MemTy: LLT::scalar(SizeInBits: 32), base_alignment: commonAlignment(A: Align(64), Offset: StructOffset));
2319
2320 B.buildPtrAdd(Res: LoadAddr, Op0: QueuePtr,
2321 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: StructOffset).getReg(Idx: 0));
2322 return B.buildLoad(Res: S32, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
2323}
2324
2325/// Return true if the value is a known valid address, such that a null check is
2326/// not necessary.
2327static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2328 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2329 MachineInstr *Def = MRI.getVRegDef(Reg: Val);
2330 switch (Def->getOpcode()) {
2331 case AMDGPU::G_FRAME_INDEX:
2332 case AMDGPU::G_GLOBAL_VALUE:
2333 case AMDGPU::G_BLOCK_ADDR:
2334 return true;
2335 case AMDGPU::G_CONSTANT: {
2336 const ConstantInt *CI = Def->getOperand(i: 1).getCImm();
2337 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2338 }
2339 default:
2340 return false;
2341 }
2342
2343 return false;
2344}
2345
2346bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2347 MachineInstr &MI, MachineRegisterInfo &MRI,
2348 MachineIRBuilder &B) const {
2349 MachineFunction &MF = B.getMF();
2350
2351 // MI can either be a G_ADDRSPACE_CAST or a
2352 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2353 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2354 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2355 Intrinsic::amdgcn_addrspacecast_nonnull));
2356
2357 const LLT S32 = LLT::scalar(SizeInBits: 32);
2358 Register Dst = MI.getOperand(i: 0).getReg();
2359 Register Src = isa<GIntrinsic>(Val: MI) ? MI.getOperand(i: 2).getReg()
2360 : MI.getOperand(i: 1).getReg();
2361 LLT DstTy = MRI.getType(Reg: Dst);
2362 LLT SrcTy = MRI.getType(Reg: Src);
2363 unsigned DestAS = DstTy.getAddressSpace();
2364 unsigned SrcAS = SrcTy.getAddressSpace();
2365
2366 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2367 // vector element.
2368 assert(!DstTy.isVector());
2369
2370 const AMDGPUTargetMachine &TM
2371 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2372
2373 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2374 MI.setDesc(B.getTII().get(Opcode: TargetOpcode::G_BITCAST));
2375 return true;
2376 }
2377
2378 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2379 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2380 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2381 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2382 // G_ADDRSPACE_CAST we need to guess.
2383 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2384 // Extract low 32-bits of the pointer.
2385 B.buildExtract(Res: Dst, Src, Index: 0);
2386 MI.eraseFromParent();
2387 return true;
2388 }
2389
2390 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
2391
2392 auto SegmentNull = B.buildConstant(Res: DstTy, Val: NullVal);
2393 auto FlatNull = B.buildConstant(Res: SrcTy, Val: 0);
2394
2395 // Extract low 32-bits of the pointer.
2396 auto PtrLo32 = B.buildExtract(Res: DstTy, Src, Index: 0);
2397
2398 auto CmpRes =
2399 B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: FlatNull.getReg(Idx: 0));
2400 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: PtrLo32, Op1: SegmentNull.getReg(Idx: 0));
2401
2402 MI.eraseFromParent();
2403 return true;
2404 }
2405
2406 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2407 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2408 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2409 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2410 Register ApertureReg = getSegmentAperture(AS: SrcAS, MRI, B);
2411 if (!ApertureReg.isValid())
2412         return Register();
2413
2414 // Coerce the type of the low half of the result so we can use
2415 // merge_values.
2416 Register SrcAsInt = B.buildPtrToInt(Dst: S32, Src).getReg(Idx: 0);
2417
2418 // TODO: Should we allow mismatched types but matching sizes in merges to
2419 // avoid the ptrtoint?
2420 return B.buildMergeLikeInstr(Res: Dst, Ops: {SrcAsInt, ApertureReg}).getReg(Idx: 0);
2421 };
2422
2423 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2424 // G_ADDRSPACE_CAST we need to guess.
2425 if (isa<GIntrinsic>(Val: MI) || isKnownNonNull(Val: Src, MRI, TM, AddrSpace: SrcAS)) {
2426 castLocalOrPrivateToFlat(Dst);
2427 MI.eraseFromParent();
2428 return true;
2429 }
2430
2431 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2432
2433 auto SegmentNull = B.buildConstant(Res: SrcTy, Val: TM.getNullPointerValue(AddrSpace: SrcAS));
2434 auto FlatNull = B.buildConstant(Res: DstTy, Val: TM.getNullPointerValue(AddrSpace: DestAS));
2435
2436 auto CmpRes = B.buildICmp(Pred: CmpInst::ICMP_NE, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
2437 Op1: SegmentNull.getReg(Idx: 0));
2438
2439 B.buildSelect(Res: Dst, Tst: CmpRes, Op0: BuildPtr, Op1: FlatNull);
2440
2441 MI.eraseFromParent();
2442 return true;
2443 }
2444
2445 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2446 SrcTy.getSizeInBits() == 64) {
2447 // Truncate.
2448 B.buildExtract(Res: Dst, Src, Index: 0);
2449 MI.eraseFromParent();
2450 return true;
2451 }
2452
2453 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2454 DstTy.getSizeInBits() == 64) {
2455 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2456 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2457 auto PtrLo = B.buildPtrToInt(Dst: S32, Src);
2458 auto HighAddr = B.buildConstant(Res: S32, Val: AddrHiVal);
2459 B.buildMergeLikeInstr(Res: Dst, Ops: {PtrLo, HighAddr});
2460 MI.eraseFromParent();
2461 return true;
2462 }
2463
2464 // Invalid casts are poison.
2465 // TODO: Should return poison
2466 B.buildUndef(Res: Dst);
2467 MI.eraseFromParent();
2468 return true;
2469}
2470
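// Lower G_INTRINSIC_ROUNDEVEN for f64 using the add/subtract magic-constant
// trick: adding and then subtracting 2^52 (with the sign of the input) rounds
// to the nearest integer, assuming the default round-to-nearest-even mode.
// Inputs with magnitude of at least 2^52 are already integers and are passed
// through unchanged.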
2471bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2472 MachineRegisterInfo &MRI,
2473 MachineIRBuilder &B) const {
2474 Register Src = MI.getOperand(i: 1).getReg();
2475 LLT Ty = MRI.getType(Reg: Src);
2476 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2477
2478 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2479 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2480
2481 auto C1 = B.buildFConstant(Res: Ty, Val: C1Val);
2482 auto CopySign = B.buildFCopysign(Dst: Ty, Src0: C1, Src1: Src);
2483
2484 // TODO: Should this propagate fast-math-flags?
2485 auto Tmp1 = B.buildFAdd(Dst: Ty, Src0: Src, Src1: CopySign);
2486 auto Tmp2 = B.buildFSub(Dst: Ty, Src0: Tmp1, Src1: CopySign);
2487
2488 auto C2 = B.buildFConstant(Res: Ty, Val: C2Val);
2489 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Src);
2490
2491 auto Cond = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: C2);
2492 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: Cond, Op0: Src, Op1: Tmp2);
2493 MI.eraseFromParent();
2494 return true;
2495}
2496
2497bool AMDGPULegalizerInfo::legalizeFceil(
2498 MachineInstr &MI, MachineRegisterInfo &MRI,
2499 MachineIRBuilder &B) const {
2500
2501 const LLT S1 = LLT::scalar(SizeInBits: 1);
2502 const LLT S64 = LLT::scalar(SizeInBits: 64);
2503
2504 Register Src = MI.getOperand(i: 1).getReg();
2505 assert(MRI.getType(Src) == S64);
2506
2507 // result = trunc(src)
2508 // if (src > 0.0 && src != result)
2509 // result += 1.0
2510
2511 auto Trunc = B.buildIntrinsicTrunc(Dst: S64, Src0: Src);
2512
2513 const auto Zero = B.buildFConstant(Res: S64, Val: 0.0);
2514 const auto One = B.buildFConstant(Res: S64, Val: 1.0);
2515 auto Lt0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Src, Op1: Zero);
2516 auto NeTrunc = B.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: S1, Op0: Src, Op1: Trunc);
2517 auto And = B.buildAnd(Dst: S1, Src0: Lt0, Src1: NeTrunc);
2518 auto Add = B.buildSelect(Res: S64, Tst: And, Op0: One, Op1: Zero);
2519
2520 // TODO: Should this propagate fast-math-flags?
2521 B.buildFAdd(Dst: MI.getOperand(i: 0).getReg(), Src0: Trunc, Src1: Add);
2522 MI.eraseFromParent();
2523 return true;
2524}
2525
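// Expand G_FREM as fma(-trunc(x / y), y, x), i.e. x - trunc(x / y) * y.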
2526bool AMDGPULegalizerInfo::legalizeFrem(
2527 MachineInstr &MI, MachineRegisterInfo &MRI,
2528 MachineIRBuilder &B) const {
2529 Register DstReg = MI.getOperand(i: 0).getReg();
2530 Register Src0Reg = MI.getOperand(i: 1).getReg();
2531 Register Src1Reg = MI.getOperand(i: 2).getReg();
2532 auto Flags = MI.getFlags();
2533 LLT Ty = MRI.getType(Reg: DstReg);
2534
2535 auto Div = B.buildFDiv(Dst: Ty, Src0: Src0Reg, Src1: Src1Reg, Flags);
2536 auto Trunc = B.buildIntrinsicTrunc(Dst: Ty, Src0: Div, Flags);
2537 auto Neg = B.buildFNeg(Dst: Ty, Src0: Trunc, Flags);
2538 B.buildFMA(Dst: DstReg, Src0: Neg, Src1: Src1Reg, Src2: Src0Reg, Flags);
2539 MI.eraseFromParent();
2540 return true;
2541}
2542
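// Extract the unbiased exponent from the high 32 bits of an f64 value: pull
// out the 11-bit exponent field with ubfe and subtract the bias (1023).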
2543static MachineInstrBuilder extractF64Exponent(Register Hi,
2544 MachineIRBuilder &B) {
2545 const unsigned FractBits = 52;
2546 const unsigned ExpBits = 11;
2547 LLT S32 = LLT::scalar(SizeInBits: 32);
2548
2549 auto Const0 = B.buildConstant(Res: S32, Val: FractBits - 32);
2550 auto Const1 = B.buildConstant(Res: S32, Val: ExpBits);
2551
2552 auto ExpPart = B.buildIntrinsic(ID: Intrinsic::amdgcn_ubfe, Res: {S32})
2553 .addUse(RegNo: Hi)
2554 .addUse(RegNo: Const0.getReg(Idx: 0))
2555 .addUse(RegNo: Const1.getReg(Idx: 0));
2556
2557 return B.buildSub(Dst: S32, Src0: ExpPart, Src1: B.buildConstant(Res: S32, Val: 1023));
2558}
2559
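// Truncate an f64 toward zero by masking off the fraction bits that lie below
// the binary point. Exponents below zero produce a signed zero; exponents
// above 51 mean the value is already an integer and is returned unchanged.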
2560bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2561 MachineInstr &MI, MachineRegisterInfo &MRI,
2562 MachineIRBuilder &B) const {
2563 const LLT S1 = LLT::scalar(SizeInBits: 1);
2564 const LLT S32 = LLT::scalar(SizeInBits: 32);
2565 const LLT S64 = LLT::scalar(SizeInBits: 64);
2566
2567 Register Src = MI.getOperand(i: 1).getReg();
2568 assert(MRI.getType(Src) == S64);
2569
2570 // TODO: Should this use extract since the low half is unused?
2571 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2572 Register Hi = Unmerge.getReg(Idx: 1);
2573
2574 // Extract the upper half, since this is where we will find the sign and
2575 // exponent.
2576 auto Exp = extractF64Exponent(Hi, B);
2577
2578 const unsigned FractBits = 52;
2579
2580 // Extract the sign bit.
2581 const auto SignBitMask = B.buildConstant(Res: S32, UINT32_C(1) << 31);
2582 auto SignBit = B.buildAnd(Dst: S32, Src0: Hi, Src1: SignBitMask);
2583
2584 const auto FractMask = B.buildConstant(Res: S64, Val: (UINT64_C(1) << FractBits) - 1);
2585
2586 const auto Zero32 = B.buildConstant(Res: S32, Val: 0);
2587
2588 // Extend back to 64-bits.
2589 auto SignBit64 = B.buildMergeLikeInstr(Res: S64, Ops: {Zero32, SignBit});
2590
2591 auto Shr = B.buildAShr(Dst: S64, Src0: FractMask, Src1: Exp);
2592 auto Not = B.buildNot(Dst: S64, Src0: Shr);
2593 auto Tmp0 = B.buildAnd(Dst: S64, Src0: Src, Src1: Not);
2594 auto FiftyOne = B.buildConstant(Res: S32, Val: FractBits - 1);
2595
2596 auto ExpLt0 = B.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: Exp, Op1: Zero32);
2597 auto ExpGt51 = B.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: Exp, Op1: FiftyOne);
2598
2599 auto Tmp1 = B.buildSelect(Res: S64, Tst: ExpLt0, Op0: SignBit64, Op1: Tmp0);
2600 B.buildSelect(Res: MI.getOperand(i: 0).getReg(), Tst: ExpGt51, Op0: Src, Op1: Tmp1);
2601 MI.eraseFromParent();
2602 return true;
2603}
2604
2605bool AMDGPULegalizerInfo::legalizeITOFP(
2606 MachineInstr &MI, MachineRegisterInfo &MRI,
2607 MachineIRBuilder &B, bool Signed) const {
2608
2609 Register Dst = MI.getOperand(i: 0).getReg();
2610 Register Src = MI.getOperand(i: 1).getReg();
2611
2612 const LLT S64 = LLT::scalar(SizeInBits: 64);
2613 const LLT S32 = LLT::scalar(SizeInBits: 32);
2614
2615 assert(MRI.getType(Src) == S64);
2616
2617 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: Src);
2618 auto ThirtyTwo = B.buildConstant(Res: S32, Val: 32);
2619
2620 if (MRI.getType(Reg: Dst) == S64) {
2621 auto CvtHi = Signed ? B.buildSITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1))
2622 : B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 1));
2623
2624 auto CvtLo = B.buildUITOFP(Dst: S64, Src0: Unmerge.getReg(Idx: 0));
2625 auto LdExp = B.buildFLdexp(Dst: S64, Src0: CvtHi, Src1: ThirtyTwo);
2626
2627 // TODO: Should this propagate fast-math-flags?
2628 B.buildFAdd(Dst, Src0: LdExp, Src1: CvtLo);
2629 MI.eraseFromParent();
2630 return true;
2631 }
2632
2633 assert(MRI.getType(Dst) == S32);
2634
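  // Converting a 64-bit integer to f32: normalize by shifting out leading
  // zero (or redundant sign) bits, OR any remaining low bits into a sticky
  // bit so rounding stays correct, convert the high 32 bits of the shifted
  // value, and scale the result back with ldexp by (32 - shift).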
2635 auto One = B.buildConstant(Res: S32, Val: 1);
2636
2637 MachineInstrBuilder ShAmt;
2638 if (Signed) {
2639 auto ThirtyOne = B.buildConstant(Res: S32, Val: 31);
2640 auto X = B.buildXor(Dst: S32, Src0: Unmerge.getReg(Idx: 0), Src1: Unmerge.getReg(Idx: 1));
2641 auto OppositeSign = B.buildAShr(Dst: S32, Src0: X, Src1: ThirtyOne);
2642 auto MaxShAmt = B.buildAdd(Dst: S32, Src0: ThirtyTwo, Src1: OppositeSign);
2643 auto LS = B.buildIntrinsic(ID: Intrinsic::amdgcn_sffbh, Res: {S32})
2644 .addUse(RegNo: Unmerge.getReg(Idx: 1));
2645 auto LS2 = B.buildSub(Dst: S32, Src0: LS, Src1: One);
2646 ShAmt = B.buildUMin(Dst: S32, Src0: LS2, Src1: MaxShAmt);
2647 } else
2648 ShAmt = B.buildCTLZ(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
2649 auto Norm = B.buildShl(Dst: S64, Src0: Src, Src1: ShAmt);
2650 auto Unmerge2 = B.buildUnmerge(Res: {S32, S32}, Op: Norm);
2651 auto Adjust = B.buildUMin(Dst: S32, Src0: One, Src1: Unmerge2.getReg(Idx: 0));
2652 auto Norm2 = B.buildOr(Dst: S32, Src0: Unmerge2.getReg(Idx: 1), Src1: Adjust);
2653 auto FVal = Signed ? B.buildSITOFP(Dst: S32, Src0: Norm2) : B.buildUITOFP(Dst: S32, Src0: Norm2);
2654 auto Scale = B.buildSub(Dst: S32, Src0: ThirtyTwo, Src1: ShAmt);
2655 B.buildFLdexp(Dst, Src0: FVal, Src1: Scale);
2656 MI.eraseFromParent();
2657 return true;
2658}
2659
2660// TODO: Copied from DAG implementation. Verify logic and document how this
2661// actually works.
2662bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2663 MachineRegisterInfo &MRI,
2664 MachineIRBuilder &B,
2665 bool Signed) const {
2666
2667 Register Dst = MI.getOperand(i: 0).getReg();
2668 Register Src = MI.getOperand(i: 1).getReg();
2669
2670 const LLT S64 = LLT::scalar(SizeInBits: 64);
2671 const LLT S32 = LLT::scalar(SizeInBits: 32);
2672
2673 const LLT SrcLT = MRI.getType(Reg: Src);
2674 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2675
2676 unsigned Flags = MI.getFlags();
2677
2678 // The basic idea of converting a floating point number into a pair of 32-bit
2679 // integers is illustrated as follows:
2680 //
2681 // tf := trunc(val);
2682 // hif := floor(tf * 2^-32);
2683 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2684 // hi := fptoi(hif);
2685 // lo := fptoi(lof);
2686 //
2687 auto Trunc = B.buildIntrinsicTrunc(Dst: SrcLT, Src0: Src, Flags);
2688 MachineInstrBuilder Sign;
2689 if (Signed && SrcLT == S32) {
2690     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2691     // is not enough to hold all the significant bits of `lof` if val is
2692     // negative. To avoid the loss of precision, we need to take the absolute
2693     // value after truncating and flip the result back based on the original
2694     // signedness.
2695 Sign = B.buildAShr(Dst: S32, Src0: Src, Src1: B.buildConstant(Res: S32, Val: 31));
2696 Trunc = B.buildFAbs(Dst: S32, Src0: Trunc, Flags);
2697 }
2698 MachineInstrBuilder K0, K1;
2699 if (SrcLT == S64) {
2700 K0 = B.buildFConstant(
2701 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2702 K1 = B.buildFConstant(
2703 Res: S64, Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2704 } else {
2705 K0 = B.buildFConstant(
2706 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2707 K1 = B.buildFConstant(
2708 Res: S32, Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2709 }
2710
2711 auto Mul = B.buildFMul(Dst: SrcLT, Src0: Trunc, Src1: K0, Flags);
2712 auto FloorMul = B.buildFFloor(Dst: SrcLT, Src0: Mul, Flags);
2713 auto Fma = B.buildFMA(Dst: SrcLT, Src0: FloorMul, Src1: K1, Src2: Trunc, Flags);
2714
2715 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(Dst: S32, Src0: FloorMul)
2716 : B.buildFPTOUI(Dst: S32, Src0: FloorMul);
2717 auto Lo = B.buildFPTOUI(Dst: S32, Src0: Fma);
2718
2719 if (Signed && SrcLT == S32) {
2720 // Flip the result based on the signedness, which is either all 0s or 1s.
2721 Sign = B.buildMergeLikeInstr(Res: S64, Ops: {Sign, Sign});
2722 // r := xor({lo, hi}, sign) - sign;
2723 B.buildSub(Dst, Src0: B.buildXor(Dst: S64, Src0: B.buildMergeLikeInstr(Res: S64, Ops: {Lo, Hi}), Src1: Sign),
2724 Src1: Sign);
2725 } else
2726 B.buildMergeLikeInstr(Res: Dst, Ops: {Lo, Hi});
2727 MI.eraseFromParent();
2728
2729 return true;
2730}
2731
2732bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2733 MachineInstr &MI) const {
2734 MachineFunction &MF = Helper.MIRBuilder.getMF();
2735 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2736
2737 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2738 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2739
2740 // With ieee_mode disabled, the instructions have the correct behavior
2741 // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
2742 //
2743 // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
2744 // enabled.
2745 if (!MFI->getMode().IEEE) {
2746 if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
2747 MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
2748 return true;
2749
2750 return !IsIEEEOp;
2751 }
2752
2753 if (IsIEEEOp)
2754 return true;
2755
2756 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2757}
2758
2759bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2760 MachineInstr &MI, MachineRegisterInfo &MRI,
2761 MachineIRBuilder &B) const {
2762 // TODO: Should move some of this into LegalizerHelper.
2763
2764 // TODO: Promote dynamic indexing of s16 to s32
2765
2766 Register Dst = MI.getOperand(i: 0).getReg();
2767 Register Vec = MI.getOperand(i: 1).getReg();
2768
2769 LLT VecTy = MRI.getType(Reg: Vec);
2770 LLT EltTy = VecTy.getElementType();
2771 assert(EltTy == MRI.getType(Dst));
2772
2773 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2774   // but we can't go directly to that logic because you can't bitcast a vector
2775 // of pointers to a vector of integers. Therefore, introduce an intermediate
2776 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2777 // drive the legalization forward.
2778 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2779 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2780 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2781
2782 auto IntVec = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2783 auto IntElt = B.buildExtractVectorElement(Res: IntTy, Val: IntVec, Idx: MI.getOperand(i: 2));
2784 B.buildIntToPtr(Dst, Src: IntElt);
2785
2786 MI.eraseFromParent();
2787 return true;
2788 }
2789
2790 // FIXME: Artifact combiner probably should have replaced the truncated
2791 // constant before this, so we shouldn't need
2792 // getIConstantVRegValWithLookThrough.
2793 std::optional<ValueAndVReg> MaybeIdxVal =
2794 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 2).getReg(), MRI);
2795 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2796 return true;
2797 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2798
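 // For a constant index that is in range, unmerge the vector and copy out the
 // selected element; an out-of-range constant index folds to undef.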
2799 if (IdxVal < VecTy.getNumElements()) {
2800 auto Unmerge = B.buildUnmerge(Res: EltTy, Op: Vec);
2801 B.buildCopy(Res: Dst, Op: Unmerge.getReg(Idx: IdxVal));
2802 } else {
2803 B.buildUndef(Res: Dst);
2804 }
2805
2806 MI.eraseFromParent();
2807 return true;
2808}
2809
2810bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2811 MachineInstr &MI, MachineRegisterInfo &MRI,
2812 MachineIRBuilder &B) const {
2813 // TODO: Should move some of this into LegalizerHelper.
2814
2815 // TODO: Promote dynamic indexing of s16 to s32
2816
2817 Register Dst = MI.getOperand(i: 0).getReg();
2818 Register Vec = MI.getOperand(i: 1).getReg();
2819 Register Ins = MI.getOperand(i: 2).getReg();
2820
2821 LLT VecTy = MRI.getType(Reg: Vec);
2822 LLT EltTy = VecTy.getElementType();
2823 assert(EltTy == MRI.getType(Ins));
2824
2825 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2826 // but we can't go directly to that logic because you can't bitcast a vector
2827 // of pointers to a vector of integers. Therefore, make the pointer vector
2828 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2829 // new value, and then inttoptr the result vector back. This will then allow
2830 // the rest of legalization to take over.
2831 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2832 LLT IntTy = LLT::scalar(SizeInBits: EltTy.getSizeInBits());
2833 LLT IntVecTy = VecTy.changeElementType(NewEltTy: IntTy);
2834
2835 auto IntVecSource = B.buildPtrToInt(Dst: IntVecTy, Src: Vec);
2836 auto IntIns = B.buildPtrToInt(Dst: IntTy, Src: Ins);
2837 auto IntVecDest = B.buildInsertVectorElement(Res: IntVecTy, Val: IntVecSource, Elt: IntIns,
2838 Idx: MI.getOperand(i: 3));
2839 B.buildIntToPtr(Dst, Src: IntVecDest);
2840 MI.eraseFromParent();
2841 return true;
2842 }
2843
2844 // FIXME: Artifact combiner probably should have replaced the truncated
2845 // constant before this, so we shouldn't need
2846 // getIConstantVRegValWithLookThrough.
2847 std::optional<ValueAndVReg> MaybeIdxVal =
2848 getIConstantVRegValWithLookThrough(VReg: MI.getOperand(i: 3).getReg(), MRI);
2849 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2850 return true;
2851
2852 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2853
2854 unsigned NumElts = VecTy.getNumElements();
2855 if (IdxVal < NumElts) {
2856 SmallVector<Register, 8> SrcRegs;
2857 for (unsigned i = 0; i < NumElts; ++i)
2858 SrcRegs.push_back(Elt: MRI.createGenericVirtualRegister(Ty: EltTy));
2859 B.buildUnmerge(Res: SrcRegs, Op: Vec);
2860
2861 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
2862 B.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
2863 } else {
2864 B.buildUndef(Res: Dst);
2865 }
2866
2867 MI.eraseFromParent();
2868 return true;
2869}
2870
2871bool AMDGPULegalizerInfo::legalizeSinCos(
2872 MachineInstr &MI, MachineRegisterInfo &MRI,
2873 MachineIRBuilder &B) const {
2874
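 // The amdgcn.sin/cos intrinsics take an input that has already been divided
 // by 2*pi, so pre-scale the source by 1/(2*pi). On subtargets where the
 // hardware instruction only accepts a reduced input range, additionally take
 // the fractional part with amdgcn.fract first.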
2875 Register DstReg = MI.getOperand(i: 0).getReg();
2876 Register SrcReg = MI.getOperand(i: 1).getReg();
2877 LLT Ty = MRI.getType(Reg: DstReg);
2878 unsigned Flags = MI.getFlags();
2879
2880 Register TrigVal;
2881 auto OneOver2Pi = B.buildFConstant(Res: Ty, Val: 0.5 * numbers::inv_pi);
2882 if (ST.hasTrigReducedRange()) {
2883 auto MulVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags);
2884 TrigVal = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {Ty})
2885 .addUse(RegNo: MulVal.getReg(Idx: 0))
2886 .setMIFlags(Flags)
2887 .getReg(Idx: 0);
2888 } else
2889 TrigVal = B.buildFMul(Dst: Ty, Src0: SrcReg, Src1: OneOver2Pi, Flags).getReg(Idx: 0);
2890
2891 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2892 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2893 B.buildIntrinsic(ID: TrigIntrin, Res: ArrayRef<Register>(DstReg))
2894 .addUse(RegNo: TrigVal)
2895 .setMIFlags(Flags);
2896 MI.eraseFromParent();
2897 return true;
2898}
2899
2900bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2901 MachineIRBuilder &B,
2902 const GlobalValue *GV,
2903 int64_t Offset,
2904 unsigned GAFlags) const {
2905 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2906 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2907 // to the following code sequence:
2908 //
2909 // For constant address space:
2910 // s_getpc_b64 s[0:1]
2911 // s_add_u32 s0, s0, $symbol
2912 // s_addc_u32 s1, s1, 0
2913 //
2914 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2915 // a fixup or relocation is emitted to replace $symbol with a literal
2916 // constant, which is a pc-relative offset from the encoding of the $symbol
2917 // operand to the global variable.
2918 //
2919 // For global address space:
2920 // s_getpc_b64 s[0:1]
2921 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2922 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2923 //
2924 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2925 // fixups or relocations are emitted to replace $symbol@*@lo and
2926 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2927 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2928 // operand to the global variable.
2929
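 // Note that even for a 32-bit destination pointer, the pc-relative address is
 // computed in a 64-bit register and the low half is extracted at the end.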
2930 LLT ConstPtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
2931
2932 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2933 B.getMRI()->createGenericVirtualRegister(Ty: ConstPtrTy);
2934
2935 MachineInstrBuilder MIB = B.buildInstr(Opcode: AMDGPU::SI_PC_ADD_REL_OFFSET)
2936 .addDef(RegNo: PCReg);
2937
2938 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags);
2939 if (GAFlags == SIInstrInfo::MO_NONE)
2940 MIB.addImm(Val: 0);
2941 else
2942 MIB.addGlobalAddress(GV, Offset, TargetFlags: GAFlags + 1);
2943
2944 if (!B.getMRI()->getRegClassOrNull(Reg: PCReg))
2945 B.getMRI()->setRegClass(Reg: PCReg, RC: &AMDGPU::SReg_64RegClass);
2946
2947 if (PtrTy.getSizeInBits() == 32)
2948 B.buildExtract(Res: DstReg, Src: PCReg, Index: 0);
2949 return true;
2950}
2951
2952// Emit an ABS32_LO / ABS32_HI relocation stub.
2953void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2954 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2955 MachineRegisterInfo &MRI) const {
2956 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2957
2958 LLT S32 = LLT::scalar(SizeInBits: 32);
2959
2960 // Use the destination directly if and only if we only write the lower
2961 // address half and no register class has been set on it.
2962 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(Reg: DstReg)
2963 ? DstReg
2964 : MRI.createGenericVirtualRegister(Ty: S32);
2965
2966 if (!MRI.getRegClassOrNull(Reg: AddrLo))
2967 MRI.setRegClass(Reg: AddrLo, RC: &AMDGPU::SReg_32RegClass);
2968
2969 // Write the lower half.
2970 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2971 .addDef(RegNo: AddrLo)
2972 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_LO);
2973
2974 // If required, write the upper half as well.
2975 if (RequiresHighHalf) {
2976 assert(PtrTy.getSizeInBits() == 64 &&
2977 "Must provide a 64-bit pointer type!");
2978
2979 Register AddrHi = MRI.createGenericVirtualRegister(Ty: S32);
2980 MRI.setRegClass(Reg: AddrHi, RC: &AMDGPU::SReg_32RegClass);
2981
2982 B.buildInstr(Opcode: AMDGPU::S_MOV_B32)
2983 .addDef(RegNo: AddrHi)
2984 .addGlobalAddress(GV, Offset: 0, TargetFlags: SIInstrInfo::MO_ABS32_HI);
2985
2986 // Use the destination directly if and only if no register class has been
2987 // set on it.
2988 Register AddrDst = !MRI.getRegClassOrNull(Reg: DstReg)
2989 ? DstReg
2990 : MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2991
2992 if (!MRI.getRegClassOrNull(Reg: AddrDst))
2993 MRI.setRegClass(Reg: AddrDst, RC: &AMDGPU::SReg_64RegClass);
2994
2995 B.buildMergeValues(Res: AddrDst, Ops: {AddrLo, AddrHi});
2996
2997 // If we created a new register for the destination, cast the result into
2998 // the final output.
2999 if (AddrDst != DstReg)
3000 B.buildCast(Dst: DstReg, Src: AddrDst);
3001 } else if (AddrLo != DstReg) {
3002 // If we created a new register for the destination, cast the result into
3003 // the final output.
3004 B.buildCast(Dst: DstReg, Src: AddrLo);
3005 }
3006}
3007
3008bool AMDGPULegalizerInfo::legalizeGlobalValue(
3009 MachineInstr &MI, MachineRegisterInfo &MRI,
3010 MachineIRBuilder &B) const {
3011 Register DstReg = MI.getOperand(i: 0).getReg();
3012 LLT Ty = MRI.getType(Reg: DstReg);
3013 unsigned AS = Ty.getAddressSpace();
3014
3015 const GlobalValue *GV = MI.getOperand(i: 1).getGlobal();
3016 MachineFunction &MF = B.getMF();
3017 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3018
3019 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
3020 if (!MFI->isModuleEntryFunction() &&
3021 GV->getName() != "llvm.amdgcn.module.lds" &&
3022 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
3023 const Function &Fn = MF.getFunction();
3024 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
3025 Fn, "local memory global used by non-kernel function",
3026 MI.getDebugLoc(), DS_Warning));
3027
3028 // We currently don't have a way to correctly allocate LDS objects that
3029 // aren't directly associated with a kernel. We do force inlining of
3030 // functions that use local objects. However, if these dead functions are
3031 // not eliminated, we don't want a compile time error. Just emit a warning
3032 // and a trap, since there should be no callable path here.
3033 B.buildTrap();
3034 B.buildUndef(Res: DstReg);
3035 MI.eraseFromParent();
3036 return true;
3037 }
3038
3039 // TODO: We could emit code to handle the initialization somewhere.
3040 // We ignore the initializer for now and legalize it to allow selection.
3041 // The initializer will be rejected during assembly emission anyway.
3042 const SITargetLowering *TLI = ST.getTargetLowering();
3043 if (!TLI->shouldUseLDSConstAddress(GV)) {
3044 MI.getOperand(i: 1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3045 return true; // Leave in place;
3046 }
3047
3048 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3049 Type *Ty = GV->getValueType();
3050 // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
3051 // zero-sized type in other languages) to declare dynamic shared memory
3052 // whose size is not known at compile time. Such variables are allocated
3053 // by the runtime and placed directly after the statically allocated
3054 // ones, so they all share the same offset.
3055 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3056 // Adjust alignment for that dynamic shared memory array.
3057 MFI->setDynLDSAlign(F: MF.getFunction(), GV: *cast<GlobalVariable>(Val: GV));
3058 LLT S32 = LLT::scalar(SizeInBits: 32);
3059 auto Sz = B.buildIntrinsic(ID: Intrinsic::amdgcn_groupstaticsize, Res: {S32});
3060 B.buildIntToPtr(Dst: DstReg, Src: Sz);
3061 MI.eraseFromParent();
3062 return true;
3063 }
3064 }
3065
3066 B.buildConstant(Res: DstReg, Val: MFI->allocateLDSGlobal(DL: B.getDataLayout(),
3067 GV: *cast<GlobalVariable>(Val: GV)));
3068 MI.eraseFromParent();
3069 return true;
3070 }
3071
3072 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3073 buildAbsGlobalAddress(DstReg, PtrTy: Ty, B, GV, MRI);
3074 MI.eraseFromParent();
3075 return true;
3076 }
3077
3078 const SITargetLowering *TLI = ST.getTargetLowering();
3079
3080 if (TLI->shouldEmitFixup(GV)) {
3081 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0);
3082 MI.eraseFromParent();
3083 return true;
3084 }
3085
3086 if (TLI->shouldEmitPCReloc(GV)) {
3087 buildPCRelGlobalAddress(DstReg, PtrTy: Ty, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_REL32);
3088 MI.eraseFromParent();
3089 return true;
3090 }
3091
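 // Otherwise the address has to be loaded from the GOT: build a pc-relative
 // pointer to the GOT entry and load the 64-bit address from it as an
 // invariant, dereferenceable load.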
3092 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3093 Register GOTAddr = MRI.createGenericVirtualRegister(Ty: PtrTy);
3094
3095 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3096 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3097 PtrInfo: MachinePointerInfo::getGOT(MF),
3098 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3099 MachineMemOperand::MOInvariant,
3100 MemTy: LoadTy, base_alignment: Align(8));
3101
3102 buildPCRelGlobalAddress(DstReg: GOTAddr, PtrTy, B, GV, Offset: 0, GAFlags: SIInstrInfo::MO_GOTPCREL32);
3103
3104 if (Ty.getSizeInBits() == 32) {
3105 // Truncate if this is a 32-bit constant address.
3106 auto Load = B.buildLoad(Res: PtrTy, Addr: GOTAddr, MMO&: *GOTMMO);
3107 B.buildExtract(Res: DstReg, Src: Load, Index: 0);
3108 } else
3109 B.buildLoad(Res: DstReg, Addr: GOTAddr, MMO&: *GOTMMO);
3110
3111 MI.eraseFromParent();
3112 return true;
3113}
3114
3115static LLT widenToNextPowerOf2(LLT Ty) {
3116 if (Ty.isVector())
3117 return Ty.changeElementCount(
3118 EC: ElementCount::getFixed(MinVal: PowerOf2Ceil(A: Ty.getNumElements())));
3119 return LLT::scalar(SizeInBits: PowerOf2Ceil(A: Ty.getSizeInBits()));
3120}
3121
3122bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3123 MachineInstr &MI) const {
3124 MachineIRBuilder &B = Helper.MIRBuilder;
3125 MachineRegisterInfo &MRI = *B.getMRI();
3126 GISelChangeObserver &Observer = Helper.Observer;
3127
3128 Register PtrReg = MI.getOperand(i: 1).getReg();
3129 LLT PtrTy = MRI.getType(Reg: PtrReg);
3130 unsigned AddrSpace = PtrTy.getAddressSpace();
3131
3132 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3133 LLT ConstPtr = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
3134 auto Cast = B.buildAddrSpaceCast(Dst: ConstPtr, Src: PtrReg);
3135 Observer.changingInstr(MI);
3136 MI.getOperand(i: 1).setReg(Cast.getReg(Idx: 0));
3137 Observer.changedInstr(MI);
3138 return true;
3139 }
3140
3141 if (MI.getOpcode() != AMDGPU::G_LOAD)
3142 return false;
3143
3144 Register ValReg = MI.getOperand(i: 0).getReg();
3145 LLT ValTy = MRI.getType(Reg: ValReg);
3146
3147 if (hasBufferRsrcWorkaround(Ty: ValTy)) {
3148 Observer.changingInstr(MI);
3149 castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
3150 Observer.changedInstr(MI);
3151 return true;
3152 }
3153
3154 MachineMemOperand *MMO = *MI.memoperands_begin();
3155 const unsigned ValSize = ValTy.getSizeInBits();
3156 const LLT MemTy = MMO->getMemoryType();
3157 const Align MemAlign = MMO->getAlign();
3158 const unsigned MemSize = MemTy.getSizeInBits();
3159 const uint64_t AlignInBits = 8 * MemAlign.value();
3160
3161 // Widen non-power-of-2 loads to the alignment if needed
3162 if (shouldWidenLoad(ST, MemoryTy: MemTy, AlignInBits, AddrSpace, Opcode: MI.getOpcode())) {
3163 const unsigned WideMemSize = PowerOf2Ceil(A: MemSize);
3164
3165 // This was already the correct extending load result type, so just adjust
3166 // the memory type.
3167 if (WideMemSize == ValSize) {
3168 MachineFunction &MF = B.getMF();
3169
3170 MachineMemOperand *WideMMO =
3171 MF.getMachineMemOperand(MMO, Offset: 0, Size: WideMemSize / 8);
3172 Observer.changingInstr(MI);
3173 MI.setMemRefs(MF, MemRefs: {WideMMO});
3174 Observer.changedInstr(MI);
3175 return true;
3176 }
3177
3178 // Don't bother handling an edge case that should probably never be produced.
3179 if (ValSize > WideMemSize)
3180 return false;
3181
3182 LLT WideTy = widenToNextPowerOf2(Ty: ValTy);
3183
3184 Register WideLoad;
3185 if (!WideTy.isVector()) {
3186 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3187 B.buildTrunc(Res: ValReg, Op: WideLoad).getReg(Idx: 0);
3188 } else {
3189 // Extract the subvector.
3190
3191 if (isRegisterType(ST, Ty: ValTy)) {
3192 // If this a case where G_EXTRACT is legal, use it.
3193 // (e.g. <3 x s32> -> <4 x s32>)
3194 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3195 B.buildExtract(Res: ValReg, Src: WideLoad, Index: 0);
3196 } else {
3197 // For cases where the widened type isn't a nice register value, unmerge
3198 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3199 WideLoad = B.buildLoadFromOffset(Dst: WideTy, BasePtr: PtrReg, BaseMMO&: *MMO, Offset: 0).getReg(Idx: 0);
3200 B.buildDeleteTrailingVectorElements(Res: ValReg, Op0: WideLoad);
3201 }
3202 }
3203
3204 MI.eraseFromParent();
3205 return true;
3206 }
3207
3208 return false;
3209}
3210
3211bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3212 MachineInstr &MI) const {
3213 MachineIRBuilder &B = Helper.MIRBuilder;
3214 MachineRegisterInfo &MRI = *B.getMRI();
3215 GISelChangeObserver &Observer = Helper.Observer;
3216
3217 Register DataReg = MI.getOperand(i: 0).getReg();
3218 LLT DataTy = MRI.getType(Reg: DataReg);
3219
3220 if (hasBufferRsrcWorkaround(Ty: DataTy)) {
3221 Observer.changingInstr(MI);
3222 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
3223 Observer.changedInstr(MI);
3224 return true;
3225 }
3226 return false;
3227}
3228
3229bool AMDGPULegalizerInfo::legalizeFMad(
3230 MachineInstr &MI, MachineRegisterInfo &MRI,
3231 MachineIRBuilder &B) const {
3232 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3233 assert(Ty.isScalar());
3234
3235 MachineFunction &MF = B.getMF();
3236 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3237
3238 // TODO: Always legal with future ftz flag.
3239 // FIXME: Do we need just output?
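 // G_FMAD is only kept legal when denormals for the result type are flushed
 // (preserve-sign); otherwise fall back to the generic lowering, which
 // expands it to fmul + fadd.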
3240 if (Ty == LLT::float32() &&
3241 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3242 return true;
3243 if (Ty == LLT::float16() &&
3244 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3245 return true;
3246
3247 MachineIRBuilder HelperBuilder(MI);
3248 GISelObserverWrapper DummyObserver;
3249 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3250 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3251}
3252
3253bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3254 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3255 Register DstReg = MI.getOperand(i: 0).getReg();
3256 Register PtrReg = MI.getOperand(i: 1).getReg();
3257 Register CmpVal = MI.getOperand(i: 2).getReg();
3258 Register NewVal = MI.getOperand(i: 3).getReg();
3259
3260 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3261 "this should not have been custom lowered");
3262
3263 LLT ValTy = MRI.getType(Reg: CmpVal);
3264 LLT VecTy = LLT::fixed_vector(NumElements: 2, ScalarTy: ValTy);
3265
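 // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
 // into a single vector data operand, in {new, cmp} order.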
3266 Register PackedVal = B.buildBuildVector(Res: VecTy, Ops: { NewVal, CmpVal }).getReg(Idx: 0);
3267
3268 B.buildInstr(Opcode: AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3269 .addDef(RegNo: DstReg)
3270 .addUse(RegNo: PtrReg)
3271 .addUse(RegNo: PackedVal)
3272 .setMemRefs(MI.memoperands());
3273
3274 MI.eraseFromParent();
3275 return true;
3276}
3277
3278/// Return true if it's known that \p Src can never be an f32 denormal value.
3279static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3280 Register Src) {
3281 const MachineInstr *DefMI = MRI.getVRegDef(Reg: Src);
3282 switch (DefMI->getOpcode()) {
3283 case TargetOpcode::G_INTRINSIC: {
3284 switch (cast<GIntrinsic>(Val: DefMI)->getIntrinsicID()) {
3285 case Intrinsic::amdgcn_frexp_mant:
3286 return true;
3287 default:
3288 break;
3289 }
3290
3291 break;
3292 }
3293 case TargetOpcode::G_FFREXP: {
3294 if (DefMI->getOperand(i: 0).getReg() == Src)
3295 return true;
3296 break;
3297 }
3298 case TargetOpcode::G_FPEXT: {
3299 return MRI.getType(Reg: DefMI->getOperand(i: 1).getReg()) == LLT::scalar(SizeInBits: 16);
3300 }
3301 default:
3302 return false;
3303 }
3304
3305 return false;
3306}
3307
3308static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3309 if (Flags & MachineInstr::FmAfn)
3310 return true;
3311 const auto &Options = MF.getTarget().Options;
3312 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3313}
3314
3315static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3316 unsigned Flags) {
3317 return !valueIsKnownNeverF32Denorm(MRI: MF.getRegInfo(), Src) &&
3318 MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Input !=
3319 DenormalMode::PreserveSign;
3320}
3321
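/// If the f32 input may be a denormal, multiply it by 2^32 so that the
/// amdgcn.log expansion sees a normal value. Returns the scaled input and an
/// i1 that is true when scaling was applied; returns empty registers when no
/// denormal handling is needed.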
3322std::pair<Register, Register>
3323AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3324 unsigned Flags) const {
3325 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags))
3326 return {};
3327
3328 const LLT F32 = LLT::scalar(SizeInBits: 32);
3329 auto SmallestNormal = B.buildFConstant(
3330 Res: F32, Val: APFloat::getSmallestNormalized(Sem: APFloat::IEEEsingle()));
3331 auto IsLtSmallestNormal =
3332 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src, Op1: SmallestNormal);
3333
3334 auto Scale32 = B.buildFConstant(Res: F32, Val: 0x1.0p+32);
3335 auto One = B.buildFConstant(Res: F32, Val: 1.0);
3336 auto ScaleFactor =
3337 B.buildSelect(Res: F32, Tst: IsLtSmallestNormal, Op0: Scale32, Op1: One, Flags);
3338 auto ScaledInput = B.buildFMul(Dst: F32, Src0: Src, Src1: ScaleFactor, Flags);
3339
3340 return {ScaledInput.getReg(Idx: 0), IsLtSmallestNormal.getReg(Idx: 0)};
3341}
3342
3343bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3344 MachineIRBuilder &B) const {
3345 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3346 // If we have to handle denormals, scale up the input and adjust the result.
3347
3348 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3349 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
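 // Since log2(x * 2^32) == log2(x) + 32, subtracting 32 when the input was
 // scaled recovers log2(x).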
3350
3351 Register Dst = MI.getOperand(i: 0).getReg();
3352 Register Src = MI.getOperand(i: 1).getReg();
3353 LLT Ty = B.getMRI()->getType(Reg: Dst);
3354 unsigned Flags = MI.getFlags();
3355
3356 if (Ty == LLT::scalar(SizeInBits: 16)) {
3357 const LLT F32 = LLT::scalar(SizeInBits: 32);
3358 // Nothing in half is a denormal when promoted to f32.
3359 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3360 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {F32})
3361 .addUse(RegNo: Ext.getReg(Idx: 0))
3362 .setMIFlags(Flags);
3363 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3364 MI.eraseFromParent();
3365 return true;
3366 }
3367
3368 assert(Ty == LLT::scalar(32));
3369
3370 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3371 if (!ScaledInput) {
3372 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {MI.getOperand(i: 0)})
3373 .addUse(RegNo: Src)
3374 .setMIFlags(Flags);
3375 MI.eraseFromParent();
3376 return true;
3377 }
3378
3379 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3380 .addUse(RegNo: ScaledInput)
3381 .setMIFlags(Flags);
3382
3383 auto ThirtyTwo = B.buildFConstant(Res: Ty, Val: 32.0);
3384 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3385 auto ResultOffset =
3386 B.buildSelect(Res: Ty, Tst: IsLtSmallestNormal, Op0: ThirtyTwo, Op1: Zero, Flags);
3387 B.buildFSub(Dst, Src0: Log2, Src1: ResultOffset, Flags);
3388
3389 MI.eraseFromParent();
3390 return true;
3391}
3392
3393static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3394 Register Z, unsigned Flags) {
3395 auto FMul = B.buildFMul(Dst: Ty, Src0: X, Src1: Y, Flags);
3396 return B.buildFAdd(Dst: Ty, Src0: FMul, Src1: Z, Flags).getReg(Idx: 0);
3397}
3398
3399bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3400 MachineIRBuilder &B) const {
3401 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3402 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3403
3404 MachineRegisterInfo &MRI = *B.getMRI();
3405 Register Dst = MI.getOperand(i: 0).getReg();
3406 Register X = MI.getOperand(i: 1).getReg();
3407 unsigned Flags = MI.getFlags();
3408 const LLT Ty = MRI.getType(Reg: X);
3409 MachineFunction &MF = B.getMF();
3410
3411 const LLT F32 = LLT::scalar(SizeInBits: 32);
3412 const LLT F16 = LLT::scalar(SizeInBits: 16);
3413
3414 const AMDGPUTargetMachine &TM =
3415 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3416
3417 if (Ty == F16 || MI.getFlag(Flag: MachineInstr::FmAfn) ||
3418 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3419 if (Ty == F16 && !ST.has16BitInsts()) {
3420 Register LogVal = MRI.createGenericVirtualRegister(Ty: F32);
3421 auto PromoteSrc = B.buildFPExt(Res: F32, Op: X);
3422 legalizeFlogUnsafe(B, Dst: LogVal, Src: PromoteSrc.getReg(Idx: 0), IsLog10, Flags);
3423 B.buildFPTrunc(Res: Dst, Op: LogVal);
3424 } else {
3425 legalizeFlogUnsafe(B, Dst, Src: X, IsLog10, Flags);
3426 }
3427
3428 MI.eraseFromParent();
3429 return true;
3430 }
3431
3432 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src: X, Flags);
3433 if (ScaledInput)
3434 X = ScaledInput;
3435
3436 auto Y =
3437 B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty}).addUse(RegNo: X).setMIFlags(Flags);
3438
3439 Register R;
3440 if (ST.hasFastFMAF32()) {
3441 // c+cc are ln(2)/ln(10) to more than 49 bits
3442 const float c_log10 = 0x1.344134p-2f;
3443 const float cc_log10 = 0x1.09f79ep-26f;
3444
3445 // c + cc is ln(2) to more than 49 bits
3446 const float c_log = 0x1.62e42ep-1f;
3447 const float cc_log = 0x1.efa39ep-25f;
3448
3449 auto C = B.buildFConstant(Res: Ty, Val: IsLog10 ? c_log10 : c_log);
3450 auto CC = B.buildFConstant(Res: Ty, Val: IsLog10 ? cc_log10 : cc_log);
3451
3452 R = B.buildFMul(Dst: Ty, Src0: Y, Src1: C, Flags).getReg(Idx: 0);
3453 auto NegR = B.buildFNeg(Dst: Ty, Src0: R, Flags);
3454 auto FMA0 = B.buildFMA(Dst: Ty, Src0: Y, Src1: C, Src2: NegR, Flags);
3455 auto FMA1 = B.buildFMA(Dst: Ty, Src0: Y, Src1: CC, Src2: FMA0, Flags);
3456 R = B.buildFAdd(Dst: Ty, Src0: R, Src1: FMA1, Flags).getReg(Idx: 0);
3457 } else {
3458 // ch+ct is ln(2)/ln(10) to more than 36 bits
3459 const float ch_log10 = 0x1.344000p-2f;
3460 const float ct_log10 = 0x1.3509f6p-18f;
3461
3462 // ch + ct is ln(2) to more than 36 bits
3463 const float ch_log = 0x1.62e000p-1f;
3464 const float ct_log = 0x1.0bfbe8p-15f;
3465
3466 auto CH = B.buildFConstant(Res: Ty, Val: IsLog10 ? ch_log10 : ch_log);
3467 auto CT = B.buildFConstant(Res: Ty, Val: IsLog10 ? ct_log10 : ct_log);
3468
3469 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3470 auto YH = B.buildAnd(Dst: Ty, Src0: Y, Src1: MaskConst);
3471 auto YT = B.buildFSub(Dst: Ty, Src0: Y, Src1: YH, Flags);
3472 auto YTCT = B.buildFMul(Dst: Ty, Src0: YT, Src1: CT, Flags);
3473
3474 Register Mad0 =
3475 getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CT.getReg(Idx: 0), Z: YTCT.getReg(Idx: 0), Flags);
3476 Register Mad1 = getMad(B, Ty, X: YT.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad0, Flags);
3477 R = getMad(B, Ty, X: YH.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: Mad1, Flags);
3478 }
3479
3480 const bool IsFiniteOnly =
3481 (MI.getFlag(Flag: MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3482 (MI.getFlag(Flag: MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3483
3484 if (!IsFiniteOnly) {
3485 // Expand isfinite(x) => fabs(x) < inf
3486 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3487 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Y);
3488 auto IsFinite =
3489 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
3490 R = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: R, Op1: Y, Flags).getReg(Idx: 0);
3491 }
3492
3493 if (ScaledInput) {
3494 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3495 auto ShiftK =
3496 B.buildFConstant(Res: Ty, Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3497 auto Shift = B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ShiftK, Op1: Zero, Flags);
3498 B.buildFSub(Dst, Src0: R, Src1: Shift, Flags);
3499 } else {
3500 B.buildCopy(Res: Dst, Op: R);
3501 }
3502
3503 MI.eraseFromParent();
3504 return true;
3505}
3506
3507bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register Src, bool IsLog10,
3509 unsigned Flags) const {
3510 const double Log2BaseInverted =
3511 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3512
3513 LLT Ty = B.getMRI()->getType(Reg: Dst);
3514
3515 if (Ty == LLT::scalar(SizeInBits: 32)) {
3516 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3517 if (ScaledInput) {
3518 auto LogSrc = B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3519 .addUse(RegNo: Src)
3520 .setMIFlags(Flags);
3521 auto ScaledResultOffset = B.buildFConstant(Res: Ty, Val: -32.0 * Log2BaseInverted);
3522 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3523 auto ResultOffset =
3524 B.buildSelect(Res: Ty, Tst: IsScaled, Op0: ScaledResultOffset, Op1: Zero, Flags);
3525 auto Log2Inv = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3526
3527 if (ST.hasFastFMAF32())
3528 B.buildFMA(Dst, Src0: LogSrc, Src1: Log2Inv, Src2: ResultOffset, Flags);
3529 else {
3530 auto Mul = B.buildFMul(Dst: Ty, Src0: LogSrc, Src1: Log2Inv, Flags);
3531 B.buildFAdd(Dst, Src0: Mul, Src1: ResultOffset, Flags);
3532 }
3533
3534 return true;
3535 }
3536 }
3537
3538 auto Log2Operand = Ty == LLT::scalar(SizeInBits: 16)
3539 ? B.buildFLog2(Dst: Ty, Src, Flags)
3540 : B.buildIntrinsic(ID: Intrinsic::amdgcn_log, Res: {Ty})
3541 .addUse(RegNo: Src)
3542 .setMIFlags(Flags);
3543 auto Log2BaseInvertedOperand = B.buildFConstant(Res: Ty, Val: Log2BaseInverted);
3544 B.buildFMul(Dst, Src0: Log2Operand, Src1: Log2BaseInvertedOperand, Flags);
3545 return true;
3546}
3547
3548bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3549 MachineIRBuilder &B) const {
3550 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3551 // If we have to handle denormals, scale up the input and adjust the result.
3552
3553 Register Dst = MI.getOperand(i: 0).getReg();
3554 Register Src = MI.getOperand(i: 1).getReg();
3555 unsigned Flags = MI.getFlags();
3556 LLT Ty = B.getMRI()->getType(Reg: Dst);
3557 const LLT F16 = LLT::scalar(SizeInBits: 16);
3558 const LLT F32 = LLT::scalar(SizeInBits: 32);
3559
3560 if (Ty == F16) {
3561 // Nothing in half is a denormal when promoted to f32.
3562 auto Ext = B.buildFPExt(Res: F32, Op: Src, Flags);
3563 auto Log2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {F32})
3564 .addUse(RegNo: Ext.getReg(Idx: 0))
3565 .setMIFlags(Flags);
3566 B.buildFPTrunc(Res: Dst, Op: Log2, Flags);
3567 MI.eraseFromParent();
3568 return true;
3569 }
3570
3571 assert(Ty == F32);
3572
3573 if (!needsDenormHandlingF32(MF: B.getMF(), Src, Flags)) {
3574 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3575 .addUse(RegNo: Src)
3576 .setMIFlags(Flags);
3577 MI.eraseFromParent();
3578 return true;
3579 }
3580
3581 // bool needs_scaling = x < -0x1.f80000p+6f;
3582 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
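 // This works because exp2(x + 64) == exp2(x) * 2^64, so the trailing multiply
 // by 2^-64 undoes the bias that keeps the intermediate result in range.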
3583
3584 // -nextafter(128.0, -1)
3585 auto RangeCheckConst = B.buildFConstant(Res: Ty, Val: -0x1.f80000p+6f);
3586 auto NeedsScaling = B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Src,
3587 Op1: RangeCheckConst, Flags);
3588
3589 auto SixtyFour = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3590 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3591 auto AddOffset = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: SixtyFour, Op1: Zero, Flags);
3592 auto AddInput = B.buildFAdd(Dst: F32, Src0: Src, Src1: AddOffset, Flags);
3593
3594 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3595 .addUse(RegNo: AddInput.getReg(Idx: 0))
3596 .setMIFlags(Flags);
3597
3598 auto TwoExpNeg64 = B.buildFConstant(Res: Ty, Val: 0x1.0p-64f);
3599 auto One = B.buildFConstant(Res: Ty, Val: 1.0);
3600 auto ResultScale = B.buildSelect(Res: F32, Tst: NeedsScaling, Op0: TwoExpNeg64, Op1: One, Flags);
3601 B.buildFMul(Dst, Src0: Exp2, Src1: ResultScale, Flags);
3602 MI.eraseFromParent();
3603 return true;
3604}
3605
3606bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3607 Register X, unsigned Flags) const {
3608 LLT Ty = B.getMRI()->getType(Reg: Dst);
3609 LLT F32 = LLT::scalar(SizeInBits: 32);
3610
3611 if (Ty != F32 || !needsDenormHandlingF32(MF: B.getMF(), Src: X, Flags)) {
3612 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3613 auto Mul = B.buildFMul(Dst: Ty, Src0: X, Src1: Log2E, Flags);
3614
3615 if (Ty == F32) {
3616 B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: ArrayRef<Register>{Dst})
3617 .addUse(RegNo: Mul.getReg(Idx: 0))
3618 .setMIFlags(Flags);
3619 } else {
3620 B.buildFExp2(Dst, Src: Mul.getReg(Idx: 0), Flags);
3621 }
3622
3623 return true;
3624 }
3625
3626 auto Threshold = B.buildFConstant(Res: Ty, Val: -0x1.5d58a0p+6f);
3627 auto NeedsScaling =
3628 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: Threshold, Flags);
3629 auto ScaleOffset = B.buildFConstant(Res: Ty, Val: 0x1.0p+6f);
3630 auto ScaledX = B.buildFAdd(Dst: Ty, Src0: X, Src1: ScaleOffset, Flags);
3631 auto AdjustedX = B.buildSelect(Res: Ty, Tst: NeedsScaling, Op0: ScaledX, Op1: X, Flags);
3632
3633 auto Log2E = B.buildFConstant(Res: Ty, Val: numbers::log2e);
3634 auto ExpInput = B.buildFMul(Dst: Ty, Src0: AdjustedX, Src1: Log2E, Flags);
3635
3636 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3637 .addUse(RegNo: ExpInput.getReg(Idx: 0))
3638 .setMIFlags(Flags);
3639
3640 auto ResultScaleFactor = B.buildFConstant(Res: Ty, Val: 0x1.969d48p-93f);
3641 auto AdjustedResult = B.buildFMul(Dst: Ty, Src0: Exp2, Src1: ResultScaleFactor, Flags);
3642 B.buildSelect(Res: Dst, Tst: NeedsScaling, Op0: AdjustedResult, Op1: Exp2, Flags);
3643 return true;
3644}
3645
3646bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3647 MachineIRBuilder &B) const {
3648 Register Dst = MI.getOperand(i: 0).getReg();
3649 Register X = MI.getOperand(i: 1).getReg();
3650 const unsigned Flags = MI.getFlags();
3651 MachineFunction &MF = B.getMF();
3652 MachineRegisterInfo &MRI = *B.getMRI();
3653 LLT Ty = MRI.getType(Reg: Dst);
3654 const LLT F16 = LLT::scalar(SizeInBits: 16);
3655 const LLT F32 = LLT::scalar(SizeInBits: 32);
3656 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3657
3658 if (Ty == F16) {
3659 // v_exp_f16 (fmul x, log2e)
3660 if (allowApproxFunc(MF, Flags)) {
3661 // TODO: Does this really require fast?
3662 legalizeFExpUnsafe(B, Dst, X, Flags);
3663 MI.eraseFromParent();
3664 return true;
3665 }
3666
3667 // exp(f16 x) ->
3668 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3669
3670 // Nothing in half is a denormal when promoted to f32.
3671 auto Ext = B.buildFPExt(Res: F32, Op: X, Flags);
3672 Register Lowered = MRI.createGenericVirtualRegister(Ty: F32);
3673 legalizeFExpUnsafe(B, Dst: Lowered, X: Ext.getReg(Idx: 0), Flags);
3674 B.buildFPTrunc(Res: Dst, Op: Lowered, Flags);
3675 MI.eraseFromParent();
3676 return true;
3677 }
3678
3679 assert(Ty == F32);
3680
3681 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3682 // library behavior. Also, is known-not-daz source sufficient?
3683 if (allowApproxFunc(MF, Flags)) {
3684 legalizeFExpUnsafe(B, Dst, X, Flags);
3685 MI.eraseFromParent();
3686 return true;
3687 }
3688
3689 // Algorithm:
3690 //
3691 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3692 //
3693 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3694 // n = 64*m + j, 0 <= j < 64
3695 //
3696 // e^x = 2^((64*m + j + f)/64)
3697 // = (2^m) * (2^(j/64)) * 2^(f/64)
3698 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3699 //
3700 // f = x*(64/ln(2)) - n
3701 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3702 //
3703 // e^x = (2^m) * (2^(j/64)) * e^r
3704 //
3705 // (2^(j/64)) is precomputed
3706 //
3707 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3708 // e^r = 1 + q
3709 //
3710 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3711 //
3712 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
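 // In the expansion below, PH + PL is a double-single approximation of
 // x * log2(e) (or x * log2(10) for G_FEXP10), E = roundeven(PH) is its
 // integer part, exp2 is evaluated on the remaining fraction, and the final
 // ldexp applies the 2^E scale.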
3713 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3714 Register PH, PL;
3715
3716 if (ST.hasFastFMAF32()) {
3717 const float c_exp = numbers::log2ef;
3718 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3719 const float c_exp10 = 0x1.a934f0p+1f;
3720 const float cc_exp10 = 0x1.2f346ep-24f;
3721
3722 auto C = B.buildFConstant(Res: Ty, Val: IsExp10 ? c_exp10 : c_exp);
3723 PH = B.buildFMul(Dst: Ty, Src0: X, Src1: C, Flags).getReg(Idx: 0);
3724 auto NegPH = B.buildFNeg(Dst: Ty, Src0: PH, Flags);
3725 auto FMA0 = B.buildFMA(Dst: Ty, Src0: X, Src1: C, Src2: NegPH, Flags);
3726
3727 auto CC = B.buildFConstant(Res: Ty, Val: IsExp10 ? cc_exp10 : cc_exp);
3728 PL = B.buildFMA(Dst: Ty, Src0: X, Src1: CC, Src2: FMA0, Flags).getReg(Idx: 0);
3729 } else {
3730 const float ch_exp = 0x1.714000p+0f;
3731 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3732
3733 const float ch_exp10 = 0x1.a92000p+1f;
3734 const float cl_exp10 = 0x1.4f0978p-11f;
3735
3736 auto MaskConst = B.buildConstant(Res: Ty, Val: 0xfffff000);
3737 auto XH = B.buildAnd(Dst: Ty, Src0: X, Src1: MaskConst);
3738 auto XL = B.buildFSub(Dst: Ty, Src0: X, Src1: XH, Flags);
3739
3740 auto CH = B.buildFConstant(Res: Ty, Val: IsExp10 ? ch_exp10 : ch_exp);
3741 PH = B.buildFMul(Dst: Ty, Src0: XH, Src1: CH, Flags).getReg(Idx: 0);
3742
3743 auto CL = B.buildFConstant(Res: Ty, Val: IsExp10 ? cl_exp10 : cl_exp);
3744 auto XLCL = B.buildFMul(Dst: Ty, Src0: XL, Src1: CL, Flags);
3745
3746 Register Mad0 =
3747 getMad(B, Ty, X: XL.getReg(Idx: 0), Y: CH.getReg(Idx: 0), Z: XLCL.getReg(Idx: 0), Flags);
3748 PL = getMad(B, Ty, X: XH.getReg(Idx: 0), Y: CL.getReg(Idx: 0), Z: Mad0, Flags);
3749 }
3750
3751 auto E = B.buildIntrinsicRoundeven(Dst: Ty, Src0: PH, Flags);
3752
3753 // It is unsafe to contract this fsub into the PH multiply.
3754 auto PHSubE = B.buildFSub(Dst: Ty, Src0: PH, Src1: E, Flags: FlagsNoContract);
3755 auto A = B.buildFAdd(Dst: Ty, Src0: PHSubE, Src1: PL, Flags);
3756 auto IntE = B.buildFPTOSI(Dst: LLT::scalar(SizeInBits: 32), Src0: E);
3757
3758 auto Exp2 = B.buildIntrinsic(ID: Intrinsic::amdgcn_exp2, Res: {Ty})
3759 .addUse(RegNo: A.getReg(Idx: 0))
3760 .setMIFlags(Flags);
3761 auto R = B.buildFLdexp(Dst: Ty, Src0: Exp2, Src1: IntE, Flags);
3762
3763 auto UnderflowCheckConst =
3764 B.buildFConstant(Res: Ty, Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3765 auto Zero = B.buildFConstant(Res: Ty, Val: 0.0);
3766 auto Underflow =
3767 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: UnderflowCheckConst);
3768
3769 R = B.buildSelect(Res: Ty, Tst: Underflow, Op0: Zero, Op1: R);
3770
3771 const auto &Options = MF.getTarget().Options;
3772
3773 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3774 auto OverflowCheckConst =
3775 B.buildFConstant(Res: Ty, Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3776
3777 auto Overflow =
3778 B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: LLT::scalar(SizeInBits: 1), Op0: X, Op1: OverflowCheckConst);
3779 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: APFloat::IEEEsingle()));
3780 R = B.buildSelect(Res: Ty, Tst: Overflow, Op0: Inf, Op1: R, Flags);
3781 }
3782
3783 B.buildCopy(Res: Dst, Op: R);
3784 MI.eraseFromParent();
3785 return true;
3786}
3787
3788bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3789 MachineIRBuilder &B) const {
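 // Expand pow(x, y) as exp2(y * log2(x)). The multiply uses
 // amdgcn.fmul.legacy, which treats 0 * anything (including inf and nan) as 0.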
3790 Register Dst = MI.getOperand(i: 0).getReg();
3791 Register Src0 = MI.getOperand(i: 1).getReg();
3792 Register Src1 = MI.getOperand(i: 2).getReg();
3793 unsigned Flags = MI.getFlags();
3794 LLT Ty = B.getMRI()->getType(Reg: Dst);
3795 const LLT F16 = LLT::float16();
3796 const LLT F32 = LLT::float32();
3797
3798 if (Ty == F32) {
3799 auto Log = B.buildFLog2(Dst: F32, Src: Src0, Flags);
3800 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3801 .addUse(RegNo: Log.getReg(Idx: 0))
3802 .addUse(RegNo: Src1)
3803 .setMIFlags(Flags);
3804 B.buildFExp2(Dst, Src: Mul, Flags);
3805 } else if (Ty == F16) {
3806 // There's no f16 fmul_legacy, so we need to convert for it.
3807 auto Log = B.buildFLog2(Dst: F16, Src: Src0, Flags);
3808 auto Ext0 = B.buildFPExt(Res: F32, Op: Log, Flags);
3809 auto Ext1 = B.buildFPExt(Res: F32, Op: Src1, Flags);
3810 auto Mul = B.buildIntrinsic(ID: Intrinsic::amdgcn_fmul_legacy, Res: {F32})
3811 .addUse(RegNo: Ext0.getReg(Idx: 0))
3812 .addUse(RegNo: Ext1.getReg(Idx: 0))
3813 .setMIFlags(Flags);
3814 B.buildFExp2(Dst, Src: B.buildFPTrunc(Res: F16, Op: Mul), Flags);
3815 } else
3816 return false;
3817
3818 MI.eraseFromParent();
3819 return true;
3820}
3821
3822// Find a source register, ignoring any possible source modifiers.
3823static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3824 Register ModSrc = OrigSrc;
3825 if (MachineInstr *SrcFNeg = getOpcodeDef(Opcode: AMDGPU::G_FNEG, Reg: ModSrc, MRI)) {
3826 ModSrc = SrcFNeg->getOperand(i: 1).getReg();
3827 if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3828 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3829 } else if (MachineInstr *SrcFAbs = getOpcodeDef(Opcode: AMDGPU::G_FABS, Reg: ModSrc, MRI))
3830 ModSrc = SrcFAbs->getOperand(i: 1).getReg();
3831 return ModSrc;
3832}
3833
3834bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3835 MachineRegisterInfo &MRI,
3836 MachineIRBuilder &B) const {
3837
3838 const LLT S1 = LLT::scalar(SizeInBits: 1);
3839 const LLT F64 = LLT::float64();
3840 Register Dst = MI.getOperand(i: 0).getReg();
3841 Register OrigSrc = MI.getOperand(i: 1).getReg();
3842 unsigned Flags = MI.getFlags();
3843 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3844 "this should not have been custom lowered");
3845
3846 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3847 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3848 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3849 // V_FRACT bug is:
3850 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3851 //
3852 // Convert floor(x) to (x - fract(x))
3853
3854 auto Fract = B.buildIntrinsic(ID: Intrinsic::amdgcn_fract, Res: {F64})
3855 .addUse(RegNo: OrigSrc)
3856 .setMIFlags(Flags);
3857
3858 // Give source modifier matching some assistance before obscuring a foldable
3859 // pattern.
3860
3861 // TODO: We can avoid the neg on the fract? The input sign to fract
3862 // shouldn't matter?
3863 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3864
3865 auto Const =
3866 B.buildFConstant(Res: F64, Val: llvm::bit_cast<double>(from: 0x3fefffffffffffff));
3867
3868 Register Min = MRI.createGenericVirtualRegister(Ty: F64);
3869
3870 // We don't need to concern ourselves with the snan handling difference, so
3871 // use the one which will directly select.
3872 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3873 if (MFI->getMode().IEEE)
3874 B.buildFMinNumIEEE(Dst: Min, Src0: Fract, Src1: Const, Flags);
3875 else
3876 B.buildFMinNum(Dst: Min, Src0: Fract, Src1: Const, Flags);
3877
3878 Register CorrectedFract = Min;
3879 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
3880 auto IsNan = B.buildFCmp(Pred: CmpInst::FCMP_ORD, Res: S1, Op0: ModSrc, Op1: ModSrc, Flags);
3881 CorrectedFract = B.buildSelect(Res: F64, Tst: IsNan, Op0: ModSrc, Op1: Min, Flags).getReg(Idx: 0);
3882 }
3883
3884 auto NegFract = B.buildFNeg(Dst: F64, Src0: CorrectedFract, Flags);
3885 B.buildFAdd(Dst, Src0: OrigSrc, Src1: NegFract, Flags);
3886
3887 MI.eraseFromParent();
3888 return true;
3889}
3890
3891// Turn an illegal packed v2s16 build vector into bit operations.
3892// TODO: This should probably be a bitcast action in LegalizerHelper.
3893bool AMDGPULegalizerInfo::legalizeBuildVector(
3894 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3895 Register Dst = MI.getOperand(i: 0).getReg();
3896 const LLT S32 = LLT::scalar(SizeInBits: 32);
3897 const LLT S16 = LLT::scalar(SizeInBits: 16);
3898 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3899
3900 Register Src0 = MI.getOperand(i: 1).getReg();
3901 Register Src1 = MI.getOperand(i: 2).getReg();
3902
3903 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3904 assert(MRI.getType(Src0) == S32);
3905 Src0 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 1).getReg()).getReg(Idx: 0);
3906 Src1 = B.buildTrunc(Res: S16, Op: MI.getOperand(i: 2).getReg()).getReg(Idx: 0);
3907 }
3908
3909 auto Merge = B.buildMergeLikeInstr(Res: S32, Ops: {Src0, Src1});
3910 B.buildBitcast(Dst, Src: Merge);
3911
3912 MI.eraseFromParent();
3913 return true;
3914}
3915
3916// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3917//
3918// Source and accumulation registers must all be 32-bits.
3919//
3920// TODO: When the multiply is uniform, we should produce a code sequence
3921// that is better suited to instruction selection on the SALU. Instead of
3922// the outer loop going over parts of the result, the outer loop should go
3923// over parts of one of the factors. This should result in instruction
3924// selection that makes full use of S_ADDC_U32 instructions.
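// For example, a 64 x 64 -> 64 bit multiply decomposes into 32-bit parts as
//   Accum[0] = lo32(Src0[0] * Src1[0])
//   Accum[1] = hi32(Src0[0] * Src1[0]) + lo32(Src0[0] * Src1[1])
//                                      + lo32(Src0[1] * Src1[0])
// where the partial products are computed with MAD_64_32 (or plain 32-bit
// multiplies when the high half is discarded) and the carries propagate into
// the higher parts.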
3925void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3926 MutableArrayRef<Register> Accum,
3927 ArrayRef<Register> Src0,
3928 ArrayRef<Register> Src1,
3929 bool UsePartialMad64_32,
3930 bool SeparateOddAlignedProducts) const {
3931 // Use (possibly empty) vectors of S1 registers to represent the set of
3932 // carries from one pair of positions to the next.
3933 using Carry = SmallVector<Register, 2>;
3934
3935 MachineIRBuilder &B = Helper.MIRBuilder;
3936 GISelValueTracking &VT = *Helper.getValueTracking();
3937
3938 const LLT S1 = LLT::scalar(SizeInBits: 1);
3939 const LLT S32 = LLT::scalar(SizeInBits: 32);
3940 const LLT S64 = LLT::scalar(SizeInBits: 64);
3941
3942 Register Zero32;
3943 Register Zero64;
3944
3945 auto getZero32 = [&]() -> Register {
3946 if (!Zero32)
3947 Zero32 = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
3948 return Zero32;
3949 };
3950 auto getZero64 = [&]() -> Register {
3951 if (!Zero64)
3952 Zero64 = B.buildConstant(Res: S64, Val: 0).getReg(Idx: 0);
3953 return Zero64;
3954 };
3955
3956 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3957 for (unsigned i = 0; i < Src0.size(); ++i) {
3958 Src0KnownZeros.push_back(Elt: VT.getKnownBits(R: Src0[i]).isZero());
3959 Src1KnownZeros.push_back(Elt: VT.getKnownBits(R: Src1[i]).isZero());
3960 }
3961
3962 // Merge the given carries into the 32-bit LocalAccum, which is modified
3963 // in-place.
3964 //
3965 // Returns the carry-out, which is a single S1 register or null.
3966 auto mergeCarry =
3967 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3968 if (CarryIn.empty())
3969 return Register();
3970
3971 bool HaveCarryOut = true;
3972 Register CarryAccum;
3973 if (CarryIn.size() == 1) {
3974 if (!LocalAccum) {
3975 LocalAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3976 return Register();
3977 }
3978
3979 CarryAccum = getZero32();
3980 } else {
3981 CarryAccum = B.buildZExt(Res: S32, Op: CarryIn[0]).getReg(Idx: 0);
3982 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3983 CarryAccum =
3984 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: getZero32(), CarryIn: CarryIn[i])
3985 .getReg(Idx: 0);
3986 }
3987
3988 if (!LocalAccum) {
3989 LocalAccum = getZero32();
3990 HaveCarryOut = false;
3991 }
3992 }
3993
3994 auto Add =
3995 B.buildUAdde(Res: S32, CarryOut: S1, Op0: CarryAccum, Op1: LocalAccum, CarryIn: CarryIn.back());
3996 LocalAccum = Add.getReg(Idx: 0);
3997 return HaveCarryOut ? Add.getReg(Idx: 1) : Register();
3998 };
3999
4000 // Build a multiply-add chain to compute
4001 //
4002 // LocalAccum + (partial products at DstIndex)
4003 // + (opportunistic subset of CarryIn)
4004 //
4005 // LocalAccum is an array of one or two 32-bit registers that are updated
4006 // in-place. The incoming registers may be null.
4007 //
4008 // In some edge cases, carry-ins can be consumed "for free". In that case,
4009 // the consumed carry bits are removed from CarryIn in-place.
4010 auto buildMadChain =
4011 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4012 -> Carry {
4013 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4014 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4015
4016 Carry CarryOut;
4017 unsigned j0 = 0;
4018
4019 // Use plain 32-bit multiplication for the most significant part of the
4020 // result by default.
4021 if (LocalAccum.size() == 1 &&
4022 (!UsePartialMad64_32 || !CarryIn.empty())) {
4023 do {
4024 // Skip multiplication if one of the operands is 0
4025 unsigned j1 = DstIndex - j0;
4026 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4027 ++j0;
4028 continue;
4029 }
4030 auto Mul = B.buildMul(Dst: S32, Src0: Src0[j0], Src1: Src1[j1]);
4031 if (!LocalAccum[0] || VT.getKnownBits(R: LocalAccum[0]).isZero()) {
4032 LocalAccum[0] = Mul.getReg(Idx: 0);
4033 } else {
4034 if (CarryIn.empty()) {
4035 LocalAccum[0] = B.buildAdd(Dst: S32, Src0: LocalAccum[0], Src1: Mul).getReg(Idx: 0);
4036 } else {
4037 LocalAccum[0] =
4038 B.buildUAdde(Res: S32, CarryOut: S1, Op0: LocalAccum[0], Op1: Mul, CarryIn: CarryIn.back())
4039 .getReg(Idx: 0);
4040 CarryIn.pop_back();
4041 }
4042 }
4043 ++j0;
4044 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4045 }
4046
4047 // Build full 64-bit multiplies.
4048 if (j0 <= DstIndex) {
4049 bool HaveSmallAccum = false;
4050 Register Tmp;
4051
4052 if (LocalAccum[0]) {
4053 if (LocalAccum.size() == 1) {
4054 Tmp = B.buildAnyExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4055 HaveSmallAccum = true;
4056 } else if (LocalAccum[1]) {
4057 Tmp = B.buildMergeLikeInstr(Res: S64, Ops: LocalAccum).getReg(Idx: 0);
4058 HaveSmallAccum = false;
4059 } else {
4060 Tmp = B.buildZExt(Res: S64, Op: LocalAccum[0]).getReg(Idx: 0);
4061 HaveSmallAccum = true;
4062 }
4063 } else {
4064 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4065 Tmp = getZero64();
4066 HaveSmallAccum = true;
4067 }
4068
4069 do {
4070 unsigned j1 = DstIndex - j0;
4071 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4072 ++j0;
4073 continue;
4074 }
4075 auto Mad = B.buildInstr(Opc: AMDGPU::G_AMDGPU_MAD_U64_U32, DstOps: {S64, S1},
4076 SrcOps: {Src0[j0], Src1[j1], Tmp});
4077 Tmp = Mad.getReg(Idx: 0);
4078 if (!HaveSmallAccum)
4079 CarryOut.push_back(Elt: Mad.getReg(Idx: 1));
4080 HaveSmallAccum = false;
4081
4082 ++j0;
4083 } while (j0 <= DstIndex);
4084
4085 auto Unmerge = B.buildUnmerge(Res: S32, Op: Tmp);
4086 LocalAccum[0] = Unmerge.getReg(Idx: 0);
4087 if (LocalAccum.size() > 1)
4088 LocalAccum[1] = Unmerge.getReg(Idx: 1);
4089 }
4090
4091 return CarryOut;
4092 };
4093
4094 // Outer multiply loop, iterating over destination parts from least
4095 // significant to most significant parts.
4096 //
4097 // The columns of the following diagram correspond to the destination parts
4098 // affected by one iteration of the outer loop (ignoring boundary
4099 // conditions).
4100 //
4101 // Dest index relative to 2 * i: 1 0 -1
4102 // ------
4103 // Carries from previous iteration: e o
4104 // Even-aligned partial product sum: E E .
4105 // Odd-aligned partial product sum: O O
4106 //
4107 // 'o' is OddCarry, 'e' is EvenCarry.
4108 // EE and OO are computed from partial products via buildMadChain and use
4109 // accumulation where possible and appropriate.
4110 //
4111 Register SeparateOddCarry;
4112 Carry EvenCarry;
4113 Carry OddCarry;
4114
4115 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4116 Carry OddCarryIn = std::move(OddCarry);
4117 Carry EvenCarryIn = std::move(EvenCarry);
4118 OddCarry.clear();
4119 EvenCarry.clear();
4120
4121 // Partial products at offset 2 * i.
4122 if (2 * i < Accum.size()) {
4123 auto LocalAccum = Accum.drop_front(N: 2 * i).take_front(N: 2);
4124 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4125 }
4126
4127 // Partial products at offset 2 * i - 1.
4128 if (i > 0) {
4129 if (!SeparateOddAlignedProducts) {
4130 auto LocalAccum = Accum.drop_front(N: 2 * i - 1).take_front(N: 2);
4131 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4132 } else {
4133 bool IsHighest = 2 * i >= Accum.size();
4134 Register SeparateOddOut[2];
4135 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4136 .take_front(N: IsHighest ? 1 : 2);
4137 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4138
4139 MachineInstr *Lo;
4140
4141 if (i == 1) {
4142 if (!IsHighest)
4143 Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0]);
4144 else
4145 Lo = B.buildAdd(Dst: S32, Src0: Accum[2 * i - 1], Src1: SeparateOddOut[0]);
4146 } else {
4147 Lo = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i - 1], Op1: SeparateOddOut[0],
4148 CarryIn: SeparateOddCarry);
4149 }
4150 Accum[2 * i - 1] = Lo->getOperand(i: 0).getReg();
4151
4152 if (!IsHighest) {
4153 auto Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Accum[2 * i], Op1: SeparateOddOut[1],
4154 CarryIn: Lo->getOperand(i: 1).getReg());
4155 Accum[2 * i] = Hi.getReg(Idx: 0);
4156 SeparateOddCarry = Hi.getReg(Idx: 1);
4157 }
4158 }
4159 }
4160
4161 // Add in the carries from the previous iteration
4162 if (i > 0) {
4163 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4164 EvenCarryIn.push_back(Elt: CarryOut);
4165
4166 if (2 * i < Accum.size()) {
4167 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4168 OddCarry.push_back(Elt: CarryOut);
4169 }
4170 }
4171 }
4172}
4173
4174// Custom narrowing of wide multiplies using wide multiply-add instructions.
4175//
4176// TODO: If the multiply is followed by an addition, we should attempt to
4177// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4178bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4179 MachineInstr &MI) const {
4180 assert(ST.hasMad64_32());
4181 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4182
4183 MachineIRBuilder &B = Helper.MIRBuilder;
4184 MachineRegisterInfo &MRI = *B.getMRI();
4185
4186 Register DstReg = MI.getOperand(i: 0).getReg();
4187 Register Src0 = MI.getOperand(i: 1).getReg();
4188 Register Src1 = MI.getOperand(i: 2).getReg();
4189
4190 LLT Ty = MRI.getType(Reg: DstReg);
4191 assert(Ty.isScalar());
4192
4193 unsigned Size = Ty.getSizeInBits();
4194 unsigned NumParts = Size / 32;
4195 assert((Size % 32) == 0);
4196 assert(NumParts >= 2);
4197
4198 // Whether to use MAD_64_32 for partial products whose high half is
4199 // discarded. This avoids some ADD instructions but risks false dependency
4200 // stalls on some subtargets in some cases.
4201 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4202
4203 // Whether to compute odd-aligned partial products separately. This is
4204 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4205 // in an even-aligned VGPR.
4206 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4207
4208 LLT S32 = LLT::scalar(SizeInBits: 32);
4209 SmallVector<Register, 2> Src0Parts, Src1Parts;
4210 for (unsigned i = 0; i < NumParts; ++i) {
4211 Src0Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4212 Src1Parts.push_back(Elt: MRI.createGenericVirtualRegister(Ty: S32));
4213 }
4214 B.buildUnmerge(Res: Src0Parts, Op: Src0);
4215 B.buildUnmerge(Res: Src1Parts, Op: Src1);
4216
4217 SmallVector<Register, 2> AccumRegs(NumParts);
4218 buildMultiply(Helper, Accum: AccumRegs, Src0: Src0Parts, Src1: Src1Parts, UsePartialMad64_32,
4219 SeparateOddAlignedProducts);
4220
4221 B.buildMergeLikeInstr(Res: DstReg, Ops: AccumRegs);
4222 MI.eraseFromParent();
4223 return true;
4224}
4225
4226// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4227// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4228// case with a single min instruction instead of a compare+select.
4229bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4230 MachineRegisterInfo &MRI,
4231 MachineIRBuilder &B) const {
4232 Register Dst = MI.getOperand(i: 0).getReg();
4233 Register Src = MI.getOperand(i: 1).getReg();
4234 LLT DstTy = MRI.getType(Reg: Dst);
4235 LLT SrcTy = MRI.getType(Reg: Src);
4236
4237 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4238 ? AMDGPU::G_AMDGPU_FFBH_U32
4239 : AMDGPU::G_AMDGPU_FFBL_B32;
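 // FFBH/FFBL return -1 (all ones) when no bit is set, so clamping with umin
 // against the source bit width produces the expected result for a zero input.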
4240 auto Tmp = B.buildInstr(Opc: NewOpc, DstOps: {DstTy}, SrcOps: {Src});
4241 B.buildUMin(Dst, Src0: Tmp, Src1: B.buildConstant(Res: DstTy, Val: SrcTy.getSizeInBits()));
4242
4243 MI.eraseFromParent();
4244 return true;
4245}
4246
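// Lower ctlz_zero_undef of a sub-32-bit value by any-extending it to 32 bits
// and shifting it into the high bits, so that G_AMDGPU_FFBH_U32 counts the
// same number of leading zeros as the original narrow type.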
4247bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4248 MachineRegisterInfo &MRI,
4249 MachineIRBuilder &B) const {
4250 Register Dst = MI.getOperand(i: 0).getReg();
4251 Register Src = MI.getOperand(i: 1).getReg();
4252 LLT SrcTy = MRI.getType(Reg: Src);
4253 TypeSize NumBits = SrcTy.getSizeInBits();
4254
4255 assert(NumBits < 32u);
4256
4257 auto ShiftAmt = B.buildConstant(Res: S32, Val: 32u - NumBits);
4258 auto Extend = B.buildAnyExt(Res: S32, Op: {Src}).getReg(Idx: 0u);
4259 auto Shift = B.buildShl(Dst: S32, Src0: Extend, Src1: ShiftAmt);
4260 auto Ctlz = B.buildInstr(Opc: AMDGPU::G_AMDGPU_FFBH_U32, DstOps: {S32}, SrcOps: {Shift});
4261 B.buildTrunc(Res: Dst, Op: Ctlz);
4262 MI.eraseFromParent();
4263 return true;
4264}
4265
4266// Check that this is a G_XOR x, -1
4267static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4268 if (MI.getOpcode() != TargetOpcode::G_XOR)
4269 return false;
4270 auto ConstVal = getIConstantVRegSExtVal(VReg: MI.getOperand(i: 2).getReg(), MRI);
4271 return ConstVal == -1;
4272}
4273
4274// Return the use branch instruction, or null if the usage is invalid.
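// Roughly, the pattern being matched is (with an optional G_XOR %c, -1 negating
// the condition in between):
//   %c = G_INTRINSIC ...      ; sole non-debug use of %c is the branch
//   G_BRCOND %c, %bb.target
//   G_BR %bb.other            ; or the G_BRCOND ends the block, in which case
//                             ; the next block becomes the unconditional target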
4275static MachineInstr *
4276verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4277 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4278 Register CondDef = MI.getOperand(i: 0).getReg();
4279 if (!MRI.hasOneNonDBGUse(RegNo: CondDef))
4280 return nullptr;
4281
4282 MachineBasicBlock *Parent = MI.getParent();
4283 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(RegNo: CondDef);
4284
4285 if (isNot(MRI, MI: *UseMI)) {
4286 Register NegatedCond = UseMI->getOperand(i: 0).getReg();
4287 if (!MRI.hasOneNonDBGUse(RegNo: NegatedCond))
4288 return nullptr;
4289
4290    // The condition being negated is about to be deleted, so erase this xor too.
4291 eraseInstr(MI&: *UseMI, MRI);
4292
4293 UseMI = &*MRI.use_instr_nodbg_begin(RegNo: NegatedCond);
4294 Negated = true;
4295 }
4296
4297 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4298 return nullptr;
4299
4300 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4301 MachineBasicBlock::iterator Next = std::next(x: UseMI->getIterator());
4302 if (Next == Parent->end()) {
4303 MachineFunction::iterator NextMBB = std::next(x: Parent->getIterator());
4304 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4305 return nullptr;
4306 UncondBrTarget = &*NextMBB;
4307 } else {
4308 if (Next->getOpcode() != AMDGPU::G_BR)
4309 return nullptr;
4310 Br = &*Next;
4311 UncondBrTarget = Br->getOperand(i: 0).getMBB();
4312 }
4313
4314 return UseMI;
4315}
4316
4317void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4318 MachineIRBuilder &B,
4319 const ArgDescriptor *Arg,
4320 const TargetRegisterClass *ArgRC,
4321 LLT ArgTy) const {
4322 MCRegister SrcReg = Arg->getRegister();
4323 assert(SrcReg.isPhysical() && "Physical register expected");
4324 assert(DstReg.isVirtual() && "Virtual register expected");
4325
4326 Register LiveIn = getFunctionLiveInPhysReg(MF&: B.getMF(), TII: B.getTII(), PhysReg: SrcReg,
4327 RC: *ArgRC, DL: B.getDebugLoc(), RegTy: ArgTy);
4328 if (Arg->isMasked()) {
4329 // TODO: Should we try to emit this once in the entry block?
4330 const LLT S32 = LLT::scalar(SizeInBits: 32);
4331 const unsigned Mask = Arg->getMask();
4332 const unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
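    // For example, a field described by Mask = 0x000ffc00 gives Shift = 10, so
    // the value is extracted below as (LiveIn >> 10) & 0x3ff.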
4333
4334 Register AndMaskSrc = LiveIn;
4335
4336 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4337 // 0.
4338 if (Shift != 0) {
4339 auto ShiftAmt = B.buildConstant(Res: S32, Val: Shift);
4340 AndMaskSrc = B.buildLShr(Dst: S32, Src0: LiveIn, Src1: ShiftAmt).getReg(Idx: 0);
4341 }
4342
4343 B.buildAnd(Dst: DstReg, Src0: AndMaskSrc, Src1: B.buildConstant(Res: S32, Val: Mask >> Shift));
4344 } else {
4345 B.buildCopy(Res: DstReg, Op: LiveIn);
4346 }
4347}
4348
4349bool AMDGPULegalizerInfo::loadInputValue(
4350 Register DstReg, MachineIRBuilder &B,
4351 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4352 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4353 const ArgDescriptor *Arg = nullptr;
4354 const TargetRegisterClass *ArgRC;
4355 LLT ArgTy;
4356
4357 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4358 const ArgDescriptor WorkGroupIDX =
4359 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
4360 // If GridZ is not programmed in an entry function then the hardware will set
4361 // it to all zeros, so there is no need to mask the GridY value in the low
4362 // order bits.
4363 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4364 Reg: AMDGPU::TTMP7,
4365 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4366 const ArgDescriptor WorkGroupIDZ =
4367 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
4368 if (ST.hasArchitectedSGPRs() &&
4369 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4370 switch (ArgType) {
4371 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4372 Arg = &WorkGroupIDX;
4373 ArgRC = &AMDGPU::SReg_32RegClass;
4374 ArgTy = LLT::scalar(SizeInBits: 32);
4375 break;
4376 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4377 Arg = &WorkGroupIDY;
4378 ArgRC = &AMDGPU::SReg_32RegClass;
4379 ArgTy = LLT::scalar(SizeInBits: 32);
4380 break;
4381 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4382 Arg = &WorkGroupIDZ;
4383 ArgRC = &AMDGPU::SReg_32RegClass;
4384 ArgTy = LLT::scalar(SizeInBits: 32);
4385 break;
4386 default:
4387 break;
4388 }
4389 }
4390
4391 if (!Arg)
4392 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4393
4394 if (!Arg) {
4395 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4396 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4397 // case the pointer argument may be missing and we use null.
4398 B.buildConstant(Res: DstReg, Val: 0);
4399 return true;
4400 }
4401
4402 // It's undefined behavior if a function marked with the amdgpu-no-*
4403 // attributes uses the corresponding intrinsic.
4404 B.buildUndef(Res: DstReg);
4405 return true;
4406 }
4407
4408 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4409 return false; // TODO: Handle these
4410 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4411 return true;
4412}
4413
4414bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4415 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4416 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4417 if (!loadInputValue(DstReg: MI.getOperand(i: 0).getReg(), B, ArgType))
4418 return false;
4419
4420 MI.eraseFromParent();
4421 return true;
4422}
4423
4424static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4425 int64_t C) {
4426 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: C);
4427 MI.eraseFromParent();
4428 return true;
4429}
4430
4431bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4432 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4433 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4434 unsigned MaxID = ST.getMaxWorkitemID(Kernel: B.getMF().getFunction(), Dimension: Dim);
4435 if (MaxID == 0)
4436 return replaceWithConstant(B, MI, C: 0);
4437
4438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4439 const ArgDescriptor *Arg;
4440 const TargetRegisterClass *ArgRC;
4441 LLT ArgTy;
4442 std::tie(args&: Arg, args&: ArgRC, args&: ArgTy) = MFI->getPreloadedValue(Value: ArgType);
4443
4444 Register DstReg = MI.getOperand(i: 0).getReg();
4445 if (!Arg) {
4446 // It's undefined behavior if a function marked with the amdgpu-no-*
4447 // attributes uses the corresponding intrinsic.
4448 B.buildUndef(Res: DstReg);
4449 MI.eraseFromParent();
4450 return true;
4451 }
4452
4453 if (Arg->isMasked()) {
4454 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4455 // masking operations anyway.
4456 //
4457 // TODO: We could assert the top bit is 0 for the source copy.
4458 if (!loadInputValue(DstReg, B, ArgType))
4459 return false;
4460 } else {
4461 Register TmpReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
4462 if (!loadInputValue(DstReg: TmpReg, B, ArgType))
4463 return false;
4464 B.buildAssertZExt(Res: DstReg, Op: TmpReg, Size: llvm::bit_width(Value: MaxID));
4465 }
4466
4467 MI.eraseFromParent();
4468 return true;
4469}
4470
4471Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4472 int64_t Offset) const {
4473 LLT PtrTy = LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64);
4474 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(Ty: PtrTy);
4475
4476 // TODO: If we passed in the base kernel offset we could have a better
4477 // alignment than 4, but we don't really need it.
4478 if (!loadInputValue(DstReg: KernArgReg, B,
4479 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4480 llvm_unreachable("failed to find kernarg segment ptr");
4481
4482 auto COffset = B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset);
4483 // TODO: Should get nuw
4484 return B.buildPtrAdd(Res: PtrTy, Op0: KernArgReg, Op1: COffset).getReg(Idx: 0);
4485}
4486
4487/// Legalize a value that's loaded from kernel arguments. This is only used by
4488/// legacy intrinsics.
4489bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4490 MachineIRBuilder &B,
4491 uint64_t Offset,
4492 Align Alignment) const {
4493 Register DstReg = MI.getOperand(i: 0).getReg();
4494
4495 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4496 "unexpected kernarg parameter type");
4497
4498 Register Ptr = getKernargParameterPtr(B, Offset);
4499 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4500 B.buildLoad(Res: DstReg, Addr: Ptr, PtrInfo, Alignment: Align(4),
4501 MMOFlags: MachineMemOperand::MODereferenceable |
4502 MachineMemOperand::MOInvariant);
4503 MI.eraseFromParent();
4504 return true;
4505}
4506
4507bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4508 MachineRegisterInfo &MRI,
4509 MachineIRBuilder &B) const {
4510 Register Dst = MI.getOperand(i: 0).getReg();
4511 LLT DstTy = MRI.getType(Reg: Dst);
4512 LLT S16 = LLT::scalar(SizeInBits: 16);
4513 LLT S32 = LLT::scalar(SizeInBits: 32);
4514 LLT S64 = LLT::scalar(SizeInBits: 64);
4515
4516 if (DstTy == S16)
4517 return legalizeFDIV16(MI, MRI, B);
4518 if (DstTy == S32)
4519 return legalizeFDIV32(MI, MRI, B);
4520 if (DstTy == S64)
4521 return legalizeFDIV64(MI, MRI, B);
4522
4523 return false;
4524}
4525
4526void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4527 Register DstDivReg,
4528 Register DstRemReg,
4529 Register X,
4530 Register Y) const {
4531 const LLT S1 = LLT::scalar(SizeInBits: 1);
4532 const LLT S32 = LLT::scalar(SizeInBits: 32);
4533
4534 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4535 // algorithm used here.
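  // As a rough sketch: Z below is a fixed-point estimate of 2^32 / Y derived
  // from the hardware reciprocal and tightened by one Newton-Raphson step;
  // Q = umulh(X, Z) then underestimates X / Y by at most a couple of units,
  // which is why two conditional "R >= Y" fix-up steps are enough to reach the
  // exact quotient and remainder.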
4536
4537 // Initial estimate of inv(y).
4538 auto FloatY = B.buildUITOFP(Dst: S32, Src0: Y);
4539 auto RcpIFlag = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {FloatY});
4540 auto Scale = B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f7ffffe));
4541 auto ScaledY = B.buildFMul(Dst: S32, Src0: RcpIFlag, Src1: Scale);
4542 auto Z = B.buildFPTOUI(Dst: S32, Src0: ScaledY);
4543
4544  // One round of UNR (Newton-Raphson refinement of the reciprocal).
4545 auto NegY = B.buildSub(Dst: S32, Src0: B.buildConstant(Res: S32, Val: 0), Src1: Y);
4546 auto NegYZ = B.buildMul(Dst: S32, Src0: NegY, Src1: Z);
4547 Z = B.buildAdd(Dst: S32, Src0: Z, Src1: B.buildUMulH(Dst: S32, Src0: Z, Src1: NegYZ));
4548
4549 // Quotient/remainder estimate.
4550 auto Q = B.buildUMulH(Dst: S32, Src0: X, Src1: Z);
4551 auto R = B.buildSub(Dst: S32, Src0: X, Src1: B.buildMul(Dst: S32, Src0: Q, Src1: Y));
4552
4553 // First quotient/remainder refinement.
4554 auto One = B.buildConstant(Res: S32, Val: 1);
4555 auto Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4556 if (DstDivReg)
4557 Q = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4558 R = B.buildSelect(Res: S32, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4559
4560 // Second quotient/remainder refinement.
4561 Cond = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: R, Op1: Y);
4562 if (DstDivReg)
4563 B.buildSelect(Res: DstDivReg, Tst: Cond, Op0: B.buildAdd(Dst: S32, Src0: Q, Src1: One), Op1: Q);
4564
4565 if (DstRemReg)
4566 B.buildSelect(Res: DstRemReg, Tst: Cond, Op0: B.buildSub(Dst: S32, Src0: R, Src1: Y), Op1: R);
4567}
4568
4569// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4570//
4571// Return lo, hi of result
4572//
4573// %cvt.lo = G_UITOFP Val.lo
4574// %cvt.hi = G_UITOFP Val.hi
4575// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4576// %rcp = G_AMDGPU_RCP_IFLAG %mad
4577// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4578// %mul2 = G_FMUL %mul1, 2**(-32)
4579// %trunc = G_INTRINSIC_TRUNC %mul2
4580// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4581// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4582static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4583 Register Val) {
4584 const LLT S32 = LLT::scalar(SizeInBits: 32);
4585 auto Unmerge = B.buildUnmerge(Res: S32, Op: Val);
4586
4587 auto CvtLo = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 0));
4588 auto CvtHi = B.buildUITOFP(Dst: S32, Src0: Unmerge.getReg(Idx: 1));
4589
4590 auto Mad = B.buildFMAD(
4591 Dst: S32, Src0: CvtHi, // 2**32
4592 Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x4f800000)), Src2: CvtLo);
4593
4594 auto Rcp = B.buildInstr(Opc: AMDGPU::G_AMDGPU_RCP_IFLAG, DstOps: {S32}, SrcOps: {Mad});
4595 auto Mul1 = B.buildFMul(
4596 Dst: S32, Src0: Rcp, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x5f7ffffc)));
4597
4598 // 2**(-32)
4599 auto Mul2 = B.buildFMul(
4600 Dst: S32, Src0: Mul1, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0x2f800000)));
4601 auto Trunc = B.buildIntrinsicTrunc(Dst: S32, Src0: Mul2);
4602
4603 // -(2**32)
4604 auto Mad2 = B.buildFMAD(
4605 Dst: S32, Src0: Trunc, Src1: B.buildFConstant(Res: S32, Val: llvm::bit_cast<float>(from: 0xcf800000)),
4606 Src2: Mul1);
4607
4608 auto ResultLo = B.buildFPTOUI(Dst: S32, Src0: Mad2);
4609 auto ResultHi = B.buildFPTOUI(Dst: S32, Src0: Trunc);
4610
4611 return {ResultLo.getReg(Idx: 0), ResultHi.getReg(Idx: 0)};
4612}
4613
4614void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4615 Register DstDivReg,
4616 Register DstRemReg,
4617 Register Numer,
4618 Register Denom) const {
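  // Sketch of the structure below: emitReciprocalU64 yields a fixed-point
  // estimate of 2^64 / Denom, which is sharpened by two rounds of
  // multiply-high Newton-Raphson refinement (Add1, Add2). The quotient
  // estimate MulHi3 = umulh(Numer, Add2) can still be slightly low, so the
  // remainder is compared against Denom twice and the quotient/remainder are
  // conditionally bumped (the C3/C6 selects) to land on the exact result.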
4619 const LLT S32 = LLT::scalar(SizeInBits: 32);
4620 const LLT S64 = LLT::scalar(SizeInBits: 64);
4621 const LLT S1 = LLT::scalar(SizeInBits: 1);
4622 Register RcpLo, RcpHi;
4623
4624 std::tie(args&: RcpLo, args&: RcpHi) = emitReciprocalU64(B, Val: Denom);
4625
4626 auto Rcp = B.buildMergeLikeInstr(Res: S64, Ops: {RcpLo, RcpHi});
4627
4628 auto Zero64 = B.buildConstant(Res: S64, Val: 0);
4629 auto NegDenom = B.buildSub(Dst: S64, Src0: Zero64, Src1: Denom);
4630
4631 auto MulLo1 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Rcp);
4632 auto MulHi1 = B.buildUMulH(Dst: S64, Src0: Rcp, Src1: MulLo1);
4633
4634 auto UnmergeMulHi1 = B.buildUnmerge(Res: S32, Op: MulHi1);
4635 Register MulHi1_Lo = UnmergeMulHi1.getReg(Idx: 0);
4636 Register MulHi1_Hi = UnmergeMulHi1.getReg(Idx: 1);
4637
4638 auto Add1_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: RcpLo, Op1: MulHi1_Lo);
4639 auto Add1_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: RcpHi, Op1: MulHi1_Hi, CarryIn: Add1_Lo.getReg(Idx: 1));
4640 auto Add1 = B.buildMergeLikeInstr(Res: S64, Ops: {Add1_Lo, Add1_Hi});
4641
4642 auto MulLo2 = B.buildMul(Dst: S64, Src0: NegDenom, Src1: Add1);
4643 auto MulHi2 = B.buildUMulH(Dst: S64, Src0: Add1, Src1: MulLo2);
4644 auto UnmergeMulHi2 = B.buildUnmerge(Res: S32, Op: MulHi2);
4645 Register MulHi2_Lo = UnmergeMulHi2.getReg(Idx: 0);
4646 Register MulHi2_Hi = UnmergeMulHi2.getReg(Idx: 1);
4647
4648 auto Zero32 = B.buildConstant(Res: S32, Val: 0);
4649 auto Add2_Lo = B.buildUAddo(Res: S32, CarryOut: S1, Op0: Add1_Lo, Op1: MulHi2_Lo);
4650 auto Add2_Hi = B.buildUAdde(Res: S32, CarryOut: S1, Op0: Add1_Hi, Op1: MulHi2_Hi, CarryIn: Add2_Lo.getReg(Idx: 1));
4651 auto Add2 = B.buildMergeLikeInstr(Res: S64, Ops: {Add2_Lo, Add2_Hi});
4652
4653 auto UnmergeNumer = B.buildUnmerge(Res: S32, Op: Numer);
4654 Register NumerLo = UnmergeNumer.getReg(Idx: 0);
4655 Register NumerHi = UnmergeNumer.getReg(Idx: 1);
4656
4657 auto MulHi3 = B.buildUMulH(Dst: S64, Src0: Numer, Src1: Add2);
4658 auto Mul3 = B.buildMul(Dst: S64, Src0: Denom, Src1: MulHi3);
4659 auto UnmergeMul3 = B.buildUnmerge(Res: S32, Op: Mul3);
4660 Register Mul3_Lo = UnmergeMul3.getReg(Idx: 0);
4661 Register Mul3_Hi = UnmergeMul3.getReg(Idx: 1);
4662 auto Sub1_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: NumerLo, Op1: Mul3_Lo);
4663 auto Sub1_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: NumerHi, Op1: Mul3_Hi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4664 auto Sub1_Mi = B.buildSub(Dst: S32, Src0: NumerHi, Src1: Mul3_Hi);
4665 auto Sub1 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub1_Lo, Sub1_Hi});
4666
4667 auto UnmergeDenom = B.buildUnmerge(Res: S32, Op: Denom);
4668 Register DenomLo = UnmergeDenom.getReg(Idx: 0);
4669 Register DenomHi = UnmergeDenom.getReg(Idx: 1);
4670
4671 auto CmpHi = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4672 auto C1 = B.buildSExt(Res: S32, Op: CmpHi);
4673
4674 auto CmpLo = B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub1_Lo, Op1: DenomLo);
4675 auto C2 = B.buildSExt(Res: S32, Op: CmpLo);
4676
4677 auto CmpEq = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub1_Hi, Op1: DenomHi);
4678 auto C3 = B.buildSelect(Res: S32, Tst: CmpEq, Op0: C2, Op1: C1);
4679
4680  // TODO: Here and below, portions of the code could be enclosed in if/endif.
4681  // Currently the control flow is unconditional and we have 4 selects after
4682  // the potential endif to substitute for PHIs.
4683
4684 // if C3 != 0 ...
4685 auto Sub2_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub1_Lo, Op1: DenomLo);
4686 auto Sub2_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub1_Mi, Op1: DenomHi, CarryIn: Sub1_Lo.getReg(Idx: 1));
4687 auto Sub2_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: Zero32, CarryIn: Sub2_Lo.getReg(Idx: 1));
4688 auto Sub2 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub2_Lo, Sub2_Hi});
4689
4690 auto One64 = B.buildConstant(Res: S64, Val: 1);
4691 auto Add3 = B.buildAdd(Dst: S64, Src0: MulHi3, Src1: One64);
4692
4693 auto C4 =
4694 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Hi, Op1: DenomHi));
4695 auto C5 =
4696 B.buildSExt(Res: S32, Op: B.buildICmp(Pred: CmpInst::ICMP_UGE, Res: S1, Op0: Sub2_Lo, Op1: DenomLo));
4697 auto C6 = B.buildSelect(
4698 Res: S32, Tst: B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: Sub2_Hi, Op1: DenomHi), Op0: C5, Op1: C4);
4699
4700 // if (C6 != 0)
4701 auto Add4 = B.buildAdd(Dst: S64, Src0: Add3, Src1: One64);
4702 auto Sub3_Lo = B.buildUSubo(Res: S32, CarryOut: S1, Op0: Sub2_Lo, Op1: DenomLo);
4703
4704 auto Sub3_Mi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub2_Mi, Op1: DenomHi, CarryIn: Sub2_Lo.getReg(Idx: 1));
4705 auto Sub3_Hi = B.buildUSube(Res: S32, CarryOut: S1, Op0: Sub3_Mi, Op1: Zero32, CarryIn: Sub3_Lo.getReg(Idx: 1));
4706 auto Sub3 = B.buildMergeLikeInstr(Res: S64, Ops: {Sub3_Lo, Sub3_Hi});
4707
4708 // endif C6
4709 // endif C3
4710
4711 if (DstDivReg) {
4712 auto Sel1 = B.buildSelect(
4713 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Add4, Op1: Add3);
4714 B.buildSelect(Res: DstDivReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4715 Op0: Sel1, Op1: MulHi3);
4716 }
4717
4718 if (DstRemReg) {
4719 auto Sel2 = B.buildSelect(
4720 Res: S64, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C6, Op1: Zero32), Op0: Sub3, Op1: Sub2);
4721 B.buildSelect(Res: DstRemReg, Tst: B.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: C3, Op1: Zero32),
4722 Op0: Sel2, Op1: Sub1);
4723 }
4724}
4725
4726bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4727 MachineRegisterInfo &MRI,
4728 MachineIRBuilder &B) const {
4729 Register DstDivReg, DstRemReg;
4730 switch (MI.getOpcode()) {
4731 default:
4732 llvm_unreachable("Unexpected opcode!");
4733 case AMDGPU::G_UDIV: {
4734 DstDivReg = MI.getOperand(i: 0).getReg();
4735 break;
4736 }
4737 case AMDGPU::G_UREM: {
4738 DstRemReg = MI.getOperand(i: 0).getReg();
4739 break;
4740 }
4741 case AMDGPU::G_UDIVREM: {
4742 DstDivReg = MI.getOperand(i: 0).getReg();
4743 DstRemReg = MI.getOperand(i: 1).getReg();
4744 break;
4745 }
4746 }
4747
4748 const LLT S64 = LLT::scalar(SizeInBits: 64);
4749 const LLT S32 = LLT::scalar(SizeInBits: 32);
4750 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4751 Register Num = MI.getOperand(i: FirstSrcOpIdx).getReg();
4752 Register Den = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4753 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4754
4755 if (Ty == S32)
4756 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, X: Num, Y: Den);
4757 else if (Ty == S64)
4758 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Numer: Num, Denom: Den);
4759 else
4760 return false;
4761
4762 MI.eraseFromParent();
4763 return true;
4764}
4765
4766bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4767 MachineRegisterInfo &MRI,
4768 MachineIRBuilder &B) const {
4769 const LLT S64 = LLT::scalar(SizeInBits: 64);
4770 const LLT S32 = LLT::scalar(SizeInBits: 32);
4771
4772 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4773 if (Ty != S32 && Ty != S64)
4774 return false;
4775
4776 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4777 Register LHS = MI.getOperand(i: FirstSrcOpIdx).getReg();
4778 Register RHS = MI.getOperand(i: FirstSrcOpIdx + 1).getReg();
4779
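  // Take absolute values branchlessly: with sign = v >> (bits - 1), which is
  // all ones for negative v and zero otherwise, (v + sign) ^ sign == |v|.
  // For example v = -5: sign = -1, (-5 + -1) ^ -1 == -6 ^ -1 == 5. The same
  // trick in reverse, (tmp ^ sign) - sign, reapplies the sign to the unsigned
  // division results at the end.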
4780 auto SignBitOffset = B.buildConstant(Res: S32, Val: Ty.getSizeInBits() - 1);
4781 auto LHSign = B.buildAShr(Dst: Ty, Src0: LHS, Src1: SignBitOffset);
4782 auto RHSign = B.buildAShr(Dst: Ty, Src0: RHS, Src1: SignBitOffset);
4783
4784 LHS = B.buildAdd(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4785 RHS = B.buildAdd(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4786
4787 LHS = B.buildXor(Dst: Ty, Src0: LHS, Src1: LHSign).getReg(Idx: 0);
4788 RHS = B.buildXor(Dst: Ty, Src0: RHS, Src1: RHSign).getReg(Idx: 0);
4789
4790 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4791 switch (MI.getOpcode()) {
4792 default:
4793 llvm_unreachable("Unexpected opcode!");
4794 case AMDGPU::G_SDIV: {
4795 DstDivReg = MI.getOperand(i: 0).getReg();
4796 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4797 break;
4798 }
4799 case AMDGPU::G_SREM: {
4800 DstRemReg = MI.getOperand(i: 0).getReg();
4801 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4802 break;
4803 }
4804 case AMDGPU::G_SDIVREM: {
4805 DstDivReg = MI.getOperand(i: 0).getReg();
4806 DstRemReg = MI.getOperand(i: 1).getReg();
4807 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4808 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4809 break;
4810 }
4811 }
4812
4813 if (Ty == S32)
4814 legalizeUnsignedDIV_REM32Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, X: LHS, Y: RHS);
4815 else
4816 legalizeUnsignedDIV_REM64Impl(B, DstDivReg: TmpDivReg, DstRemReg: TmpRemReg, Numer: LHS, Denom: RHS);
4817
4818 if (DstDivReg) {
4819 auto Sign = B.buildXor(Dst: Ty, Src0: LHSign, Src1: RHSign).getReg(Idx: 0);
4820 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpDivReg, Src1: Sign).getReg(Idx: 0);
4821 B.buildSub(Dst: DstDivReg, Src0: SignXor, Src1: Sign);
4822 }
4823
4824 if (DstRemReg) {
4825 auto Sign = LHSign.getReg(Idx: 0); // Remainder sign is the same as LHS
4826 auto SignXor = B.buildXor(Dst: Ty, Src0: TmpRemReg, Src1: Sign).getReg(Idx: 0);
4827 B.buildSub(Dst: DstRemReg, Src0: SignXor, Src1: Sign);
4828 }
4829
4830 MI.eraseFromParent();
4831 return true;
4832}
4833
4834bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4835 MachineRegisterInfo &MRI,
4836 MachineIRBuilder &B) const {
4837 Register Res = MI.getOperand(i: 0).getReg();
4838 Register LHS = MI.getOperand(i: 1).getReg();
4839 Register RHS = MI.getOperand(i: 2).getReg();
4840 uint16_t Flags = MI.getFlags();
4841 LLT ResTy = MRI.getType(Reg: Res);
4842
4843 const MachineFunction &MF = B.getMF();
4844 bool AllowInaccurateRcp = MI.getFlag(Flag: MachineInstr::FmAfn) ||
4845 MF.getTarget().Options.UnsafeFPMath;
4846
4847 if (const auto *CLHS = getConstantFPVRegVal(VReg: LHS, MRI)) {
4848 if (!AllowInaccurateRcp && ResTy != LLT::scalar(SizeInBits: 16))
4849 return false;
4850
4851    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to the
4852    // CI documentation they have a worst-case error of 1 ulp.
4853 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4854 // use it as long as we aren't trying to use denormals.
4855 //
4856    // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4857
4858 // 1 / x -> RCP(x)
4859 if (CLHS->isExactlyValue(V: 1.0)) {
4860 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4861 .addUse(RegNo: RHS)
4862 .setMIFlags(Flags);
4863
4864 MI.eraseFromParent();
4865 return true;
4866 }
4867
4868 // -1 / x -> RCP( FNEG(x) )
4869 if (CLHS->isExactlyValue(V: -1.0)) {
4870 auto FNeg = B.buildFNeg(Dst: ResTy, Src0: RHS, Flags);
4871 B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res)
4872 .addUse(RegNo: FNeg.getReg(Idx: 0))
4873 .setMIFlags(Flags);
4874
4875 MI.eraseFromParent();
4876 return true;
4877 }
4878 }
4879
4880 // For f16 require afn or arcp.
4881 // For f32 require afn.
4882 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(SizeInBits: 16) ||
4883 !MI.getFlag(Flag: MachineInstr::FmArcp)))
4884 return false;
4885
4886 // x / y -> x * (1.0 / y)
4887 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4888 .addUse(RegNo: RHS)
4889 .setMIFlags(Flags);
4890 B.buildFMul(Dst: Res, Src0: LHS, Src1: RCP, Flags);
4891
4892 MI.eraseFromParent();
4893 return true;
4894}
4895
4896bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4897 MachineRegisterInfo &MRI,
4898 MachineIRBuilder &B) const {
4899 Register Res = MI.getOperand(i: 0).getReg();
4900 Register X = MI.getOperand(i: 1).getReg();
4901 Register Y = MI.getOperand(i: 2).getReg();
4902 uint16_t Flags = MI.getFlags();
4903 LLT ResTy = MRI.getType(Reg: Res);
4904
4905 const MachineFunction &MF = B.getMF();
4906 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4907 MI.getFlag(Flag: MachineInstr::FmAfn);
4908
4909 if (!AllowInaccurateRcp)
4910 return false;
4911
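  // One pass of the standard reciprocal refinement: with r ~= 1/y,
  //   r' = r + r * (1 - y * r)
  // roughly doubles the number of accurate bits, so two passes are used here,
  // followed by a single residual correction on the quotient itself:
  //   ret' = ret + r * (x - y * ret).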
4912 auto NegY = B.buildFNeg(Dst: ResTy, Src0: Y);
4913 auto One = B.buildFConstant(Res: ResTy, Val: 1.0);
4914
4915 auto R = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {ResTy})
4916 .addUse(RegNo: Y)
4917 .setMIFlags(Flags);
4918
4919 auto Tmp0 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4920 R = B.buildFMA(Dst: ResTy, Src0: Tmp0, Src1: R, Src2: R);
4921
4922 auto Tmp1 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: R, Src2: One);
4923 R = B.buildFMA(Dst: ResTy, Src0: Tmp1, Src1: R, Src2: R);
4924
4925 auto Ret = B.buildFMul(Dst: ResTy, Src0: X, Src1: R);
4926 auto Tmp2 = B.buildFMA(Dst: ResTy, Src0: NegY, Src1: Ret, Src2: X);
4927
4928 B.buildFMA(Dst: Res, Src0: Tmp2, Src1: R, Src2: Ret);
4929 MI.eraseFromParent();
4930 return true;
4931}
4932
4933bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4934 MachineRegisterInfo &MRI,
4935 MachineIRBuilder &B) const {
4936 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4937 return true;
4938
4939 Register Res = MI.getOperand(i: 0).getReg();
4940 Register LHS = MI.getOperand(i: 1).getReg();
4941 Register RHS = MI.getOperand(i: 2).getReg();
4942
4943 uint16_t Flags = MI.getFlags();
4944
4945 LLT S16 = LLT::scalar(SizeInBits: 16);
4946 LLT S32 = LLT::scalar(SizeInBits: 32);
4947
4948 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4949 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4950 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4951 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4952 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4953  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q += err * rcp
4954 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4955 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4956 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4957 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4958 // q16.u = opx(V_CVT_F16_F32, q32.u);
4959 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4960
4961 auto LHSExt = B.buildFPExt(Res: S32, Op: LHS, Flags);
4962 auto RHSExt = B.buildFPExt(Res: S32, Op: RHS, Flags);
4963 auto NegRHSExt = B.buildFNeg(Dst: S32, Src0: RHSExt);
4964 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
4965 .addUse(RegNo: RHSExt.getReg(Idx: 0))
4966 .setMIFlags(Flags);
4967 auto Quot = B.buildFMul(Dst: S32, Src0: LHSExt, Src1: Rcp, Flags);
4968 MachineInstrBuilder Err;
4969 if (ST.hasMadMacF32Insts()) {
4970 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4971 Quot = B.buildFMAD(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
4972 Err = B.buildFMAD(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4973 } else {
4974 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4975 Quot = B.buildFMA(Dst: S32, Src0: Err, Src1: Rcp, Src2: Quot, Flags);
4976 Err = B.buildFMA(Dst: S32, Src0: NegRHSExt, Src1: Quot, Src2: LHSExt, Flags);
4977 }
4978 auto Tmp = B.buildFMul(Dst: S32, Src0: Err, Src1: Rcp, Flags);
4979 Tmp = B.buildAnd(Dst: S32, Src0: Tmp, Src1: B.buildConstant(Res: S32, Val: 0xff800000));
4980 Quot = B.buildFAdd(Dst: S32, Src0: Tmp, Src1: Quot, Flags);
4981 auto RDst = B.buildFPTrunc(Res: S16, Op: Quot, Flags);
4982 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
4983 .addUse(RegNo: RDst.getReg(Idx: 0))
4984 .addUse(RegNo: RHS)
4985 .addUse(RegNo: LHS)
4986 .setMIFlags(Flags);
4987
4988 MI.eraseFromParent();
4989 return true;
4990}
4991
4992static constexpr unsigned SPDenormModeBitField =
4993 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 4, Values: 2);
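// This selects the 2-bit FP32 denorm field at offset 4 of the MODE register;
// the FP64/FP16 denorm field occupies the next two bits, which is why
// toggleSPDenormMode below shifts the DP default left by 2 when it rewrites
// the combined 4-bit field via S_DENORM_MODE.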
4994
4995// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4996// that enable denorms; otherwise restore the function's default FP32 mode.
4997static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4998 const GCNSubtarget &ST,
4999 SIModeRegisterDefaults Mode) {
5000 // Set SP denorm mode to this value.
5001 unsigned SPDenormMode =
5002 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5003
5004 if (ST.hasDenormModeInst()) {
5005    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
5006 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5007
5008 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5009 B.buildInstr(Opcode: AMDGPU::S_DENORM_MODE)
5010 .addImm(Val: NewDenormModeValue);
5011
5012 } else {
5013 B.buildInstr(Opcode: AMDGPU::S_SETREG_IMM32_B32)
5014 .addImm(Val: SPDenormMode)
5015 .addImm(Val: SPDenormModeBitField);
5016 }
5017}
5018
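// f32 division via the div_scale / div_fmas / div_fixup sequence: both
// operands are pre-scaled to keep the reciprocal refinement in range, the
// refinement itself is a chain of FMAs (which is why FP32 denorm support is
// temporarily enabled below when the function's mode would otherwise flush),
// and div_fixup at the end restores the special-case behavior (infinities,
// NaNs, zero denominators) expected of a full-precision divide.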
5019bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5020 MachineRegisterInfo &MRI,
5021 MachineIRBuilder &B) const {
5022 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5023 return true;
5024
5025 Register Res = MI.getOperand(i: 0).getReg();
5026 Register LHS = MI.getOperand(i: 1).getReg();
5027 Register RHS = MI.getOperand(i: 2).getReg();
5028 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5029 SIModeRegisterDefaults Mode = MFI->getMode();
5030
5031 uint16_t Flags = MI.getFlags();
5032
5033 LLT S32 = LLT::scalar(SizeInBits: 32);
5034 LLT S1 = LLT::scalar(SizeInBits: 1);
5035
5036 auto One = B.buildFConstant(Res: S32, Val: 1.0f);
5037
5038 auto DenominatorScaled =
5039 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5040 .addUse(RegNo: LHS)
5041 .addUse(RegNo: RHS)
5042 .addImm(Val: 0)
5043 .setMIFlags(Flags);
5044 auto NumeratorScaled =
5045 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S32, S1})
5046 .addUse(RegNo: LHS)
5047 .addUse(RegNo: RHS)
5048 .addImm(Val: 1)
5049 .setMIFlags(Flags);
5050
5051 auto ApproxRcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5052 .addUse(RegNo: DenominatorScaled.getReg(Idx: 0))
5053 .setMIFlags(Flags);
5054 auto NegDivScale0 = B.buildFNeg(Dst: S32, Src0: DenominatorScaled, Flags);
5055
5056 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5057 const bool HasDynamicDenormals =
5058 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5059 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5060
5061 Register SavedSPDenormMode;
5062 if (!PreservesDenormals) {
5063 if (HasDynamicDenormals) {
5064 SavedSPDenormMode = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5065 B.buildInstr(Opcode: AMDGPU::S_GETREG_B32)
5066 .addDef(RegNo: SavedSPDenormMode)
5067 .addImm(Val: SPDenormModeBitField);
5068 }
5069 toggleSPDenormMode(Enable: true, B, ST, Mode);
5070 }
5071
5072 auto Fma0 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: ApproxRcp, Src2: One, Flags);
5073 auto Fma1 = B.buildFMA(Dst: S32, Src0: Fma0, Src1: ApproxRcp, Src2: ApproxRcp, Flags);
5074 auto Mul = B.buildFMul(Dst: S32, Src0: NumeratorScaled, Src1: Fma1, Flags);
5075 auto Fma2 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Mul, Src2: NumeratorScaled, Flags);
5076 auto Fma3 = B.buildFMA(Dst: S32, Src0: Fma2, Src1: Fma1, Src2: Mul, Flags);
5077 auto Fma4 = B.buildFMA(Dst: S32, Src0: NegDivScale0, Src1: Fma3, Src2: NumeratorScaled, Flags);
5078
5079 if (!PreservesDenormals) {
5080 if (HasDynamicDenormals) {
5081 assert(SavedSPDenormMode);
5082 B.buildInstr(Opcode: AMDGPU::S_SETREG_B32)
5083 .addReg(RegNo: SavedSPDenormMode)
5084 .addImm(Val: SPDenormModeBitField);
5085 } else
5086 toggleSPDenormMode(Enable: false, B, ST, Mode);
5087 }
5088
5089 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S32})
5090 .addUse(RegNo: Fma4.getReg(Idx: 0))
5091 .addUse(RegNo: Fma1.getReg(Idx: 0))
5092 .addUse(RegNo: Fma3.getReg(Idx: 0))
5093 .addUse(RegNo: NumeratorScaled.getReg(Idx: 1))
5094 .setMIFlags(Flags);
5095
5096 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res)
5097 .addUse(RegNo: Fmas.getReg(Idx: 0))
5098 .addUse(RegNo: RHS)
5099 .addUse(RegNo: LHS)
5100 .setMIFlags(Flags);
5101
5102 MI.eraseFromParent();
5103 return true;
5104}
5105
5106bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5107 MachineRegisterInfo &MRI,
5108 MachineIRBuilder &B) const {
5109 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5110 return true;
5111
5112 Register Res = MI.getOperand(i: 0).getReg();
5113 Register LHS = MI.getOperand(i: 1).getReg();
5114 Register RHS = MI.getOperand(i: 2).getReg();
5115
5116 uint16_t Flags = MI.getFlags();
5117
5118 LLT S64 = LLT::scalar(SizeInBits: 64);
5119 LLT S1 = LLT::scalar(SizeInBits: 1);
5120
5121 auto One = B.buildFConstant(Res: S64, Val: 1.0);
5122
5123 auto DivScale0 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5124 .addUse(RegNo: LHS)
5125 .addUse(RegNo: RHS)
5126 .addImm(Val: 0)
5127 .setMIFlags(Flags);
5128
5129 auto NegDivScale0 = B.buildFNeg(Dst: S64, Src0: DivScale0.getReg(Idx: 0), Flags);
5130
5131 auto Rcp = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S64})
5132 .addUse(RegNo: DivScale0.getReg(Idx: 0))
5133 .setMIFlags(Flags);
5134
5135 auto Fma0 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Rcp, Src2: One, Flags);
5136 auto Fma1 = B.buildFMA(Dst: S64, Src0: Rcp, Src1: Fma0, Src2: Rcp, Flags);
5137 auto Fma2 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Fma1, Src2: One, Flags);
5138
5139 auto DivScale1 = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_scale, Res: {S64, S1})
5140 .addUse(RegNo: LHS)
5141 .addUse(RegNo: RHS)
5142 .addImm(Val: 1)
5143 .setMIFlags(Flags);
5144
5145 auto Fma3 = B.buildFMA(Dst: S64, Src0: Fma1, Src1: Fma2, Src2: Fma1, Flags);
5146 auto Mul = B.buildFMul(Dst: S64, Src0: DivScale1.getReg(Idx: 0), Src1: Fma3, Flags);
5147 auto Fma4 = B.buildFMA(Dst: S64, Src0: NegDivScale0, Src1: Mul, Src2: DivScale1.getReg(Idx: 0), Flags);
5148
5149 Register Scale;
5150 if (!ST.hasUsableDivScaleConditionOutput()) {
5151    // Work around a hardware bug on SI where the condition output from
5152    // div_scale is not usable.
5153
5154 LLT S32 = LLT::scalar(SizeInBits: 32);
5155
5156 auto NumUnmerge = B.buildUnmerge(Res: S32, Op: LHS);
5157 auto DenUnmerge = B.buildUnmerge(Res: S32, Op: RHS);
5158 auto Scale0Unmerge = B.buildUnmerge(Res: S32, Op: DivScale0);
5159 auto Scale1Unmerge = B.buildUnmerge(Res: S32, Op: DivScale1);
5160
5161 auto CmpNum = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: NumUnmerge.getReg(Idx: 1),
5162 Op1: Scale1Unmerge.getReg(Idx: 1));
5163 auto CmpDen = B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: S1, Op0: DenUnmerge.getReg(Idx: 1),
5164 Op1: Scale0Unmerge.getReg(Idx: 1));
5165 Scale = B.buildXor(Dst: S1, Src0: CmpNum, Src1: CmpDen).getReg(Idx: 0);
5166 } else {
5167 Scale = DivScale1.getReg(Idx: 1);
5168 }
5169
5170 auto Fmas = B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fmas, Res: {S64})
5171 .addUse(RegNo: Fma4.getReg(Idx: 0))
5172 .addUse(RegNo: Fma3.getReg(Idx: 0))
5173 .addUse(RegNo: Mul.getReg(Idx: 0))
5174 .addUse(RegNo: Scale)
5175 .setMIFlags(Flags);
5176
5177 B.buildIntrinsic(ID: Intrinsic::amdgcn_div_fixup, Res: ArrayRef(Res))
5178 .addUse(RegNo: Fmas.getReg(Idx: 0))
5179 .addUse(RegNo: RHS)
5180 .addUse(RegNo: LHS)
5181 .setMIFlags(Flags);
5182
5183 MI.eraseFromParent();
5184 return true;
5185}
5186
5187bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5188 MachineRegisterInfo &MRI,
5189 MachineIRBuilder &B) const {
5190 Register Res0 = MI.getOperand(i: 0).getReg();
5191 Register Res1 = MI.getOperand(i: 1).getReg();
5192 Register Val = MI.getOperand(i: 2).getReg();
5193 uint16_t Flags = MI.getFlags();
5194
5195 LLT Ty = MRI.getType(Reg: Res0);
5196 LLT InstrExpTy = Ty == LLT::scalar(SizeInBits: 16) ? LLT::scalar(SizeInBits: 16) : LLT::scalar(SizeInBits: 32);
5197
5198 auto Mant = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_mant, Res: {Ty})
5199 .addUse(RegNo: Val)
5200 .setMIFlags(Flags);
5201 auto Exp = B.buildIntrinsic(ID: Intrinsic::amdgcn_frexp_exp, Res: {InstrExpTy})
5202 .addUse(RegNo: Val)
5203 .setMIFlags(Flags);
5204
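  // On subtargets with the fract bug, frexp_mant/frexp_exp are not reliable
  // for +/-inf and NaN inputs, so for non-finite values fall back to returning
  // the input itself as the mantissa and 0 as the exponent.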
5205 if (ST.hasFractBug()) {
5206 auto Fabs = B.buildFAbs(Dst: Ty, Src0: Val);
5207 auto Inf = B.buildFConstant(Res: Ty, Val: APFloat::getInf(Sem: getFltSemanticForLLT(Ty)));
5208 auto IsFinite =
5209 B.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: LLT::scalar(SizeInBits: 1), Op0: Fabs, Op1: Inf, Flags);
5210 auto Zero = B.buildConstant(Res: InstrExpTy, Val: 0);
5211 Exp = B.buildSelect(Res: InstrExpTy, Tst: IsFinite, Op0: Exp, Op1: Zero);
5212 Mant = B.buildSelect(Res: Ty, Tst: IsFinite, Op0: Mant, Op1: Val);
5213 }
5214
5215 B.buildCopy(Res: Res0, Op: Mant);
5216 B.buildSExtOrTrunc(Res: Res1, Op: Exp);
5217
5218 MI.eraseFromParent();
5219 return true;
5220}
5221
5222bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5223 MachineRegisterInfo &MRI,
5224 MachineIRBuilder &B) const {
5225 Register Res = MI.getOperand(i: 0).getReg();
5226 Register LHS = MI.getOperand(i: 2).getReg();
5227 Register RHS = MI.getOperand(i: 3).getReg();
5228 uint16_t Flags = MI.getFlags();
5229
5230 LLT S32 = LLT::scalar(SizeInBits: 32);
5231 LLT S1 = LLT::scalar(SizeInBits: 1);
5232
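  // Sketch of the scaling below: when |rhs| is very large (> 2^96) the plain
  // rcp-and-multiply sequence can lose range/precision, so the denominator is
  // pre-scaled by 2^-32 and the same factor is reapplied to the final product:
  //   sel * (lhs * rcp(rhs * sel)) ~= lhs / rhs,  with sel in {1.0, 2^-32}.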
5233 auto Abs = B.buildFAbs(Dst: S32, Src0: RHS, Flags);
5234 const APFloat C0Val(1.0f);
5235
5236 auto C0 = B.buildFConstant(Res: S32, Val: 0x1p+96f);
5237 auto C1 = B.buildFConstant(Res: S32, Val: 0x1p-32f);
5238 auto C2 = B.buildFConstant(Res: S32, Val: 1.0f);
5239
5240 auto CmpRes = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: Abs, Op1: C0, Flags);
5241 auto Sel = B.buildSelect(Res: S32, Tst: CmpRes, Op0: C1, Op1: C2, Flags);
5242
5243 auto Mul0 = B.buildFMul(Dst: S32, Src0: RHS, Src1: Sel, Flags);
5244
5245 auto RCP = B.buildIntrinsic(ID: Intrinsic::amdgcn_rcp, Res: {S32})
5246 .addUse(RegNo: Mul0.getReg(Idx: 0))
5247 .setMIFlags(Flags);
5248
5249 auto Mul1 = B.buildFMul(Dst: S32, Src0: LHS, Src1: RCP, Flags);
5250
5251 B.buildFMul(Dst: Res, Src0: Sel, Src1: Mul1, Flags);
5252
5253 MI.eraseFromParent();
5254 return true;
5255}
5256
5257bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5258 MachineRegisterInfo &MRI,
5259 MachineIRBuilder &B) const {
5260  // Bypass the correct expansion that a standard promotion through G_FSQRT
5261  // would get. The f32 op is accurate enough for the f16 case.
5262 unsigned Flags = MI.getFlags();
5263 assert(!ST.has16BitInsts());
5264 const LLT F32 = LLT::scalar(SizeInBits: 32);
5265 auto Ext = B.buildFPExt(Res: F32, Op: MI.getOperand(i: 1), Flags);
5266  auto Sqrt = B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: {F32})
5267                  .addUse(RegNo: Ext.getReg(Idx: 0))
5268                  .setMIFlags(Flags);
5269  B.buildFPTrunc(Res: MI.getOperand(i: 0), Op: Sqrt, Flags);
5270 MI.eraseFromParent();
5271 return true;
5272}
5273
5274bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5275 MachineRegisterInfo &MRI,
5276 MachineIRBuilder &B) const {
5277 MachineFunction &MF = B.getMF();
5278 Register Dst = MI.getOperand(i: 0).getReg();
5279 Register X = MI.getOperand(i: 1).getReg();
5280 const unsigned Flags = MI.getFlags();
5281 const LLT S1 = LLT::scalar(SizeInBits: 1);
5282 const LLT F32 = LLT::scalar(SizeInBits: 32);
5283 const LLT I32 = LLT::scalar(SizeInBits: 32);
5284
5285 if (allowApproxFunc(MF, Flags)) {
5286 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({Dst}))
5287 .addUse(RegNo: X)
5288 .setMIFlags(Flags);
5289 MI.eraseFromParent();
5290 return true;
5291 }
5292
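  // Inputs below 2^-96 are scaled up by 2^32 before the square root so the
  // core sequence works on a value comfortably inside the normal range; since
  // sqrt(x * 2^32) == sqrt(x) * 2^16, the result is scaled back by 2^-16 below.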
5293 auto ScaleThreshold = B.buildFConstant(Res: F32, Val: 0x1.0p-96f);
5294 auto NeedScale = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: ScaleThreshold, Op1: X, Flags);
5295 auto ScaleUpFactor = B.buildFConstant(Res: F32, Val: 0x1.0p+32f);
5296 auto ScaledX = B.buildFMul(Dst: F32, Src0: X, Src1: ScaleUpFactor, Flags);
5297 auto SqrtX = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledX, Op1: X, Flags);
5298
5299 Register SqrtS = MRI.createGenericVirtualRegister(Ty: F32);
5300 if (needsDenormHandlingF32(MF, Src: X, Flags)) {
5301 B.buildIntrinsic(ID: Intrinsic::amdgcn_sqrt, Res: ArrayRef<Register>({SqrtS}))
5302 .addUse(RegNo: SqrtX.getReg(Idx: 0))
5303 .setMIFlags(Flags);
5304
5305 auto NegOne = B.buildConstant(Res: I32, Val: -1);
5306 auto SqrtSNextDown = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: NegOne);
5307
5308 auto NegSqrtSNextDown = B.buildFNeg(Dst: F32, Src0: SqrtSNextDown, Flags);
5309 auto SqrtVP = B.buildFMA(Dst: F32, Src0: NegSqrtSNextDown, Src1: SqrtS, Src2: SqrtX, Flags);
5310
5311 auto PosOne = B.buildConstant(Res: I32, Val: 1);
5312 auto SqrtSNextUp = B.buildAdd(Dst: I32, Src0: SqrtS, Src1: PosOne);
5313
5314 auto NegSqrtSNextUp = B.buildFNeg(Dst: F32, Src0: SqrtSNextUp, Flags);
5315 auto SqrtVS = B.buildFMA(Dst: F32, Src0: NegSqrtSNextUp, Src1: SqrtS, Src2: SqrtX, Flags);
5316
5317 auto Zero = B.buildFConstant(Res: F32, Val: 0.0f);
5318 auto SqrtVPLE0 = B.buildFCmp(Pred: CmpInst::FCMP_OLE, Res: S1, Op0: SqrtVP, Op1: Zero, Flags);
5319
5320 SqrtS =
5321 B.buildSelect(Res: F32, Tst: SqrtVPLE0, Op0: SqrtSNextDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5322
5323 auto SqrtVPVSGT0 = B.buildFCmp(Pred: CmpInst::FCMP_OGT, Res: S1, Op0: SqrtVS, Op1: Zero, Flags);
5324 SqrtS =
5325 B.buildSelect(Res: F32, Tst: SqrtVPVSGT0, Op0: SqrtSNextUp, Op1: SqrtS, Flags).getReg(Idx: 0);
5326 } else {
5327 auto SqrtR =
5328 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F32}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5329 B.buildFMul(Dst: SqrtS, Src0: SqrtX, Src1: SqrtR, Flags);
5330
5331 auto Half = B.buildFConstant(Res: F32, Val: 0.5f);
5332 auto SqrtH = B.buildFMul(Dst: F32, Src0: SqrtR, Src1: Half, Flags);
5333 auto NegSqrtH = B.buildFNeg(Dst: F32, Src0: SqrtH, Flags);
5334 auto SqrtE = B.buildFMA(Dst: F32, Src0: NegSqrtH, Src1: SqrtS, Src2: Half, Flags);
5335 SqrtH = B.buildFMA(Dst: F32, Src0: SqrtH, Src1: SqrtE, Src2: SqrtH, Flags);
5336 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtS, Src1: SqrtE, Src2: SqrtS, Flags).getReg(Idx: 0);
5337 auto NegSqrtS = B.buildFNeg(Dst: F32, Src0: SqrtS, Flags);
5338 auto SqrtD = B.buildFMA(Dst: F32, Src0: NegSqrtS, Src1: SqrtS, Src2: SqrtX, Flags);
5339 SqrtS = B.buildFMA(Dst: F32, Src0: SqrtD, Src1: SqrtH, Src2: SqrtS, Flags).getReg(Idx: 0);
5340 }
5341
5342 auto ScaleDownFactor = B.buildFConstant(Res: F32, Val: 0x1.0p-16f);
5343
5344 auto ScaledDown = B.buildFMul(Dst: F32, Src0: SqrtS, Src1: ScaleDownFactor, Flags);
5345
5346 SqrtS = B.buildSelect(Res: F32, Tst: NeedScale, Op0: ScaledDown, Op1: SqrtS, Flags).getReg(Idx: 0);
5347
5348 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5349 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtS, Flags);
5350
5351 MI.eraseFromParent();
5352 return true;
5353}
5354
5355bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5356 MachineRegisterInfo &MRI,
5357 MachineIRBuilder &B) const {
5358  // For the f64 type, the SQRT and RSQ instructions don't have the required
5359  // precision, so we apply Goldschmidt's algorithm to improve the result:
5360 //
5361 // y0 = rsq(x)
5362 // g0 = x * y0
5363 // h0 = 0.5 * y0
5364 //
5365 // r0 = 0.5 - h0 * g0
5366 // g1 = g0 * r0 + g0
5367 // h1 = h0 * r0 + h0
5368 //
5369 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5370 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5371 // h2 = h1 * r1 + h1
5372 //
5373 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5374 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5375 //
5376 // sqrt(x) = g3
5377
5378 const LLT S1 = LLT::scalar(SizeInBits: 1);
5379 const LLT S32 = LLT::scalar(SizeInBits: 32);
5380 const LLT F64 = LLT::scalar(SizeInBits: 64);
5381
5382 Register Dst = MI.getOperand(i: 0).getReg();
5383 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5384
5385 Register X = MI.getOperand(i: 1).getReg();
5386 unsigned Flags = MI.getFlags();
5387
5388 auto ScaleConstant = B.buildFConstant(Res: F64, Val: 0x1.0p-767);
5389
5390 auto ZeroInt = B.buildConstant(Res: S32, Val: 0);
5391 auto Scaling = B.buildFCmp(Pred: FCmpInst::FCMP_OLT, Res: S1, Op0: X, Op1: ScaleConstant);
5392
5393 // Scale up input if it is too small.
5394 auto ScaleUpFactor = B.buildConstant(Res: S32, Val: 256);
5395 auto ScaleUp = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleUpFactor, Op1: ZeroInt);
5396 auto SqrtX = B.buildFLdexp(Dst: F64, Src0: X, Src1: ScaleUp, Flags);
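  // sqrt(x * 2^256) == sqrt(x) * 2^128, so the matching correction at the end
  // is ldexp(result, -128) whenever the input was scaled.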
5397
5398 auto SqrtY =
5399 B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {F64}).addReg(RegNo: SqrtX.getReg(Idx: 0));
5400
5401 auto Half = B.buildFConstant(Res: F64, Val: 0.5);
5402 auto SqrtH0 = B.buildFMul(Dst: F64, Src0: SqrtY, Src1: Half);
5403 auto SqrtS0 = B.buildFMul(Dst: F64, Src0: SqrtX, Src1: SqrtY);
5404
5405 auto NegSqrtH0 = B.buildFNeg(Dst: F64, Src0: SqrtH0);
5406 auto SqrtR0 = B.buildFMA(Dst: F64, Src0: NegSqrtH0, Src1: SqrtS0, Src2: Half);
5407
5408 auto SqrtS1 = B.buildFMA(Dst: F64, Src0: SqrtS0, Src1: SqrtR0, Src2: SqrtS0);
5409 auto SqrtH1 = B.buildFMA(Dst: F64, Src0: SqrtH0, Src1: SqrtR0, Src2: SqrtH0);
5410
5411 auto NegSqrtS1 = B.buildFNeg(Dst: F64, Src0: SqrtS1);
5412 auto SqrtD0 = B.buildFMA(Dst: F64, Src0: NegSqrtS1, Src1: SqrtS1, Src2: SqrtX);
5413
5414 auto SqrtS2 = B.buildFMA(Dst: F64, Src0: SqrtD0, Src1: SqrtH1, Src2: SqrtS1);
5415
5416 auto NegSqrtS2 = B.buildFNeg(Dst: F64, Src0: SqrtS2);
5417 auto SqrtD1 = B.buildFMA(Dst: F64, Src0: NegSqrtS2, Src1: SqrtS2, Src2: SqrtX);
5418
5419 auto SqrtRet = B.buildFMA(Dst: F64, Src0: SqrtD1, Src1: SqrtH1, Src2: SqrtS2);
5420
5421 // Scale down the result.
5422 auto ScaleDownFactor = B.buildConstant(Res: S32, Val: -128);
5423 auto ScaleDown = B.buildSelect(Res: S32, Tst: Scaling, Op0: ScaleDownFactor, Op1: ZeroInt);
5424 SqrtRet = B.buildFLdexp(Dst: F64, Src0: SqrtRet, Src1: ScaleDown, Flags);
5425
5426 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5427 // with finite only or nsz because rsq(+/-0) = +/-inf
5428
5429 // TODO: Check for DAZ and expand to subnormals
5430 auto IsZeroOrInf = B.buildIsFPClass(Res: LLT::scalar(SizeInBits: 1), Src: SqrtX, Mask: fcZero | fcPosInf);
5431
5432 // If x is +INF, +0, or -0, use its original value
5433 B.buildSelect(Res: Dst, Tst: IsZeroOrInf, Op0: SqrtX, Op1: SqrtRet, Flags);
5434
5435 MI.eraseFromParent();
5436 return true;
5437}
5438
5439bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5440 MachineRegisterInfo &MRI,
5441 MachineIRBuilder &B) const {
5442 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5443 if (Ty == LLT::scalar(SizeInBits: 32))
5444 return legalizeFSQRTF32(MI, MRI, B);
5445 if (Ty == LLT::scalar(SizeInBits: 64))
5446 return legalizeFSQRTF64(MI, MRI, B);
5447 if (Ty == LLT::scalar(SizeInBits: 16))
5448 return legalizeFSQRTF16(MI, MRI, B);
5449 return false;
5450}
5451
5452// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5453// FIXME: Why do we handle this one but not other removed instructions?
5454//
5455// Reciprocal square root. The clamp prevents infinite results, clamping
5456// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5457// +-max_float.
5458bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5459 MachineRegisterInfo &MRI,
5460 MachineIRBuilder &B) const {
5461 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5462 return true;
5463
5464 Register Dst = MI.getOperand(i: 0).getReg();
5465 Register Src = MI.getOperand(i: 2).getReg();
5466 auto Flags = MI.getFlags();
5467
5468 LLT Ty = MRI.getType(Reg: Dst);
5469
5470 const fltSemantics *FltSemantics;
5471 if (Ty == LLT::scalar(SizeInBits: 32))
5472 FltSemantics = &APFloat::IEEEsingle();
5473 else if (Ty == LLT::scalar(SizeInBits: 64))
5474 FltSemantics = &APFloat::IEEEdouble();
5475 else
5476 return false;
5477
5478 auto Rsq = B.buildIntrinsic(ID: Intrinsic::amdgcn_rsq, Res: {Ty})
5479 .addUse(RegNo: Src)
5480 .setMIFlags(Flags);
5481
5482  // We don't need to worry about the snan handling difference, since the rsq
5483  // result is already quieted (or not); use whichever form selects directly.
5484 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5485 const bool UseIEEE = MFI->getMode().IEEE;
5486
5487 auto MaxFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics));
5488 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags) :
5489 B.buildFMinNum(Dst: Ty, Src0: Rsq, Src1: MaxFlt, Flags);
5490
5491 auto MinFlt = B.buildFConstant(Res: Ty, Val: APFloat::getLargest(Sem: *FltSemantics, Negative: true));
5492
5493 if (UseIEEE)
5494 B.buildFMaxNumIEEE(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5495 else
5496 B.buildFMaxNum(Dst, Src0: ClampMax, Src1: MinFlt, Flags);
5497 MI.eraseFromParent();
5498 return true;
5499}
5500
5501// TODO: Fix pointer type handling
5502bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5503 MachineInstr &MI,
5504 Intrinsic::ID IID) const {
5505
5506 MachineIRBuilder &B = Helper.MIRBuilder;
5507 MachineRegisterInfo &MRI = *B.getMRI();
5508
5509 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5510 IID == Intrinsic::amdgcn_permlanex16;
5511 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5512 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5513
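  // Strategy for types the lane intrinsics cannot take directly: sub-32-bit
  // values are any-extended to 32 bits and truncated afterwards, while larger
  // types are unmerged into 32-bit pieces (or 64-bit pieces for DPP ops the
  // DP ALU can handle), the lane op is applied per piece, and the pieces are
  // merged back (with a bitcast when the piece type had to be changed).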
5514 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5515 Register Src2, LLT VT) -> Register {
5516 auto LaneOp = B.buildIntrinsic(ID: IID, Res: {VT}).addUse(RegNo: Src0);
5517 switch (IID) {
5518 case Intrinsic::amdgcn_readfirstlane:
5519 case Intrinsic::amdgcn_permlane64:
5520 return LaneOp.getReg(Idx: 0);
5521 case Intrinsic::amdgcn_readlane:
5522 case Intrinsic::amdgcn_set_inactive:
5523 case Intrinsic::amdgcn_set_inactive_chain_arg:
5524 return LaneOp.addUse(RegNo: Src1).getReg(Idx: 0);
5525 case Intrinsic::amdgcn_writelane:
5526 return LaneOp.addUse(RegNo: Src1).addUse(RegNo: Src2).getReg(Idx: 0);
5527 case Intrinsic::amdgcn_permlane16:
5528 case Intrinsic::amdgcn_permlanex16: {
5529 Register Src3 = MI.getOperand(i: 5).getReg();
5530 int64_t Src4 = MI.getOperand(i: 6).getImm();
5531 int64_t Src5 = MI.getOperand(i: 7).getImm();
5532 return LaneOp.addUse(RegNo: Src1)
5533 .addUse(RegNo: Src2)
5534 .addUse(RegNo: Src3)
5535 .addImm(Val: Src4)
5536 .addImm(Val: Src5)
5537 .getReg(Idx: 0);
5538 }
5539 case Intrinsic::amdgcn_mov_dpp8:
5540 return LaneOp.addImm(Val: MI.getOperand(i: 3).getImm()).getReg(Idx: 0);
5541 case Intrinsic::amdgcn_update_dpp:
5542 return LaneOp.addUse(RegNo: Src1)
5543 .addImm(Val: MI.getOperand(i: 4).getImm())
5544 .addImm(Val: MI.getOperand(i: 5).getImm())
5545 .addImm(Val: MI.getOperand(i: 6).getImm())
5546 .addImm(Val: MI.getOperand(i: 7).getImm())
5547 .getReg(Idx: 0);
5548 default:
5549 llvm_unreachable("unhandled lane op");
5550 }
5551 };
5552
5553 Register DstReg = MI.getOperand(i: 0).getReg();
5554 Register Src0 = MI.getOperand(i: 2).getReg();
5555 Register Src1, Src2;
5556 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5557 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5558 Src1 = MI.getOperand(i: 3).getReg();
5559 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5560 Src2 = MI.getOperand(i: 4).getReg();
5561 }
5562 }
5563
5564 LLT Ty = MRI.getType(Reg: DstReg);
5565 unsigned Size = Ty.getSizeInBits();
5566
5567 unsigned SplitSize = 32;
5568 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5569 ST.hasDPALU_DPP() &&
5570 AMDGPU::isLegalDPALU_DPPControl(DC: MI.getOperand(i: 4).getImm()))
5571 SplitSize = 64;
5572
5573 if (Size == SplitSize) {
5574 // Already legal
5575 return true;
5576 }
5577
5578 if (Size < 32) {
5579 Src0 = B.buildAnyExt(Res: S32, Op: Src0).getReg(Idx: 0);
5580
5581 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5582 Src1 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src1).getReg(Idx: 0);
5583
5584 if (IID == Intrinsic::amdgcn_writelane)
5585 Src2 = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: Src2).getReg(Idx: 0);
5586
5587 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5588 B.buildTrunc(Res: DstReg, Op: LaneOpDst);
5589 MI.eraseFromParent();
5590 return true;
5591 }
5592
5593 if (Size % SplitSize != 0)
5594 return false;
5595
5596 LLT PartialResTy = LLT::scalar(SizeInBits: SplitSize);
5597 bool NeedsBitcast = false;
5598 if (Ty.isVector()) {
5599 LLT EltTy = Ty.getElementType();
5600 unsigned EltSize = EltTy.getSizeInBits();
5601 if (EltSize == SplitSize) {
5602 PartialResTy = EltTy;
5603 } else if (EltSize == 16 || EltSize == 32) {
5604 unsigned NElem = SplitSize / EltSize;
5605 PartialResTy = Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: NElem));
5606 } else {
5607 // Handle all other cases via S32/S64 pieces
5608 NeedsBitcast = true;
5609 }
5610 }
5611
5612 SmallVector<Register, 4> PartialRes;
5613 unsigned NumParts = Size / SplitSize;
5614 MachineInstrBuilder Src0Parts = B.buildUnmerge(Res: PartialResTy, Op: Src0);
5615 MachineInstrBuilder Src1Parts, Src2Parts;
5616
5617 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5618 Src1Parts = B.buildUnmerge(Res: PartialResTy, Op: Src1);
5619
5620 if (IID == Intrinsic::amdgcn_writelane)
5621 Src2Parts = B.buildUnmerge(Res: PartialResTy, Op: Src2);
5622
5623 for (unsigned i = 0; i < NumParts; ++i) {
5624 Src0 = Src0Parts.getReg(Idx: i);
5625
5626 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5627 Src1 = Src1Parts.getReg(Idx: i);
5628
5629 if (IID == Intrinsic::amdgcn_writelane)
5630 Src2 = Src2Parts.getReg(Idx: i);
5631
5632 PartialRes.push_back(Elt: createLaneOp(Src0, Src1, Src2, PartialResTy));
5633 }
5634
5635 if (NeedsBitcast)
5636 B.buildBitcast(Dst: DstReg, Src: B.buildMergeLikeInstr(
5637 Res: LLT::scalar(SizeInBits: Ty.getSizeInBits()), Ops: PartialRes));
5638 else
5639 B.buildMergeLikeInstr(Res: DstReg, Ops: PartialRes);
5640
5641 MI.eraseFromParent();
5642 return true;
5643}
5644
5645bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5646 MachineRegisterInfo &MRI,
5647 MachineIRBuilder &B) const {
5648 uint64_t Offset =
5649 ST.getTargetLowering()->getImplicitParameterOffset(
5650 MF: B.getMF(), Param: AMDGPUTargetLowering::FIRST_IMPLICIT);
5651 LLT DstTy = MRI.getType(Reg: DstReg);
5652 LLT IdxTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
5653
5654 Register KernargPtrReg = MRI.createGenericVirtualRegister(Ty: DstTy);
5655 if (!loadInputValue(DstReg: KernargPtrReg, B,
5656 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5657 return false;
5658
5659 // FIXME: This should be nuw
5660 B.buildPtrAdd(Res: DstReg, Op0: KernargPtrReg, Op1: B.buildConstant(Res: IdxTy, Val: Offset).getReg(Idx: 0));
5661 return true;
5662}
5663
5664/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
5665/// bits of the pointer and replace them with the stride argument (shifted into
5666/// place), then merge_values everything together. In the common case of a raw
5667/// buffer (the stride component is 0), we can just AND off those upper bits.
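/// For illustration only (a hypothetical example, not taken from the source):
/// given a pointer {Lo, Hi} and Stride = 16, the resulting descriptor is
/// {Lo, (Hi & 0xffff) | (16 << 16), NumRecords, Flags}; with a known-zero
/// stride the second dword is simply Hi & 0xffff.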
5668bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5669 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5670 Register Result = MI.getOperand(i: 0).getReg();
5671 Register Pointer = MI.getOperand(i: 2).getReg();
5672 Register Stride = MI.getOperand(i: 3).getReg();
5673 Register NumRecords = MI.getOperand(i: 4).getReg();
5674 Register Flags = MI.getOperand(i: 5).getReg();
5675
5676 LLT S32 = LLT::scalar(SizeInBits: 32);
5677
5678 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
5679 auto Unmerge = B.buildUnmerge(Res: S32, Op: Pointer);
5680 Register LowHalf = Unmerge.getReg(Idx: 0);
5681 Register HighHalf = Unmerge.getReg(Idx: 1);
5682
5683 auto AndMask = B.buildConstant(Res: S32, Val: 0x0000ffff);
5684 auto Masked = B.buildAnd(Dst: S32, Src0: HighHalf, Src1: AndMask);
5685
5686 MachineInstrBuilder NewHighHalf = Masked;
5687 std::optional<ValueAndVReg> StrideConst =
5688 getIConstantVRegValWithLookThrough(VReg: Stride, MRI);
5689 if (!StrideConst || !StrideConst->Value.isZero()) {
5690 MachineInstrBuilder ShiftedStride;
5691 if (StrideConst) {
5692 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5693 uint32_t ShiftedStrideVal = StrideVal << 16;
5694 ShiftedStride = B.buildConstant(Res: S32, Val: ShiftedStrideVal);
5695 } else {
5696 auto ExtStride = B.buildAnyExt(Res: S32, Op: Stride);
5697 auto ShiftConst = B.buildConstant(Res: S32, Val: 16);
5698 ShiftedStride = B.buildShl(Dst: S32, Src0: ExtStride, Src1: ShiftConst);
5699 }
5700 NewHighHalf = B.buildOr(Dst: S32, Src0: Masked, Src1: ShiftedStride);
5701 }
5702 Register NewHighHalfReg = NewHighHalf.getReg(Idx: 0);
5703 B.buildMergeValues(Res: Result, Ops: {LowHalf, NewHighHalfReg, NumRecords, Flags});
5704 MI.eraseFromParent();
5705 return true;
5706}
5707
5708bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5709 MachineRegisterInfo &MRI,
5710 MachineIRBuilder &B) const {
5711 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5712 if (!MFI->isEntryFunction()) {
5713 return legalizePreloadedArgIntrin(MI, MRI, B,
5714 ArgType: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5715 }
5716
5717 Register DstReg = MI.getOperand(i: 0).getReg();
5718 if (!getImplicitArgPtr(DstReg, MRI, B))
5719 return false;
5720
5721 MI.eraseFromParent();
5722 return true;
5723}
5724
5725bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5726 MachineRegisterInfo &MRI,
5727 MachineIRBuilder &B) const {
5728 Function &F = B.getMF().getFunction();
5729 std::optional<uint32_t> KnownSize =
5730 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5731 if (KnownSize.has_value())
5732 B.buildConstant(Res: DstReg, Val: *KnownSize);
5733 return false;
5734}
5735
5736bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5737 MachineRegisterInfo &MRI,
5738 MachineIRBuilder &B) const {
5739
5740 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5741 if (!MFI->isEntryFunction()) {
5742 return legalizePreloadedArgIntrin(MI, MRI, B,
5743 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5744 }
5745
5746 Register DstReg = MI.getOperand(i: 0).getReg();
5747 if (!getLDSKernelId(DstReg, MRI, B))
5748 return false;
5749
5750 MI.eraseFromParent();
5751 return true;
5752}
5753
5754bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5755 MachineRegisterInfo &MRI,
5756 MachineIRBuilder &B,
5757 unsigned AddrSpace) const {
5758 Register ApertureReg = getSegmentAperture(AS: AddrSpace, MRI, B);
5759 auto Unmerge = B.buildUnmerge(Res: LLT::scalar(SizeInBits: 32), Op: MI.getOperand(i: 2).getReg());
5760 Register Hi32 = Unmerge.getReg(Idx: 1);
5761
5762 B.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: MI.getOperand(i: 0), Op0: Hi32, Op1: ApertureReg);
5763 MI.eraseFromParent();
5764 return true;
5765}
5766
5767// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5768// offset (the offset that is included in bounds checking and swizzling, to be
5769// split between the instruction's voffset and immoffset fields) and soffset
5770// (the offset that is excluded from bounds checking and swizzling, to go in
5771// the instruction's soffset field). This function takes the first kind of
5772// offset and figures out how to split it between voffset and immoffset.
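// Illustrative example (assuming the subtarget's immoffset field holds values
// up to 4095): an incoming offset of `%base + 5000` is split into
// voffset = %base + 4096 and immoffset = 904, so the large power-of-two part
// stays in a form that can be CSEd with neighbouring accesses.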
5773std::pair<Register, unsigned>
5774AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5775 Register OrigOffset) const {
5776 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5777 Register BaseReg;
5778 unsigned ImmOffset;
5779 const LLT S32 = LLT::scalar(SizeInBits: 32);
5780 MachineRegisterInfo &MRI = *B.getMRI();
5781
5782 std::tie(args&: BaseReg, args&: ImmOffset) =
5783 AMDGPU::getBaseWithConstantOffset(MRI, Reg: OrigOffset);
5784
5785 // If BaseReg is a pointer, convert it to int.
5786 if (MRI.getType(Reg: BaseReg).isPointer())
5787 BaseReg = B.buildPtrToInt(Dst: MRI.getType(Reg: OrigOffset), Src: BaseReg).getReg(Idx: 0);
5788
5789 // If the immediate value is too big for the immoffset field, put only bits
5790 // that would normally fit in the immoffset field. The remaining value that
5791 // is copied/added for the voffset field is a large power of 2, and it
5792 // stands more chance of being CSEd with the copy/add for another similar
5793 // load/store.
5794  // However, do not do that rounding down if the constant offset is a
5795  // negative number, as it appears to be illegal to have a negative offset
5796  // in the vgpr, even if adding the immediate offset makes it positive.
5797 unsigned Overflow = ImmOffset & ~MaxImm;
5798 ImmOffset -= Overflow;
5799 if ((int32_t)Overflow < 0) {
5800 Overflow += ImmOffset;
5801 ImmOffset = 0;
5802 }
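  // Illustrative example (again assuming MaxImm == 4095): a constant offset of
  // -100 first yields Overflow = -4096 and ImmOffset = 3996; because Overflow
  // is negative, the whole -100 is folded back into the VGPR add and ImmOffset
  // is forced to 0.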
5803
5804 if (Overflow != 0) {
5805 if (!BaseReg) {
5806 BaseReg = B.buildConstant(Res: S32, Val: Overflow).getReg(Idx: 0);
5807 } else {
5808 auto OverflowVal = B.buildConstant(Res: S32, Val: Overflow);
5809 BaseReg = B.buildAdd(Dst: S32, Src0: BaseReg, Src1: OverflowVal).getReg(Idx: 0);
5810 }
5811 }
5812
5813 if (!BaseReg)
5814 BaseReg = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5815
5816 return std::pair(BaseReg, ImmOffset);
5817}
5818
5819/// Handle register layout differences for f16 images on some subtargets.
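/// Illustrative example (not from the source): on subtargets with unpacked
/// D16 VMEM, a <4 x s16> source becomes a <4 x s32> build_vector of
/// any-extended elements; packed subtargets keep <N x s16> data (only padding
/// <3 x s16> out to <4 x s16>), with extra repacking on subtargets that have
/// the image-store D16 bug.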
5820Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5821 MachineRegisterInfo &MRI,
5822 Register Reg,
5823 bool ImageStore) const {
5824 const LLT S16 = LLT::scalar(SizeInBits: 16);
5825 const LLT S32 = LLT::scalar(SizeInBits: 32);
5826 LLT StoreVT = MRI.getType(Reg);
5827 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5828
5829 if (ST.hasUnpackedD16VMem()) {
5830 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5831
5832 SmallVector<Register, 4> WideRegs;
5833 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5834 WideRegs.push_back(Elt: B.buildAnyExt(Res: S32, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
5835
5836 int NumElts = StoreVT.getNumElements();
5837
5838 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumElts, ScalarTy: S32), Ops: WideRegs)
5839 .getReg(Idx: 0);
5840 }
5841
5842 if (ImageStore && ST.hasImageStoreD16Bug()) {
5843 if (StoreVT.getNumElements() == 2) {
5844 SmallVector<Register, 4> PackedRegs;
5845 Reg = B.buildBitcast(Dst: S32, Src: Reg).getReg(Idx: 0);
5846 PackedRegs.push_back(Elt: Reg);
5847 PackedRegs.resize(N: 2, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5848 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Ops: PackedRegs)
5849 .getReg(Idx: 0);
5850 }
5851
5852 if (StoreVT.getNumElements() == 3) {
5853 SmallVector<Register, 4> PackedRegs;
5854 auto Unmerge = B.buildUnmerge(Res: S16, Op: Reg);
5855 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5856 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5857 PackedRegs.resize(N: 6, NV: B.buildUndef(Res: S16).getReg(Idx: 0));
5858 Reg = B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 6, ScalarTy: S16), Ops: PackedRegs).getReg(Idx: 0);
5859 return B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 3, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5860 }
5861
5862 if (StoreVT.getNumElements() == 4) {
5863 SmallVector<Register, 4> PackedRegs;
5864 Reg = B.buildBitcast(Dst: LLT::fixed_vector(NumElements: 2, ScalarTy: S32), Src: Reg).getReg(Idx: 0);
5865 auto Unmerge = B.buildUnmerge(Res: S32, Op: Reg);
5866 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5867 PackedRegs.push_back(Elt: Unmerge.getReg(Idx: I));
5868 PackedRegs.resize(N: 4, NV: B.buildUndef(Res: S32).getReg(Idx: 0));
5869 return B.buildBuildVector(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S32), Ops: PackedRegs)
5870 .getReg(Idx: 0);
5871 }
5872
5873 llvm_unreachable("invalid data type");
5874 }
5875
5876 if (StoreVT == LLT::fixed_vector(NumElements: 3, ScalarTy: S16)) {
5877 Reg = B.buildPadVectorWithUndefElements(Res: LLT::fixed_vector(NumElements: 4, ScalarTy: S16), Op0: Reg)
5878 .getReg(Idx: 0);
5879 }
5880 return Reg;
5881}
5882
5883Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
5884 Register VData, LLT MemTy,
5885 bool IsFormat) const {
5886 MachineRegisterInfo *MRI = B.getMRI();
5887 LLT Ty = MRI->getType(Reg: VData);
5888
5889 const LLT S16 = LLT::scalar(SizeInBits: 16);
5890
5891  // Fixup buffer resources themselves, which need to be cast to v4i32.
5892 if (hasBufferRsrcWorkaround(Ty))
5893 return castBufferRsrcToV4I32(Pointer: VData, B);
5894
5895 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5896 Ty = getBitcastRegisterType(Ty);
5897 VData = B.buildBitcast(Dst: Ty, Src: VData).getReg(Idx: 0);
5898 }
5899  // Fixup illegal register types for 8-bit and 16-bit stores.
5900 if (Ty == LLT::scalar(SizeInBits: 8) || Ty == S16) {
5901 Register AnyExt = B.buildAnyExt(Res: LLT::scalar(SizeInBits: 32), Op: VData).getReg(Idx: 0);
5902 return AnyExt;
5903 }
5904
5905 if (Ty.isVector()) {
5906 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5907 if (IsFormat)
5908 return handleD16VData(B, MRI&: *MRI, Reg: VData);
5909 }
5910 }
5911
5912 return VData;
5913}
5914
5915bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5916 LegalizerHelper &Helper,
5917 bool IsTyped,
5918 bool IsFormat) const {
5919 MachineIRBuilder &B = Helper.MIRBuilder;
5920 MachineRegisterInfo &MRI = *B.getMRI();
5921
5922 Register VData = MI.getOperand(i: 1).getReg();
5923 LLT Ty = MRI.getType(Reg: VData);
5924 LLT EltTy = Ty.getScalarType();
5925 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5926 const LLT S32 = LLT::scalar(SizeInBits: 32);
5927
5928 MachineMemOperand *MMO = *MI.memoperands_begin();
5929 const int MemSize = MMO->getSize().getValue();
5930 LLT MemTy = MMO->getMemoryType();
5931
5932 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5933
5934 castBufferRsrcArgToV4I32(MI, B, Idx: 2);
5935 Register RSrc = MI.getOperand(i: 2).getReg();
5936
5937 unsigned ImmOffset;
5938
5939 // The typed intrinsics add an immediate after the registers.
5940 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5941
5942 // The struct intrinsic variants add one additional operand over raw.
5943 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5944 Register VIndex;
5945 int OpOffset = 0;
5946 if (HasVIndex) {
5947 VIndex = MI.getOperand(i: 3).getReg();
5948 OpOffset = 1;
5949 } else {
5950 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
5951 }
5952
5953 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
5954 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
5955
5956 unsigned Format = 0;
5957 if (IsTyped) {
5958 Format = MI.getOperand(i: 5 + OpOffset).getImm();
5959 ++OpOffset;
5960 }
5961
5962 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
5963
5964 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
5965
5966 unsigned Opc;
5967 if (IsTyped) {
5968 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5969 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5970 } else if (IsFormat) {
5971 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5972 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5973 } else {
5974 switch (MemSize) {
5975 case 1:
5976 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5977 break;
5978 case 2:
5979 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5980 break;
5981 default:
5982 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5983 break;
5984 }
5985 }
5986
5987 auto MIB = B.buildInstr(Opcode: Opc)
5988 .addUse(RegNo: VData) // vdata
5989 .addUse(RegNo: RSrc) // rsrc
5990 .addUse(RegNo: VIndex) // vindex
5991 .addUse(RegNo: VOffset) // voffset
5992 .addUse(RegNo: SOffset) // soffset
5993 .addImm(Val: ImmOffset); // offset(imm)
5994
5995 if (IsTyped)
5996 MIB.addImm(Val: Format);
5997
5998 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5999 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6000 .addMemOperand(MMO);
6001
6002 MI.eraseFromParent();
6003 return true;
6004}
6005
6006static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6007 Register VIndex, Register VOffset, Register SOffset,
6008 unsigned ImmOffset, unsigned Format,
6009 unsigned AuxiliaryData, MachineMemOperand *MMO,
6010 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6011 auto MIB = B.buildInstr(Opcode: Opc)
6012 .addDef(RegNo: LoadDstReg) // vdata
6013 .addUse(RegNo: RSrc) // rsrc
6014 .addUse(RegNo: VIndex) // vindex
6015 .addUse(RegNo: VOffset) // voffset
6016 .addUse(RegNo: SOffset) // soffset
6017 .addImm(Val: ImmOffset); // offset(imm)
6018
6019 if (IsTyped)
6020 MIB.addImm(Val: Format);
6021
6022 MIB.addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6023 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6024 .addMemOperand(MMO);
6025}
6026
6027bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6028 LegalizerHelper &Helper,
6029 bool IsFormat,
6030 bool IsTyped) const {
6031 MachineIRBuilder &B = Helper.MIRBuilder;
6032 MachineRegisterInfo &MRI = *B.getMRI();
6033 GISelChangeObserver &Observer = Helper.Observer;
6034
6035 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6036 MachineMemOperand *MMO = *MI.memoperands_begin();
6037 const LLT MemTy = MMO->getMemoryType();
6038 const LLT S32 = LLT::scalar(SizeInBits: 32);
6039
6040 Register Dst = MI.getOperand(i: 0).getReg();
6041
6042 Register StatusDst;
6043 int OpOffset = 0;
6044 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6045 bool IsTFE = MI.getNumExplicitDefs() == 2;
6046 if (IsTFE) {
6047 StatusDst = MI.getOperand(i: 1).getReg();
6048 ++OpOffset;
6049 }
6050
6051 castBufferRsrcArgToV4I32(MI, B, Idx: 2 + OpOffset);
6052 Register RSrc = MI.getOperand(i: 2 + OpOffset).getReg();
6053
6054 // The typed intrinsics add an immediate after the registers.
6055 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6056
6057 // The struct intrinsic variants add one additional operand over raw.
6058 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6059 Register VIndex;
6060 if (HasVIndex) {
6061 VIndex = MI.getOperand(i: 3 + OpOffset).getReg();
6062 ++OpOffset;
6063 } else {
6064 VIndex = B.buildConstant(Res: S32, Val: 0).getReg(Idx: 0);
6065 }
6066
6067 Register VOffset = MI.getOperand(i: 3 + OpOffset).getReg();
6068 Register SOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6069
6070 unsigned Format = 0;
6071 if (IsTyped) {
6072 Format = MI.getOperand(i: 5 + OpOffset).getImm();
6073 ++OpOffset;
6074 }
6075
6076 unsigned AuxiliaryData = MI.getOperand(i: 5 + OpOffset).getImm();
6077 unsigned ImmOffset;
6078
6079 LLT Ty = MRI.getType(Reg: Dst);
6080  // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6081 // logic doesn't have to handle that case.
6082 if (hasBufferRsrcWorkaround(Ty)) {
6083 Observer.changingInstr(MI);
6084 Ty = castBufferRsrcFromV4I32(MI, B, MRI, Idx: 0);
6085 Observer.changedInstr(MI);
6086 Dst = MI.getOperand(i: 0).getReg();
6087 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6088 }
6089 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6090 Ty = getBitcastRegisterType(Ty);
6091 Observer.changingInstr(MI);
6092 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6093 Observer.changedInstr(MI);
6094 Dst = MI.getOperand(i: 0).getReg();
6095 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6096 }
6097
6098 LLT EltTy = Ty.getScalarType();
6099 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6100 const bool Unpacked = ST.hasUnpackedD16VMem();
6101
6102 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6103
6104 unsigned Opc;
6105
6106 // TODO: Support TFE for typed and narrow loads.
6107 if (IsTyped) {
6108 if (IsTFE)
6109 return false;
6110 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6111 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6112 } else if (IsFormat) {
6113 if (IsD16) {
6114 if (IsTFE)
6115 return false;
6116 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6117 } else {
6118 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6119 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6120 }
6121 } else {
6122 switch (MemTy.getSizeInBits()) {
6123 case 8:
6124 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6125 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6126 break;
6127 case 16:
6128 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6129 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6130 break;
6131 default:
6132 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6133 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6134 break;
6135 }
6136 }
6137
6138 if (IsTFE) {
6139 unsigned NumValueDWords = divideCeil(Numerator: Ty.getSizeInBits(), Denominator: 32);
6140 unsigned NumLoadDWords = NumValueDWords + 1;
6141 LLT LoadTy = LLT::fixed_vector(NumElements: NumLoadDWords, ScalarTy: S32);
6142 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: LoadTy);
6143 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6144 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6145 if (MemTy.getSizeInBits() < 32) {
6146 Register ExtDst = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6147 B.buildUnmerge(Res: {ExtDst, StatusDst}, Op: LoadDstReg);
6148 B.buildTrunc(Res: Dst, Op: ExtDst);
6149 } else if (NumValueDWords == 1) {
6150 B.buildUnmerge(Res: {Dst, StatusDst}, Op: LoadDstReg);
6151 } else {
6152 SmallVector<Register, 5> LoadElts;
6153 for (unsigned I = 0; I != NumValueDWords; ++I)
6154 LoadElts.push_back(Elt: B.getMRI()->createGenericVirtualRegister(Ty: S32));
6155 LoadElts.push_back(Elt: StatusDst);
6156 B.buildUnmerge(Res: LoadElts, Op: LoadDstReg);
6157 LoadElts.truncate(N: NumValueDWords);
6158 B.buildMergeLikeInstr(Res: Dst, Ops: LoadElts);
6159 }
6160 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6161 (IsD16 && !Ty.isVector())) {
6162 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: S32);
6163 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6164 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6165 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6166 B.buildTrunc(Res: Dst, Op: LoadDstReg);
6167 } else if (Unpacked && IsD16 && Ty.isVector()) {
6168 LLT UnpackedTy = Ty.changeElementSize(NewEltSize: 32);
6169 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(Ty: UnpackedTy);
6170 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6171 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6172 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6173 // FIXME: G_TRUNC should work, but legalization currently fails
6174 auto Unmerge = B.buildUnmerge(Res: S32, Op: LoadDstReg);
6175 SmallVector<Register, 4> Repack;
6176 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6177 Repack.push_back(Elt: B.buildTrunc(Res: EltTy, Op: Unmerge.getReg(Idx: I)).getReg(Idx: 0));
6178 B.buildMergeLikeInstr(Res: Dst, Ops: Repack);
6179 } else {
6180 buildBufferLoad(Opc, LoadDstReg: Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6181 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6182 }
6183
6184 MI.eraseFromParent();
6185 return true;
6186}
6187
6188static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6189 switch (IntrID) {
6190 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6192 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6193 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6194 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6195 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6196 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6197 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6198 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6199 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6200 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6201 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6202 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6203 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6204 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6205 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6207 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6209 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6210 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6212 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6214 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6215 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6217 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6218 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6220 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6221 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6222 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6223 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6224 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6225 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6227 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6229 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6230 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6231 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6232 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6233 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6235 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6236 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6237 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6239 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6240 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6241 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6242 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6243 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6244 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6245 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6246 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6247 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6248 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6249 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6250 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6252 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6254 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6255 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6256 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6257 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6259 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6260 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6261 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6262 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6264 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6265 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6266 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6267 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6268 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6269 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6270 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6271 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6272 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6273 default:
6274 llvm_unreachable("unhandled atomic opcode");
6275 }
6276}
6277
6278bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6279 MachineIRBuilder &B,
6280 Intrinsic::ID IID) const {
6281 const bool IsCmpSwap =
6282 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6283 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6284 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6285 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6286
6287 Register Dst = MI.getOperand(i: 0).getReg();
6288 // Since we don't have 128-bit atomics, we don't need to handle the case of
6289  // p8 arguments to the atomic itself.
6290 Register VData = MI.getOperand(i: 2).getReg();
6291
6292 Register CmpVal;
6293 int OpOffset = 0;
6294
6295 if (IsCmpSwap) {
6296 CmpVal = MI.getOperand(i: 3).getReg();
6297 ++OpOffset;
6298 }
6299
6300 castBufferRsrcArgToV4I32(MI, B, Idx: 3 + OpOffset);
6301 Register RSrc = MI.getOperand(i: 3 + OpOffset).getReg();
6302 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6303
6304 // The struct intrinsic variants add one additional operand over raw.
6305 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6306 Register VIndex;
6307 if (HasVIndex) {
6308 VIndex = MI.getOperand(i: 4 + OpOffset).getReg();
6309 ++OpOffset;
6310 } else {
6311 VIndex = B.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: 0).getReg(Idx: 0);
6312 }
6313
6314 Register VOffset = MI.getOperand(i: 4 + OpOffset).getReg();
6315 Register SOffset = MI.getOperand(i: 5 + OpOffset).getReg();
6316 unsigned AuxiliaryData = MI.getOperand(i: 6 + OpOffset).getImm();
6317
6318 MachineMemOperand *MMO = *MI.memoperands_begin();
6319
6320 unsigned ImmOffset;
6321 std::tie(args&: VOffset, args&: ImmOffset) = splitBufferOffsets(B, OrigOffset: VOffset);
6322
6323 auto MIB = B.buildInstr(Opcode: getBufferAtomicPseudo(IntrID: IID))
6324 .addDef(RegNo: Dst)
6325 .addUse(RegNo: VData); // vdata
6326
6327 if (IsCmpSwap)
6328 MIB.addReg(RegNo: CmpVal);
6329
6330 MIB.addUse(RegNo: RSrc) // rsrc
6331 .addUse(RegNo: VIndex) // vindex
6332 .addUse(RegNo: VOffset) // voffset
6333 .addUse(RegNo: SOffset) // soffset
6334 .addImm(Val: ImmOffset) // offset(imm)
6335 .addImm(Val: AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6336 .addImm(Val: HasVIndex ? -1 : 0) // idxen(imm)
6337 .addMemOperand(MMO);
6338
6339 MI.eraseFromParent();
6340 return true;
6341}
6342
6343/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6344/// vector with s16 typed elements.
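/// Illustrative example (not from the source): with A16 enabled, three s16
/// coordinates {x, y, z} are packed into two v2s16 dwords, {x, y} and
/// {z, undef}; an odd trailing gradient is padded with undef in the same way.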
6345static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6346 SmallVectorImpl<Register> &PackedAddrs,
6347 unsigned ArgOffset,
6348 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6349 bool IsA16, bool IsG16) {
6350 const LLT S16 = LLT::scalar(SizeInBits: 16);
6351 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6352 auto EndIdx = Intr->VAddrEnd;
6353
6354 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6355 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6356 if (!SrcOp.isReg())
6357 continue; // _L to _LZ may have eliminated this.
6358
6359 Register AddrReg = SrcOp.getReg();
6360
6361 if ((I < Intr->GradientStart) ||
6362 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6363 (I >= Intr->CoordStart && !IsA16)) {
6364 if ((I < Intr->GradientStart) && IsA16 &&
6365 (B.getMRI()->getType(Reg: AddrReg) == S16)) {
6366 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6367 // Special handling of bias when A16 is on. Bias is of type half but
6368        // occupies a full 32-bit slot.
6369 PackedAddrs.push_back(
6370 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6371 .getReg(Idx: 0));
6372 } else {
6373 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6374 "Bias needs to be converted to 16 bit in A16 mode");
6375 // Handle any gradient or coordinate operands that should not be packed
6376 AddrReg = B.buildBitcast(Dst: V2S16, Src: AddrReg).getReg(Idx: 0);
6377 PackedAddrs.push_back(Elt: AddrReg);
6378 }
6379 } else {
6380 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6381 // derivatives dx/dh and dx/dv are packed with undef.
6382 if (((I + 1) >= EndIdx) ||
6383 ((Intr->NumGradients / 2) % 2 == 1 &&
6384 (I == static_cast<unsigned>(Intr->GradientStart +
6385 (Intr->NumGradients / 2) - 1) ||
6386 I == static_cast<unsigned>(Intr->GradientStart +
6387 Intr->NumGradients - 1))) ||
6388 // Check for _L to _LZ optimization
6389 !MI.getOperand(i: ArgOffset + I + 1).isReg()) {
6390 PackedAddrs.push_back(
6391 Elt: B.buildBuildVector(Res: V2S16, Ops: {AddrReg, B.buildUndef(Res: S16).getReg(Idx: 0)})
6392 .getReg(Idx: 0));
6393 } else {
6394 PackedAddrs.push_back(
6395 Elt: B.buildBuildVector(
6396 Res: V2S16, Ops: {AddrReg, MI.getOperand(i: ArgOffset + I + 1).getReg()})
6397 .getReg(Idx: 0));
6398 ++I;
6399 }
6400 }
6401 }
6402}
6403
6404/// Convert from separate vaddr components to a single vector address register,
6405/// and replace the remaining operands with $noreg.
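/// Illustrative example (not from the source): four s32 vaddr operands are
/// replaced by one <4 x s32> build_vector in the first vaddr slot, and the
/// remaining three operands are set to $noreg.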
6406static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6407 int DimIdx, int NumVAddrs) {
6408 const LLT S32 = LLT::scalar(SizeInBits: 32);
6409 (void)S32;
6410 SmallVector<Register, 8> AddrRegs;
6411 for (int I = 0; I != NumVAddrs; ++I) {
6412 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6413 if (SrcOp.isReg()) {
6414 AddrRegs.push_back(Elt: SrcOp.getReg());
6415 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6416 }
6417 }
6418
6419 int NumAddrRegs = AddrRegs.size();
6420 if (NumAddrRegs != 1) {
6421 auto VAddr =
6422 B.buildBuildVector(Res: LLT::fixed_vector(NumElements: NumAddrRegs, ScalarSizeInBits: 32), Ops: AddrRegs);
6423 MI.getOperand(i: DimIdx).setReg(VAddr.getReg(Idx: 0));
6424 }
6425
6426 for (int I = 1; I != NumVAddrs; ++I) {
6427 MachineOperand &SrcOp = MI.getOperand(i: DimIdx + I);
6428 if (SrcOp.isReg())
6429 MI.getOperand(i: DimIdx + I).setReg(AMDGPU::NoRegister);
6430 }
6431}
6432
6433/// Rewrite image intrinsics to use register layouts expected by the subtarget.
6434///
6435/// Depending on the subtarget, load/store with 16-bit element data need to be
6436/// rewritten to use the low half of 32-bit registers, or directly use a packed
6437/// layout. 16-bit addresses should also sometimes be packed into 32-bit
6438/// registers.
6439///
6440/// We don't want to directly select image instructions just yet, but also want
6441/// to expose all register repacking to the legalizer/combiners. We also don't
6442/// want a selected instruction entering RegBankSelect. In order to avoid
6443/// defining a multitude of intermediate image instructions, directly hack on
6444/// the intrinsic's arguments. In cases like a16 addresses, this requires
6445/// padding now unnecessary arguments with $noreg.
6446bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6447 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6448 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6449
6450 const MachineFunction &MF = *MI.getMF();
6451 const unsigned NumDefs = MI.getNumExplicitDefs();
6452 const unsigned ArgOffset = NumDefs + 1;
6453 bool IsTFE = NumDefs == 2;
6454 // We are only processing the operands of d16 image operations on subtargets
6455 // that use the unpacked register layout, or need to repack the TFE result.
6456
6457 // TODO: Do we need to guard against already legalized intrinsics?
6458 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6459 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
6460
6461 MachineRegisterInfo *MRI = B.getMRI();
6462 const LLT S32 = LLT::scalar(SizeInBits: 32);
6463 const LLT S16 = LLT::scalar(SizeInBits: 16);
6464 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
6465
6466 unsigned DMask = 0;
6467 Register VData;
6468 LLT Ty;
6469
6470 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6471 VData = MI.getOperand(i: NumDefs == 0 ? 1 : 0).getReg();
6472 Ty = MRI->getType(Reg: VData);
6473 }
6474
6475 const bool IsAtomicPacked16Bit =
6476 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6477 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6478
6479 // Check for 16 bit addresses and pack if true.
6480 LLT GradTy =
6481 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->GradientStart).getReg());
6482 LLT AddrTy =
6483 MRI->getType(Reg: MI.getOperand(i: ArgOffset + Intr->CoordStart).getReg());
6484 const bool IsG16 =
6485 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6486 const bool IsA16 = AddrTy == S16;
6487 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6488
6489 int DMaskLanes = 0;
6490 if (!BaseOpcode->Atomic) {
6491 DMask = MI.getOperand(i: ArgOffset + Intr->DMaskIndex).getImm();
6492 if (BaseOpcode->Gather4) {
6493 DMaskLanes = 4;
6494 } else if (DMask != 0) {
6495 DMaskLanes = llvm::popcount(Value: DMask);
6496 } else if (!IsTFE && !BaseOpcode->Store) {
6497 // If dmask is 0, this is a no-op load. This can be eliminated.
6498 B.buildUndef(Res: MI.getOperand(i: 0));
6499 MI.eraseFromParent();
6500 return true;
6501 }
6502 }
6503
6504 Observer.changingInstr(MI);
6505 auto ChangedInstr = make_scope_exit(F: [&] { Observer.changedInstr(MI); });
6506
6507 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6508 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6509 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6510 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6511 unsigned NewOpcode = LoadOpcode;
6512 if (BaseOpcode->Store)
6513 NewOpcode = StoreOpcode;
6514 else if (BaseOpcode->NoReturn)
6515 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6516
6517 // Track that we legalized this
6518 MI.setDesc(B.getTII().get(Opcode: NewOpcode));
6519
6520  // Expecting to get an error flag since TFC is on and dmask is 0. Force
6521  // dmask to be at least 1, otherwise the instruction will fail.
6522 if (IsTFE && DMask == 0) {
6523 DMask = 0x1;
6524 DMaskLanes = 1;
6525 MI.getOperand(i: ArgOffset + Intr->DMaskIndex).setImm(DMask);
6526 }
6527
6528 if (BaseOpcode->Atomic) {
6529 Register VData0 = MI.getOperand(i: 2).getReg();
6530 LLT Ty = MRI->getType(Reg: VData0);
6531
6532 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6533 if (Ty.isVector() && !IsAtomicPacked16Bit)
6534 return false;
6535
6536 if (BaseOpcode->AtomicX2) {
6537 Register VData1 = MI.getOperand(i: 3).getReg();
6538 // The two values are packed in one register.
6539 LLT PackedTy = LLT::fixed_vector(NumElements: 2, ScalarTy: Ty);
6540 auto Concat = B.buildBuildVector(Res: PackedTy, Ops: {VData0, VData1});
6541 MI.getOperand(i: 2).setReg(Concat.getReg(Idx: 0));
6542 MI.getOperand(i: 3).setReg(AMDGPU::NoRegister);
6543 }
6544 }
6545
6546 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6547
6548 // Rewrite the addressing register layout before doing anything else.
6549 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6550 // 16 bit gradients are supported, but are tied to the A16 control
6551 // so both gradients and addresses must be 16 bit
6552 return false;
6553 }
6554
6555 if (IsA16 && !ST.hasA16()) {
6556 // A16 not supported
6557 return false;
6558 }
6559
6560 const unsigned NSAMaxSize = ST.getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
6561 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6562
6563 if (IsA16 || IsG16) {
6564 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6565 // instructions expect VGPR_32
6566 SmallVector<Register, 4> PackedRegs;
6567
6568 packImage16bitOpsToDwords(B, MI, PackedAddrs&: PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6569
6570 // See also below in the non-a16 branch
6571 const bool UseNSA = ST.hasNSAEncoding() &&
6572 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6573 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6574 const bool UsePartialNSA =
6575 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6576
6577 if (UsePartialNSA) {
6578 // Pack registers that would go over NSAMaxSize into last VAddr register
6579 LLT PackedAddrTy =
6580 LLT::fixed_vector(NumElements: 2 * (PackedRegs.size() - NSAMaxSize + 1), ScalarSizeInBits: 16);
6581 auto Concat = B.buildConcatVectors(
6582 Res: PackedAddrTy, Ops: ArrayRef(PackedRegs).slice(N: NSAMaxSize - 1));
6583 PackedRegs[NSAMaxSize - 1] = Concat.getReg(Idx: 0);
6584 PackedRegs.resize(N: NSAMaxSize);
6585 } else if (!UseNSA && PackedRegs.size() > 1) {
6586 LLT PackedAddrTy = LLT::fixed_vector(NumElements: 2 * PackedRegs.size(), ScalarSizeInBits: 16);
6587 auto Concat = B.buildConcatVectors(Res: PackedAddrTy, Ops: PackedRegs);
6588 PackedRegs[0] = Concat.getReg(Idx: 0);
6589 PackedRegs.resize(N: 1);
6590 }
6591
6592 const unsigned NumPacked = PackedRegs.size();
6593 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6594 MachineOperand &SrcOp = MI.getOperand(i: ArgOffset + I);
6595 if (!SrcOp.isReg()) {
6596 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6597 continue;
6598 }
6599
6600 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6601
6602 if (I - Intr->VAddrStart < NumPacked)
6603 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6604 else
6605 SrcOp.setReg(AMDGPU::NoRegister);
6606 }
6607 } else {
6608 // If the register allocator cannot place the address registers contiguously
6609 // without introducing moves, then using the non-sequential address encoding
6610 // is always preferable, since it saves VALU instructions and is usually a
6611 // wash in terms of code size or even better.
6612 //
6613 // However, we currently have no way of hinting to the register allocator
6614 // that MIMG addresses should be placed contiguously when it is possible to
6615 // do so, so force non-NSA for the common 2-address case as a heuristic.
6616 //
6617 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6618 // allocation when possible.
6619 //
6620 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6621 // set of the remaining addresses.
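    // Illustrative example (hypothetical numbers): with NSAMaxSize == 5 and 7
    // vaddr operands, partial NSA leaves the first four registers separate and
    // packs the remaining three into a single <3 x s32> operand.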
6622 const bool UseNSA = ST.hasNSAEncoding() &&
6623 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6624 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6625 const bool UsePartialNSA =
6626 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6627
6628 if (UsePartialNSA) {
6629 convertImageAddrToPacked(B, MI,
6630 DimIdx: ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6631 NumVAddrs: Intr->NumVAddrs - NSAMaxSize + 1);
6632 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6633 convertImageAddrToPacked(B, MI, DimIdx: ArgOffset + Intr->VAddrStart,
6634 NumVAddrs: Intr->NumVAddrs);
6635 }
6636 }
6637
6638 int Flags = 0;
6639 if (IsA16)
6640 Flags |= 1;
6641 if (IsG16)
6642 Flags |= 2;
6643 MI.addOperand(Op: MachineOperand::CreateImm(Val: Flags));
6644
6645 if (BaseOpcode->NoReturn) { // No TFE for stores?
6646 // TODO: Handle dmask trim
6647 if (!Ty.isVector() || !IsD16)
6648 return true;
6649
6650 Register RepackedReg = handleD16VData(B, MRI&: *MRI, Reg: VData, ImageStore: true);
6651 if (RepackedReg != VData) {
6652 MI.getOperand(i: 1).setReg(RepackedReg);
6653 }
6654
6655 return true;
6656 }
6657
6658 Register DstReg = MI.getOperand(i: 0).getReg();
6659 const LLT EltTy = Ty.getScalarType();
6660 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6661
6662 // Confirm that the return type is large enough for the dmask specified
6663 if (NumElts < DMaskLanes)
6664 return false;
6665
6666 if (NumElts > 4 || DMaskLanes > 4)
6667 return false;
6668
6669  // Image atomic instructions use DMask to specify how many bits the
6670  // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6671 // DMaskLanes for image atomic has default value '0'.
6672 // We must be sure that atomic variants (especially packed) will not be
6673 // truncated from v2s16 or v4s16 to s16 type.
6674 //
6675  // ChangeElementCount will be needed for image loads, where Ty is always scalar.
6676 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6677 const LLT AdjustedTy =
6678 DMaskLanes == 0
6679 ? Ty
6680 : Ty.changeElementCount(EC: ElementCount::getFixed(MinVal: AdjustedNumElts));
6681
6682 // The raw dword aligned data component of the load. The only legal cases
6683 // where this matters should be when using the packed D16 format, for
6684  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6685 LLT RoundedTy;
6686
6687 // S32 vector to cover all data, plus TFE result element.
6688 LLT TFETy;
6689
6690 // Register type to use for each loaded component. Will be S32 or V2S16.
6691 LLT RegTy;
6692
6693 if (IsD16 && ST.hasUnpackedD16VMem()) {
6694 RoundedTy =
6695 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: AdjustedNumElts), ScalarSize: 32);
6696 TFETy = LLT::fixed_vector(NumElements: AdjustedNumElts + 1, ScalarSizeInBits: 32);
6697 RegTy = S32;
6698 } else {
6699 unsigned EltSize = EltTy.getSizeInBits();
6700 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6701 unsigned RoundedSize = 32 * RoundedElts;
6702 RoundedTy = LLT::scalarOrVector(
6703 EC: ElementCount::getFixed(MinVal: RoundedSize / EltSize), ScalarSize: EltSize);
6704 TFETy = LLT::fixed_vector(NumElements: RoundedSize / 32 + 1, ScalarTy: S32);
6705 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6706 }
6707
6708 // The return type does not need adjustment.
6709 // TODO: Should we change s16 case to s32 or <2 x s16>?
6710 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6711 return true;
6712
6713 Register Dst1Reg;
6714
6715 // Insert after the instruction.
6716 B.setInsertPt(MBB&: *MI.getParent(), II: ++MI.getIterator());
6717
6718 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6719 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6720 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6721 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6722
6723 Register NewResultReg = MRI->createGenericVirtualRegister(Ty: LoadResultTy);
6724
6725 MI.getOperand(i: 0).setReg(NewResultReg);
6726
6727 // In the IR, TFE is supposed to be used with a 2 element struct return
6728 // type. The instruction really returns these two values in one contiguous
6729 // register, with one additional dword beyond the loaded data. Rewrite the
6730 // return type to use a single register result.
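  // Illustrative example (not from the source): a TFE load of a single s32
  // produces one <2 x s32> register here, which is then unmerged into the data
  // dword and the status dword.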
6731
6732 if (IsTFE) {
6733 Dst1Reg = MI.getOperand(i: 1).getReg();
6734 if (MRI->getType(Reg: Dst1Reg) != S32)
6735 return false;
6736
6737 // TODO: Make sure the TFE operand bit is set.
6738 MI.removeOperand(OpNo: 1);
6739
6740 // Handle the easy case that requires no repack instructions.
6741 if (Ty == S32) {
6742 B.buildUnmerge(Res: {DstReg, Dst1Reg}, Op: NewResultReg);
6743 return true;
6744 }
6745 }
6746
6747 // Now figure out how to copy the new result register back into the old
6748 // result.
6749 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6750
6751 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6752
6753 if (ResultNumRegs == 1) {
6754 assert(!IsTFE);
6755 ResultRegs[0] = NewResultReg;
6756 } else {
6757 // We have to repack into a new vector of some kind.
6758 for (int I = 0; I != NumDataRegs; ++I)
6759 ResultRegs[I] = MRI->createGenericVirtualRegister(Ty: RegTy);
6760 B.buildUnmerge(Res: ResultRegs, Op: NewResultReg);
6761
6762 // Drop the final TFE element to get the data part. The TFE result is
6763 // directly written to the right place already.
6764 if (IsTFE)
6765 ResultRegs.resize(N: NumDataRegs);
6766 }
6767
6768 // For an s16 scalar result, we form an s32 result with a truncate regardless
6769 // of packed vs. unpacked.
6770 if (IsD16 && !Ty.isVector()) {
6771 B.buildTrunc(Res: DstReg, Op: ResultRegs[0]);
6772 return true;
6773 }
6774
6775 // Avoid a build/concat_vector of 1 entry.
6776 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6777 B.buildBitcast(Dst: DstReg, Src: ResultRegs[0]);
6778 return true;
6779 }
6780
6781 assert(Ty.isVector());
6782
6783 if (IsD16) {
6784 // For packed D16 results with TFE enabled, all the data components are
6785 // S32. Cast back to the expected type.
6786 //
6787    // TODO: We don't really need to load s32 elements. We would only need one
6788 // cast for the TFE result if a multiple of v2s16 was used.
6789 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6790 for (Register &Reg : ResultRegs)
6791 Reg = B.buildBitcast(Dst: V2S16, Src: Reg).getReg(Idx: 0);
6792 } else if (ST.hasUnpackedD16VMem()) {
6793 for (Register &Reg : ResultRegs)
6794 Reg = B.buildTrunc(Res: S16, Op: Reg).getReg(Idx: 0);
6795 }
6796 }
6797
6798 auto padWithUndef = [&](LLT Ty, int NumElts) {
6799 if (NumElts == 0)
6800 return;
6801 Register Undef = B.buildUndef(Res: Ty).getReg(Idx: 0);
6802 for (int I = 0; I != NumElts; ++I)
6803 ResultRegs.push_back(Elt: Undef);
6804 };
6805
6806 // Pad out any elements eliminated due to the dmask.
6807 LLT ResTy = MRI->getType(Reg: ResultRegs[0]);
6808 if (!ResTy.isVector()) {
6809 padWithUndef(ResTy, NumElts - ResultRegs.size());
6810 B.buildBuildVector(Res: DstReg, Ops: ResultRegs);
6811 return true;
6812 }
6813
6814 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6815 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6816
6817 // Deal with the one annoying legal case.
6818 const LLT V3S16 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 16);
6819 if (Ty == V3S16) {
6820 if (IsTFE) {
6821 if (ResultRegs.size() == 1) {
6822 NewResultReg = ResultRegs[0];
6823 } else if (ResultRegs.size() == 2) {
6824 LLT V4S16 = LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16);
6825 NewResultReg = B.buildConcatVectors(Res: V4S16, Ops: ResultRegs).getReg(Idx: 0);
6826 } else {
6827 return false;
6828 }
6829 }
6830
6831 if (MRI->getType(Reg: DstReg).getNumElements() <
6832 MRI->getType(Reg: NewResultReg).getNumElements()) {
6833 B.buildDeleteTrailingVectorElements(Res: DstReg, Op0: NewResultReg);
6834 } else {
6835 B.buildPadVectorWithUndefElements(Res: DstReg, Op0: NewResultReg);
6836 }
6837 return true;
6838 }
6839
6840 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6841 B.buildConcatVectors(Res: DstReg, Ops: ResultRegs);
6842 return true;
6843}
6844
6845bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6846 MachineInstr &MI) const {
6847 MachineIRBuilder &B = Helper.MIRBuilder;
6848 GISelChangeObserver &Observer = Helper.Observer;
6849
6850 Register OrigDst = MI.getOperand(i: 0).getReg();
6851 Register Dst;
6852 LLT Ty = B.getMRI()->getType(Reg: OrigDst);
6853 unsigned Size = Ty.getSizeInBits();
6854 MachineFunction &MF = B.getMF();
6855 unsigned Opc = 0;
6856 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6857 assert(Size == 8 || Size == 16);
6858 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6859 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6860    // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6861 // destination register.
6862 Dst = B.getMRI()->createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 32));
6863 } else {
6864 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6865 Dst = OrigDst;
6866 }
6867
6868 Observer.changingInstr(MI);
6869
6870 // Handle needing to s.buffer.load() a p8 value.
6871 if (hasBufferRsrcWorkaround(Ty)) {
6872 Ty = castBufferRsrcFromV4I32(MI, B, MRI&: *B.getMRI(), Idx: 0);
6873 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6874 }
6875 if (shouldBitcastLoadStoreType(ST, Ty, MemTy: LLT::scalar(SizeInBits: Size))) {
6876 Ty = getBitcastRegisterType(Ty);
6877 Helper.bitcastDst(MI, CastTy: Ty, OpIdx: 0);
6878 B.setInsertPt(MBB&: B.getMBB(), II: MI);
6879 }
6880
6881 // FIXME: We don't really need this intermediate instruction. The intrinsic
6882 // should be fixed to have a memory operand. Since it's readnone, we're not
6883 // allowed to add one.
6884 MI.setDesc(B.getTII().get(Opcode: Opc));
6885 MI.removeOperand(OpNo: 1); // Remove intrinsic ID
6886
6887 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6888 const unsigned MemSize = (Size + 7) / 8;
6889 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6890 Ty: getTypeForLLT(Ty, C&: MF.getFunction().getContext()));
6891 MachineMemOperand *MMO = MF.getMachineMemOperand(
6892 PtrInfo: MachinePointerInfo(),
6893 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6894 MachineMemOperand::MOInvariant,
6895 Size: MemSize, BaseAlignment: MemAlign);
6896 MI.addMemOperand(MF, MO: MMO);
6897 if (Dst != OrigDst) {
6898 MI.getOperand(i: 0).setReg(Dst);
6899 B.setInsertPt(MBB&: B.getMBB(), II: ++B.getInsertPt());
6900 B.buildTrunc(Res: OrigDst, Op: Dst);
6901 }
6902
6903 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6904 // always be legal. We may need to restore this to a 96-bit result if it turns
6905 // out this needs to be converted to a vector load during RegBankSelect.
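  // Illustrative example (not from the source): an s96 result is widened to
  // s128 here unless the subtarget has scalar dwordx3 loads, and a <3 x s32>
  // result is likewise widened to <4 x s32>.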
6906 if (!isPowerOf2_32(Value: Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6907 if (Ty.isVector())
6908 Helper.moreElementsVectorDst(MI, MoreTy: getPow2VectorType(Ty), OpIdx: 0);
6909 else
6910 Helper.widenScalarDst(MI, WideTy: getPow2ScalarType(Ty), OpIdx: 0);
6911 }
6912
6913 Observer.changedInstr(MI);
6914 return true;
6915}
6916
6917bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6918 MachineInstr &MI) const {
6919 MachineIRBuilder &B = Helper.MIRBuilder;
6920 GISelChangeObserver &Observer = Helper.Observer;
6921 Observer.changingInstr(MI);
6922 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6923 MI.removeOperand(OpNo: 0); // Remove intrinsic ID
6924 castBufferRsrcArgToV4I32(MI, B, Idx: 0);
6925 Observer.changedInstr(MI);
6926 return true;
6927}
6928
6929// TODO: Move to selection
6930bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6931 MachineRegisterInfo &MRI,
6932 MachineIRBuilder &B) const {
6933 if (!ST.isTrapHandlerEnabled() ||
6934 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6935 return legalizeTrapEndpgm(MI, MRI, B);
6936
6937 return ST.supportsGetDoorbellID() ?
6938 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6939}
6940
6941bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6942 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6943 const DebugLoc &DL = MI.getDebugLoc();
6944 MachineBasicBlock &BB = B.getMBB();
6945 MachineFunction *MF = BB.getParent();
6946
6947 if (BB.succ_empty() && std::next(x: MI.getIterator()) == BB.end()) {
6948 BuildMI(BB, I: BB.end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6949 .addImm(Val: 0);
6950 MI.eraseFromParent();
6951 return true;
6952 }
6953
6954 // We need a block split to make the real endpgm a terminator. We also don't
6955 // want to break phis in successor blocks, so we can't just delete to the
6956 // end of the block.
6957 BB.splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6958 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6959 MF->push_back(MBB: TrapBB);
6960 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_ENDPGM))
6961 .addImm(Val: 0);
6962 BuildMI(BB, I: &MI, MIMD: DL, MCID: B.getTII().get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6963 .addMBB(MBB: TrapBB);
6964
6965 BB.addSuccessor(Succ: TrapBB);
6966 MI.eraseFromParent();
6967 return true;
6968}
6969
6970bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6971 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6972 MachineFunction &MF = B.getMF();
6973 const LLT S64 = LLT::scalar(SizeInBits: 64);
6974
6975 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6976 // For code object version 5, queue_ptr is passed through implicit kernarg.
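  // Illustrative sketch (not from the source): the sequence below computes
  // kernarg_segment_ptr + the QUEUE_PTR offset, loads the 64-bit queue pointer
  // from that constant address, and copies it into SGPR0_SGPR1 before issuing
  // S_TRAP.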
6977 if (AMDGPU::getAMDHSACodeObjectVersion(M: *MF.getFunction().getParent()) >=
6978 AMDGPU::AMDHSA_COV5) {
6979 AMDGPUTargetLowering::ImplicitParameter Param =
6980 AMDGPUTargetLowering::QUEUE_PTR;
6981 uint64_t Offset =
6982 ST.getTargetLowering()->getImplicitParameterOffset(MF: B.getMF(), Param);
6983
6984 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6985 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
6986
6987 if (!loadInputValue(DstReg: KernargPtrReg, B,
6988 ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6989 return false;
6990
6991 // TODO: can we be smarter about machine pointer info?
6992 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6993 MachineMemOperand *MMO = MF.getMachineMemOperand(
6994 PtrInfo,
6995 f: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6996 MachineMemOperand::MOInvariant,
6997 MemTy: LLT::scalar(SizeInBits: 64), base_alignment: commonAlignment(A: Align(64), Offset));
6998
6999    // Compute the address of the queue pointer within the kernarg segment.
7000 Register LoadAddr = MRI.createGenericVirtualRegister(
7001 Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7002 B.buildPtrAdd(Res: LoadAddr, Op0: KernargPtrReg,
7003 Op1: B.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: Offset).getReg(Idx: 0));
7004    // Load the queue pointer from that address.
7005 Register Temp = B.buildLoad(Res: S64, Addr: LoadAddr, MMO&: *MMO).getReg(Idx: 0);
7006 B.buildCopy(Res: SGPR01, Op: Temp);
7007 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7008 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7009 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
7010 MI.eraseFromParent();
7011 return true;
7012 }
7013
7014  // Pass queue pointer to trap handler as input, and insert trap instruction.
7015 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7016 Register LiveIn =
7017 MRI.createGenericVirtualRegister(Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
7018 if (!loadInputValue(DstReg: LiveIn, B, ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR))
7019 return false;
7020
7021 B.buildCopy(Res: SGPR01, Op: LiveIn);
7022 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7023 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7024 .addReg(RegNo: SGPR01, flags: RegState::Implicit);
7025
7026 MI.eraseFromParent();
7027 return true;
7028}
7029
7030bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7031 MachineRegisterInfo &MRI,
7032 MachineIRBuilder &B) const {
7033 // We need to simulate the 's_trap 2' instruction on targets that run in
7034 // PRIV=1 (where it is treated as a nop).
7035 if (ST.hasPrivEnabledTrap2NopBug()) {
7036 ST.getInstrInfo()->insertSimulatedTrap(MRI, MBB&: B.getMBB(), MI,
7037 DL: MI.getDebugLoc());
7038 MI.eraseFromParent();
7039 return true;
7040 }
7041
7042 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7043 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7044 MI.eraseFromParent();
7045 return true;
7046}
7047
7048bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7049 MachineRegisterInfo &MRI,
7050 MachineIRBuilder &B) const {
7051  // If this is a non-HSA path or the trap handler is disabled, report a
7052  // warning accordingly.
7053 if (!ST.isTrapHandlerEnabled() ||
7054 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7055 Function &Fn = B.getMF().getFunction();
7056 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7057 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7058 } else {
7059 // Insert debug-trap instruction
7060 B.buildInstr(Opcode: AMDGPU::S_TRAP)
7061 .addImm(Val: static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7062 }
7063
7064 MI.eraseFromParent();
7065 return true;
7066}
7067
7068bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7069 MachineInstr &MI, MachineIRBuilder &B) const {
7070 MachineRegisterInfo &MRI = *B.getMRI();
7071 const LLT S16 = LLT::scalar(SizeInBits: 16);
7072 const LLT S32 = LLT::scalar(SizeInBits: 32);
7073 const LLT V2S16 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 16);
7074 const LLT V3S32 = LLT::fixed_vector(NumElements: 3, ScalarSizeInBits: 32);
7075
7076 Register DstReg = MI.getOperand(i: 0).getReg();
7077 Register NodePtr = MI.getOperand(i: 2).getReg();
7078 Register RayExtent = MI.getOperand(i: 3).getReg();
7079 Register RayOrigin = MI.getOperand(i: 4).getReg();
7080 Register RayDir = MI.getOperand(i: 5).getReg();
7081 Register RayInvDir = MI.getOperand(i: 6).getReg();
7082 Register TDescr = MI.getOperand(i: 7).getReg();
7083
7084 if (!ST.hasGFX10_AEncoding()) {
7085 Function &Fn = B.getMF().getFunction();
7086 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7087 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7088 return false;
7089 }
7090
7091 const bool IsGFX11 = AMDGPU::isGFX11(STI: ST);
7092 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: ST);
7093 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: ST);
7094 const bool IsA16 = MRI.getType(Reg: RayDir).getElementType().getSizeInBits() == 16;
7095 const bool Is64 = MRI.getType(Reg: NodePtr).getSizeInBits() == 64;
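  // The result is always 4 dwords. The address operands are the node pointer
  // (1 or 2 dwords), the ray extent (1 dword), the ray origin (3 dwords), and
  // the ray dir/inv_dir (3 dwords each as f32, or 3 dwords total when packed
  // as A16 halves). On GFX11+ NSA these dwords are grouped into whole-vector
  // operands (node pointer, extent, origin, dir, inv_dir; dir and inv_dir are
  // combined when A16), giving 5 or 4 operands.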
7096 const unsigned NumVDataDwords = 4;
7097 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7098 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7099 const bool UseNSA =
7100 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7101
7102 const unsigned BaseOpcodes[2][2] = {
7103 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7104 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7105 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7106 int Opcode;
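  // Select the MIMG opcode variant matching the subtarget encoding
  // (GFX10/GFX11 NSA, GFX12, or the non-NSA default forms).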
7107 if (UseNSA) {
7108 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7109 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7110 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7111 : AMDGPU::MIMGEncGfx10NSA,
7112 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7113 } else {
7114 assert(!IsGFX12Plus);
7115 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
7116 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7117 : AMDGPU::MIMGEncGfx10Default,
7118 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7119 }
7120 assert(Opcode != -1);
7121
7122 SmallVector<Register, 12> Ops;
7123 if (UseNSA && IsGFX11Plus) {
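    // Rebuild each three-component vector operand as a single <3 x s32> value
    // from its scalarized components.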
7124 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7125 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7126 auto Merged = B.buildMergeLikeInstr(
7127 Res: V3S32, Ops: {Unmerge.getReg(Idx: 0), Unmerge.getReg(Idx: 1), Unmerge.getReg(Idx: 2)});
7128 Ops.push_back(Elt: Merged.getReg(Idx: 0));
7129 };
7130
7131 Ops.push_back(Elt: NodePtr);
7132 Ops.push_back(Elt: RayExtent);
7133 packLanes(RayOrigin);
7134
7135 if (IsA16) {
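      // dir and inv_dir are f16 vectors; pack one inv_dir/dir component pair
      // into each dword, forming a single <3 x s32> operand.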
7136 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7137 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7138 auto MergedDir = B.buildMergeLikeInstr(
7139 Res: V3S32,
7140 Ops: {B.buildBitcast(
7141 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 0),
7142 UnmergeRayDir.getReg(Idx: 0)}))
7143 .getReg(Idx: 0),
7144 B.buildBitcast(
7145 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 1),
7146 UnmergeRayDir.getReg(Idx: 1)}))
7147 .getReg(Idx: 0),
7148 B.buildBitcast(
7149 Dst: S32, Src: B.buildMergeLikeInstr(Res: V2S16, Ops: {UnmergeRayInvDir.getReg(Idx: 2),
7150 UnmergeRayDir.getReg(Idx: 2)}))
7151 .getReg(Idx: 0)});
7152 Ops.push_back(Elt: MergedDir.getReg(Idx: 0));
7153 } else {
7154 packLanes(RayDir);
7155 packLanes(RayInvDir);
7156 }
7157 } else {
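    // For GFX10 NSA and the non-NSA forms, build the address as a flat list
    // of dword values (merged into one vector below when NSA is not used).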
7158 if (Is64) {
7159 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: NodePtr);
7160 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7161 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7162 } else {
7163 Ops.push_back(Elt: NodePtr);
7164 }
7165 Ops.push_back(Elt: RayExtent);
7166
7167 auto packLanes = [&Ops, &S32, &B](Register Src) {
7168 auto Unmerge = B.buildUnmerge(Res: {S32, S32, S32}, Op: Src);
7169 Ops.push_back(Elt: Unmerge.getReg(Idx: 0));
7170 Ops.push_back(Elt: Unmerge.getReg(Idx: 1));
7171 Ops.push_back(Elt: Unmerge.getReg(Idx: 2));
7172 };
7173
7174 packLanes(RayOrigin);
7175 if (IsA16) {
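      // Pack the six f16 dir/inv_dir components into three dwords:
      // {dir.x, dir.y}, {dir.z, inv_dir.x}, {inv_dir.y, inv_dir.z}.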
7176 auto UnmergeRayDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayDir);
7177 auto UnmergeRayInvDir = B.buildUnmerge(Res: {S16, S16, S16}, Op: RayInvDir);
7178 Register R1 = MRI.createGenericVirtualRegister(Ty: S32);
7179 Register R2 = MRI.createGenericVirtualRegister(Ty: S32);
7180 Register R3 = MRI.createGenericVirtualRegister(Ty: S32);
7181 B.buildMergeLikeInstr(Res: R1,
7182 Ops: {UnmergeRayDir.getReg(Idx: 0), UnmergeRayDir.getReg(Idx: 1)});
7183 B.buildMergeLikeInstr(
7184 Res: R2, Ops: {UnmergeRayDir.getReg(Idx: 2), UnmergeRayInvDir.getReg(Idx: 0)});
7185 B.buildMergeLikeInstr(
7186 Res: R3, Ops: {UnmergeRayInvDir.getReg(Idx: 1), UnmergeRayInvDir.getReg(Idx: 2)});
7187 Ops.push_back(Elt: R1);
7188 Ops.push_back(Elt: R2);
7189 Ops.push_back(Elt: R3);
7190 } else {
7191 packLanes(RayDir);
7192 packLanes(RayInvDir);
7193 }
7194 }
7195
7196 if (!UseNSA) {
7197    // Without NSA, all address operands must be merged into one contiguous vector register.
7198 LLT OpTy = LLT::fixed_vector(NumElements: Ops.size(), ScalarSizeInBits: 32);
7199 Register MergedOps = B.buildMergeLikeInstr(Res: OpTy, Ops).getReg(Idx: 0);
7200 Ops.clear();
7201 Ops.push_back(Elt: MergedOps);
7202 }
7203
7204 auto MIB = B.buildInstr(Opcode: AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7205 .addDef(RegNo: DstReg)
7206 .addImm(Val: Opcode);
7207
7208 for (Register R : Ops) {
7209 MIB.addUse(RegNo: R);
7210 }
7211
7212 MIB.addUse(RegNo: TDescr)
7213 .addImm(Val: IsA16 ? 1 : 0)
7214 .cloneMemRefs(OtherMI: MI);
7215
7216 MI.eraseFromParent();
7217 return true;
7218}
7219
7220bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7221 MachineInstr &MI, MachineIRBuilder &B) const {
7222 const LLT S32 = LLT::scalar(SizeInBits: 32);
7223 const LLT V2S32 = LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32);
7224
7225 Register DstReg = MI.getOperand(i: 0).getReg();
7226 Register DstOrigin = MI.getOperand(i: 1).getReg();
7227 Register DstDir = MI.getOperand(i: 2).getReg();
7228 Register NodePtr = MI.getOperand(i: 4).getReg();
7229 Register RayExtent = MI.getOperand(i: 5).getReg();
7230 Register InstanceMask = MI.getOperand(i: 6).getReg();
7231 Register RayOrigin = MI.getOperand(i: 7).getReg();
7232 Register RayDir = MI.getOperand(i: 8).getReg();
7233 Register Offsets = MI.getOperand(i: 9).getReg();
7234 Register TDescr = MI.getOperand(i: 10).getReg();
7235
7236 if (!ST.hasBVHDualAndBVH8Insts()) {
7237 Function &Fn = B.getMF().getFunction();
7238 Fn.getContext().diagnose(DI: DiagnosticInfoUnsupported(
7239 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7240 return false;
7241 }
7242
7243 bool IsBVH8 = cast<GIntrinsic>(Val&: MI).getIntrinsicID() ==
7244 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7245 const unsigned NumVDataDwords = 10;
7246 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7247 int Opcode = AMDGPU::getMIMGOpcode(
7248 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7249 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7250 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
7251 assert(Opcode != -1);
7252
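  // Pack the ray extent and the instance mask (any-extended to 32 bits) into
  // a single <2 x s32> address operand.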
7253 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7254 Res: V2S32, Ops: {RayExtent, B.buildAnyExt(Res: S32, Op: InstanceMask)});
7255
7256 B.buildInstr(Opcode: IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7257 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7258 .addDef(RegNo: DstReg)
7259 .addDef(RegNo: DstOrigin)
7260 .addDef(RegNo: DstDir)
7261 .addImm(Val: Opcode)
7262 .addUse(RegNo: NodePtr)
7263 .addUse(RegNo: RayExtentInstanceMaskVec.getReg(Idx: 0))
7264 .addUse(RegNo: RayOrigin)
7265 .addUse(RegNo: RayDir)
7266 .addUse(RegNo: Offsets)
7267 .addUse(RegNo: TDescr)
7268 .cloneMemRefs(OtherMI: MI);
7269
7270 MI.eraseFromParent();
7271 return true;
7272}
7273
7274bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7275 MachineIRBuilder &B) const {
7276 const SITargetLowering *TLI = ST.getTargetLowering();
7277 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7278 Register DstReg = MI.getOperand(i: 0).getReg();
7279 B.buildInstr(Opc: AMDGPU::G_AMDGPU_WAVE_ADDRESS, DstOps: {DstReg}, SrcOps: {StackPtr});
7280 MI.eraseFromParent();
7281 return true;
7282}
7283
7284bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7285 MachineIRBuilder &B) const {
7286 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7287 if (!ST.hasArchitectedSGPRs())
7288 return false;
7289 LLT S32 = LLT::scalar(SizeInBits: 32);
7290 Register DstReg = MI.getOperand(i: 0).getReg();
7291 auto TTMP8 = B.buildCopy(Res: S32, Op: Register(AMDGPU::TTMP8));
7292 auto LSB = B.buildConstant(Res: S32, Val: 25);
7293 auto Width = B.buildConstant(Res: S32, Val: 5);
7294 B.buildUbfx(Dst: DstReg, Src: TTMP8, LSB, Width);
7295 MI.eraseFromParent();
7296 return true;
7297}
7298
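// Hardware register fields accessed via s_getreg/s_setreg for the FP
// environment: the low 23 bits of the MODE register and the low 5
// exception-status bits of TRAPSTS.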
7299static constexpr unsigned FPEnvModeBitField =
7300 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
7301
7302static constexpr unsigned FPEnvTrapBitField =
7303 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
7304
7305bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7306 MachineRegisterInfo &MRI,
7307 MachineIRBuilder &B) const {
7308 Register Src = MI.getOperand(i: 0).getReg();
7309 if (MRI.getType(Reg: Src) != S64)
7310 return false;
7311
7312 auto ModeReg =
7313 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7314 /*HasSideEffects=*/true, /*isConvergent=*/false)
7315 .addImm(Val: FPEnvModeBitField);
7316 auto TrapReg =
7317 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_getreg, Res: {S32},
7318 /*HasSideEffects=*/true, /*isConvergent=*/false)
7319 .addImm(Val: FPEnvTrapBitField);
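  // The 64-bit FP environment value packs the MODE bits in the low half and
  // the TRAPSTS bits in the high half.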
7320 B.buildMergeLikeInstr(Res: Src, Ops: {ModeReg, TrapReg});
7321 MI.eraseFromParent();
7322 return true;
7323}
7324
7325bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7326 MachineRegisterInfo &MRI,
7327 MachineIRBuilder &B) const {
7328 Register Src = MI.getOperand(i: 0).getReg();
7329 if (MRI.getType(Reg: Src) != S64)
7330 return false;
7331
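  // Split the value into the MODE (low) and TRAPSTS (high) halves and write
  // each with s_setreg.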
7332 auto Unmerge = B.buildUnmerge(Res: {S32, S32}, Op: MI.getOperand(i: 0));
7333 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7334 /*HasSideEffects=*/true, /*isConvergent=*/false)
7335 .addImm(Val: static_cast<int16_t>(FPEnvModeBitField))
7336 .addReg(RegNo: Unmerge.getReg(Idx: 0));
7337 B.buildIntrinsic(ID: Intrinsic::amdgcn_s_setreg, Res: ArrayRef<DstOp>(),
7338 /*HasSideEffects=*/true, /*isConvergent=*/false)
7339 .addImm(Val: static_cast<int16_t>(FPEnvTrapBitField))
7340 .addReg(RegNo: Unmerge.getReg(Idx: 1));
7341 MI.eraseFromParent();
7342 return true;
7343}
7344
7345bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7346 MachineInstr &MI) const {
7347 MachineIRBuilder &B = Helper.MIRBuilder;
7348 MachineRegisterInfo &MRI = *B.getMRI();
7349
7350  // Replace the G_BRCOND use with the exec-manipulating control-flow and branch pseudos.
7351 auto IntrID = cast<GIntrinsic>(Val&: MI).getIntrinsicID();
7352 switch (IntrID) {
7353 case Intrinsic::amdgcn_if:
7354 case Intrinsic::amdgcn_else: {
7355 MachineInstr *Br = nullptr;
7356 MachineBasicBlock *UncondBrTarget = nullptr;
7357 bool Negated = false;
7358 if (MachineInstr *BrCond =
7359 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7360 const SIRegisterInfo *TRI
7361 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7362
7363 Register Def = MI.getOperand(i: 1).getReg();
7364 Register Use = MI.getOperand(i: 3).getReg();
7365
7366 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7367
7368 if (Negated)
7369 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7370
7371 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7372 if (IntrID == Intrinsic::amdgcn_if) {
7373 B.buildInstr(Opcode: AMDGPU::SI_IF)
7374 .addDef(RegNo: Def)
7375 .addUse(RegNo: Use)
7376 .addMBB(MBB: UncondBrTarget);
7377 } else {
7378 B.buildInstr(Opcode: AMDGPU::SI_ELSE)
7379 .addDef(RegNo: Def)
7380 .addUse(RegNo: Use)
7381 .addMBB(MBB: UncondBrTarget);
7382 }
7383
7384 if (Br) {
7385 Br->getOperand(i: 0).setMBB(CondBrTarget);
7386 } else {
7387 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7388 // since we're swapping branch targets it needs to be reinserted.
7389 // FIXME: IRTranslator should probably not do this
7390 B.buildBr(Dest&: *CondBrTarget);
7391 }
7392
7393 MRI.setRegClass(Reg: Def, RC: TRI->getWaveMaskRegClass());
7394 MRI.setRegClass(Reg: Use, RC: TRI->getWaveMaskRegClass());
7395 MI.eraseFromParent();
7396 BrCond->eraseFromParent();
7397 return true;
7398 }
7399
7400 return false;
7401 }
7402 case Intrinsic::amdgcn_loop: {
7403 MachineInstr *Br = nullptr;
7404 MachineBasicBlock *UncondBrTarget = nullptr;
7405 bool Negated = false;
7406 if (MachineInstr *BrCond =
7407 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7408 const SIRegisterInfo *TRI
7409 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7410
7411 MachineBasicBlock *CondBrTarget = BrCond->getOperand(i: 1).getMBB();
7412 Register Reg = MI.getOperand(i: 2).getReg();
7413
7414 if (Negated)
7415 std::swap(a&: CondBrTarget, b&: UncondBrTarget);
7416
7417 B.setInsertPt(MBB&: B.getMBB(), II: BrCond->getIterator());
7418 B.buildInstr(Opcode: AMDGPU::SI_LOOP)
7419 .addUse(RegNo: Reg)
7420 .addMBB(MBB: UncondBrTarget);
7421
7422 if (Br)
7423 Br->getOperand(i: 0).setMBB(CondBrTarget);
7424 else
7425 B.buildBr(Dest&: *CondBrTarget);
7426
7427 MI.eraseFromParent();
7428 BrCond->eraseFromParent();
7429 MRI.setRegClass(Reg, RC: TRI->getWaveMaskRegClass());
7430 return true;
7431 }
7432
7433 return false;
7434 }
7435 case Intrinsic::amdgcn_addrspacecast_nonnull:
7436 return legalizeAddrSpaceCast(MI, MRI, B);
7437 case Intrinsic::amdgcn_make_buffer_rsrc:
7438 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7439 case Intrinsic::amdgcn_kernarg_segment_ptr:
7440 if (!AMDGPU::isKernel(CC: B.getMF().getFunction().getCallingConv())) {
7441 // This only makes sense to call in a kernel, so just lower to null.
7442 B.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val: 0);
7443 MI.eraseFromParent();
7444 return true;
7445 }
7446
7447 return legalizePreloadedArgIntrin(
7448 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7449 case Intrinsic::amdgcn_implicitarg_ptr:
7450 return legalizeImplicitArgPtr(MI, MRI, B);
7451 case Intrinsic::amdgcn_workitem_id_x:
7452 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 0,
7453 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7454 case Intrinsic::amdgcn_workitem_id_y:
7455 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 1,
7456 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7457 case Intrinsic::amdgcn_workitem_id_z:
7458 return legalizeWorkitemIDIntrinsic(MI, MRI, B, Dim: 2,
7459 ArgType: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7460 case Intrinsic::amdgcn_workgroup_id_x:
7461 return legalizePreloadedArgIntrin(MI, MRI, B,
7462 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7463 case Intrinsic::amdgcn_workgroup_id_y:
7464 return legalizePreloadedArgIntrin(MI, MRI, B,
7465 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7466 case Intrinsic::amdgcn_workgroup_id_z:
7467 return legalizePreloadedArgIntrin(MI, MRI, B,
7468 ArgType: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7469 case Intrinsic::amdgcn_wave_id:
7470 return legalizeWaveID(MI, B);
7471 case Intrinsic::amdgcn_lds_kernel_id:
7472 return legalizePreloadedArgIntrin(MI, MRI, B,
7473 ArgType: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7474 case Intrinsic::amdgcn_dispatch_ptr:
7475 return legalizePreloadedArgIntrin(MI, MRI, B,
7476 ArgType: AMDGPUFunctionArgInfo::DISPATCH_PTR);
7477 case Intrinsic::amdgcn_queue_ptr:
7478 return legalizePreloadedArgIntrin(MI, MRI, B,
7479 ArgType: AMDGPUFunctionArgInfo::QUEUE_PTR);
7480 case Intrinsic::amdgcn_implicit_buffer_ptr:
7481 return legalizePreloadedArgIntrin(
7482 MI, MRI, B, ArgType: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7483 case Intrinsic::amdgcn_dispatch_id:
7484 return legalizePreloadedArgIntrin(MI, MRI, B,
7485 ArgType: AMDGPUFunctionArgInfo::DISPATCH_ID);
7486 case Intrinsic::r600_read_ngroups_x:
7487    // TODO: Emit error for HSA
7488 return legalizeKernargMemParameter(MI, B,
7489 Offset: SI::KernelInputOffsets::NGROUPS_X);
7490 case Intrinsic::r600_read_ngroups_y:
7491 return legalizeKernargMemParameter(MI, B,
7492 Offset: SI::KernelInputOffsets::NGROUPS_Y);
7493 case Intrinsic::r600_read_ngroups_z:
7494 return legalizeKernargMemParameter(MI, B,
7495 Offset: SI::KernelInputOffsets::NGROUPS_Z);
7496 case Intrinsic::r600_read_local_size_x:
7497 // TODO: Could insert G_ASSERT_ZEXT from s16
7498 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
7499 case Intrinsic::r600_read_local_size_y:
7500 // TODO: Could insert G_ASSERT_ZEXT from s16
7501 return legalizeKernargMemParameter(MI, B, Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
7502  case Intrinsic::r600_read_local_size_z:
7503    // TODO: Could insert G_ASSERT_ZEXT from s16
7504    return legalizeKernargMemParameter(MI, B,
7505 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
7506 case Intrinsic::amdgcn_fdiv_fast:
7507 return legalizeFDIVFastIntrin(MI, MRI, B);
7508 case Intrinsic::amdgcn_is_shared:
7509 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::LOCAL_ADDRESS);
7510 case Intrinsic::amdgcn_is_private:
7511 return legalizeIsAddrSpace(MI, MRI, B, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS);
7512 case Intrinsic::amdgcn_wavefrontsize: {
7513 B.buildConstant(Res: MI.getOperand(i: 0), Val: ST.getWavefrontSize());
7514 MI.eraseFromParent();
7515 return true;
7516 }
7517 case Intrinsic::amdgcn_s_buffer_load:
7518 return legalizeSBufferLoad(Helper, MI);
7519 case Intrinsic::amdgcn_raw_buffer_store:
7520 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7521 case Intrinsic::amdgcn_struct_buffer_store:
7522 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7523 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: false);
7524 case Intrinsic::amdgcn_raw_buffer_store_format:
7525 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7526 case Intrinsic::amdgcn_struct_buffer_store_format:
7527 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7528 return legalizeBufferStore(MI, Helper, IsTyped: false, IsFormat: true);
7529 case Intrinsic::amdgcn_raw_tbuffer_store:
7530 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7531 case Intrinsic::amdgcn_struct_tbuffer_store:
7532 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7533 return legalizeBufferStore(MI, Helper, IsTyped: true, IsFormat: true);
7534 case Intrinsic::amdgcn_raw_buffer_load:
7535 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7536 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7537 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7538 case Intrinsic::amdgcn_struct_buffer_load:
7539 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7540 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7541 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7542 return legalizeBufferLoad(MI, Helper, IsFormat: false, IsTyped: false);
7543 case Intrinsic::amdgcn_raw_buffer_load_format:
7544 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7545 case Intrinsic::amdgcn_struct_buffer_load_format:
7546 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7547 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: false);
7548 case Intrinsic::amdgcn_raw_tbuffer_load:
7549 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7550 case Intrinsic::amdgcn_struct_tbuffer_load:
7551 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7552 return legalizeBufferLoad(MI, Helper, IsFormat: true, IsTyped: true);
7553 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7554 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7555 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7556 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7557 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7558 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7559 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7560 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7561 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7562 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7563 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7564 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7565 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7567 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7568 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7569 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7570 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7571 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7572 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7573 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7574 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7575 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7576 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7577 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7579 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7580 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7581 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7583 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7585 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7586 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7587 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7588 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7589 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7590 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7591 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7592 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7593 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7594 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7595 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7596 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7597 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7598 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7599 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7600 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7601 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7603 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7605 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7606 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7607 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7608 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7609 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7610 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7611 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7612 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7613 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7614 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7615 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7617 return legalizeBufferAtomic(MI, B, IID: IntrID);
7618 case Intrinsic::amdgcn_rsq_clamp:
7619 return legalizeRsqClampIntrinsic(MI, MRI, B);
7620 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7621 return legalizeBVHIntersectRayIntrinsic(MI, B);
7622 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7623 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7624 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
7625 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7626 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7627 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7628 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7629 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7630 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7631 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7632 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
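    // The sparsity index operand (operand 5 here) must be a 32-bit value;
    // any-extend narrower indices.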
7633 Register Index = MI.getOperand(i: 5).getReg();
7634 LLT S32 = LLT::scalar(SizeInBits: 32);
7635 if (MRI.getType(Reg: Index) != S32)
7636 MI.getOperand(i: 5).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
7637 return true;
7638 }
7639 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7640 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7641 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
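    // Same as above, but the index is operand 7 for the integer variants.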
7642 Register Index = MI.getOperand(i: 7).getReg();
7643 LLT S32 = LLT::scalar(SizeInBits: 32);
7644 if (MRI.getType(Reg: Index) != S32)
7645 MI.getOperand(i: 7).setReg(B.buildAnyExt(Res: S32, Op: Index).getReg(Idx: 0));
7646 return true;
7647 }
7648 case Intrinsic::amdgcn_fmed3: {
7649 GISelChangeObserver &Observer = Helper.Observer;
7650
7651    // FIXME: This is to work around the inability of tablegen match combiners
7652    // to match intrinsics in patterns.
7653 Observer.changingInstr(MI);
7654 MI.setDesc(B.getTII().get(Opcode: AMDGPU::G_AMDGPU_FMED3));
7655 MI.removeOperand(OpNo: 1);
7656 Observer.changedInstr(MI);
7657 return true;
7658 }
7659 case Intrinsic::amdgcn_readlane:
7660 case Intrinsic::amdgcn_writelane:
7661 case Intrinsic::amdgcn_readfirstlane:
7662 case Intrinsic::amdgcn_permlane16:
7663 case Intrinsic::amdgcn_permlanex16:
7664 case Intrinsic::amdgcn_permlane64:
7665 case Intrinsic::amdgcn_set_inactive:
7666 case Intrinsic::amdgcn_set_inactive_chain_arg:
7667 case Intrinsic::amdgcn_mov_dpp8:
7668 case Intrinsic::amdgcn_update_dpp:
7669 return legalizeLaneOp(Helper, MI, IID: IntrID);
7670 case Intrinsic::amdgcn_s_buffer_prefetch_data:
7671 return legalizeSBufferPrefetch(Helper, MI);
7672 case Intrinsic::amdgcn_dead: {
7673 // TODO: Use poison instead of undef
7674 for (const MachineOperand &Def : MI.defs())
7675 B.buildUndef(Res: Def);
7676 MI.eraseFromParent();
7677 return true;
7678 }
7679 default: {
7680 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7681 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
7682 return legalizeImageIntrinsic(MI, B, Observer&: Helper.Observer, Intr: ImageDimIntr);
7683 return true;
7684 }
7685 }
7686
7687 return true;
7688}
7689